mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-24 17:41:03 +01:00
fix(grpc/client,node/task/handler): add RetryWithBackoff, stabilize reconnection, and retry gRPC ops
- add RetryWithBackoff helper to grpc client for exponential retry with backoff and reconnection-aware handling
- increase reconnectionClientTimeout to 90s and introduce connectionStabilizationDelay; wait briefly after reconnection to avoid immediate flapping
- refresh reconnection flag while waiting for client registration and improve cancellation message
- replace direct heartbeat RPC with RetryWithBackoff in WorkerService (use extended timeout)
- use RetryWithBackoff for worker node status updates in task handler and propagate errors
This commit is contained in:
@@ -438,15 +438,25 @@ func (svc *Service) updateNodeStatus() (err error) {
|
||||
currentGoroutines := runtime.NumGoroutine()
|
||||
svc.Debugf("Node status update - runners: %d, goroutines: %d", n.CurrentRunners, currentGoroutines)
|
||||
|
||||
// save node
|
||||
// save node with retry for reconnection scenarios
|
||||
n.SetUpdated(n.CreatedBy)
|
||||
if svc.cfgSvc.IsMaster() {
|
||||
err = service.NewModelService[models.Node]().ReplaceById(n.Id, *n)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
err = client.NewModelService[models.Node]().ReplaceById(n.Id, *n)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
// Worker node: use gRPC with retry logic
|
||||
ctx, cancel := context.WithTimeout(svc.ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
err = grpcclient.RetryWithBackoff(ctx, func() error {
|
||||
return client.NewModelService[models.Node]().ReplaceById(n.Id, *n)
|
||||
}, 3, svc.Logger, "node status update")
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
Reference in New Issue
Block a user