fix(grpc/client,node/task/handler): add RetryWithBackoff, stabilize reconnection, and retry gRPC ops

- add RetryWithBackoff helper to grpc client for exponential-backoff retries and reconnection-aware handling
- increase reconnectionClientTimeout to 90s and introduce connectionStabilizationDelay; wait briefly after reconnection to avoid immediate flapping
- refresh reconnection flag while waiting for client registration and improve cancellation message
- replace direct heartbeat RPC with RetryWithBackoff in WorkerService (use extended timeout)
- use RetryWithBackoff for worker node status updates in task handler and propagate errors
This commit is contained in:
Marvin Zhang
2025-10-20 13:01:10 +08:00
parent f441265cc2
commit 2dfc66743b
3 changed files with 109 additions and 26 deletions

View File

@@ -438,15 +438,25 @@ func (svc *Service) updateNodeStatus() (err error) {
currentGoroutines := runtime.NumGoroutine()
svc.Debugf("Node status update - runners: %d, goroutines: %d", n.CurrentRunners, currentGoroutines)
// save node
// save node with retry for reconnection scenarios
n.SetUpdated(n.CreatedBy)
if svc.cfgSvc.IsMaster() {
err = service.NewModelService[models.Node]().ReplaceById(n.Id, *n)
if err != nil {
return err
}
} else {
err = client.NewModelService[models.Node]().ReplaceById(n.Id, *n)
}
if err != nil {
return err
// Worker node: use gRPC with retry logic
ctx, cancel := context.WithTimeout(svc.ctx, 30*time.Second)
defer cancel()
err = grpcclient.RetryWithBackoff(ctx, func() error {
return client.NewModelService[models.Node]().ReplaceById(n.Id, *n)
}, 3, svc.Logger, "node status update")
if err != nil {
return err
}
}
return nil