fix(grpc/client,node/task/handler): add RetryWithBackoff, stabilize reconnection, and retry gRPC ops

- add RetryWithBackoff helper to the grpc client for retries with exponential backoff and reconnection-aware handling (a sketch of the helper follows this list)
- increase reconnectionClientTimeout to 90s and introduce connectionStabilizationDelay; wait briefly after reconnection to avoid immediate flapping (see the sketch after the diff)
- refresh reconnection flag while waiting for client registration and improve cancellation message
- replace direct heartbeat RPC with RetryWithBackoff in WorkerService (use extended timeout)
- use RetryWithBackoff for worker node status updates in task handler and propagate errors
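
The RetryWithBackoff helper itself is not part of the hunk shown below, but its shape can be inferred from the call site, client.RetryWithBackoff(ctx, fn, 3, svc.Logger, "heartbeat"). A minimal sketch under that assumption: the parameter order matches the call, while the initial delay, growth factor, and logger interface are illustrative rather than the repo's actual values.

package client

import (
	"context"
	"time"
)

// retryLogger stands in for the logger the services pass as svc.Logger;
// the real interface in the repo may expose more methods.
type retryLogger interface {
	Warnf(format string, args ...interface{})
}

// RetryWithBackoff runs fn up to maxRetries times, doubling the delay between
// attempts and aborting early when ctx is cancelled. It returns the last error.
func RetryWithBackoff(ctx context.Context, fn func() error, maxRetries int, log retryLogger, op string) error {
	var err error
	delay := 500 * time.Millisecond // assumed initial backoff
	for attempt := 1; attempt <= maxRetries; attempt++ {
		if err = fn(); err == nil {
			return nil
		}
		log.Warnf("%s failed (attempt %d/%d): %v", op, attempt, maxRetries, err)
		if attempt == maxRetries {
			break
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(delay):
			delay *= 2 // exponential backoff between attempts
		}
	}
	return err
}

The diff below wraps the heartbeat RPC in exactly this shape: the node client is fetched inside the retried closure, so a dropped connection gets a fresh client on the next attempt.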
Author: Marvin Zhang
Date: 2025-10-20 13:01:10 +08:00
parent f441265cc2
commit 2dfc66743b
3 changed files with 109 additions and 26 deletions


@@ -338,18 +338,25 @@ func (svc *WorkerService) subscribe() {
 }
 
 func (svc *WorkerService) sendHeartbeat() {
-	ctx, cancel := context.WithTimeout(svc.ctx, svc.heartbeatInterval)
+	// Use extended timeout to allow for reconnection scenarios
+	ctx, cancel := context.WithTimeout(svc.ctx, 30*time.Second)
 	defer cancel()
-	nodeClient, err := client.GetGrpcClient().GetNodeClient()
-	if err != nil {
-		svc.Errorf("failed to get node client: %v", err)
-		return
-	}
-	_, err = nodeClient.SendHeartbeat(ctx, &grpc.NodeServiceSendHeartbeatRequest{
-		NodeKey: svc.cfgSvc.GetNodeKey(),
-	})
+	// Retry up to 3 times with exponential backoff for reconnection scenarios
+	err := client.RetryWithBackoff(ctx, func() error {
+		nodeClient, err := client.GetGrpcClient().GetNodeClient()
+		if err != nil {
+			return err
+		}
+		_, err = nodeClient.SendHeartbeat(ctx, &grpc.NodeServiceSendHeartbeatRequest{
+			NodeKey: svc.cfgSvc.GetNodeKey(),
+		})
+		return err
+	}, 3, svc.Logger, "heartbeat")
 	if err != nil {
-		svc.Errorf("failed to send heartbeat to master: %v", err)
+		svc.Errorf("failed to send heartbeat: %v", err)
 	}
 }
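
The grpc client changes named in the commit message (reconnectionClientTimeout raised to 90s, the new connectionStabilizationDelay, and the refreshed reconnection flag) live in the other changed files and are not shown above. As a rough, hypothetical sketch of the waiting pattern the message describes: only the constant names and the 90s value come from the commit message, while the delay value, poll interval, and function shape are assumptions.

package client

import (
	"context"
	"time"
)

// The 90s timeout is stated in the commit message; the stabilization delay
// value here is assumed for illustration.
const (
	reconnectionClientTimeout    = 90 * time.Second
	connectionStabilizationDelay = 2 * time.Second // assumed value
)

// waitUntilStable is a hypothetical helper showing the pattern: poll until the
// supplied connection check reports healthy (bounded by the 90s window), then
// pause briefly so callers do not fire RPCs into a link that may still flap.
func waitUntilStable(ctx context.Context, isConnected func() bool) error {
	ctx, cancel := context.WithTimeout(ctx, reconnectionClientTimeout)
	defer cancel()
	for !isConnected() {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(500 * time.Millisecond): // assumed poll interval
		}
	}
	// Brief wait after reconnection to avoid immediate flapping
	time.Sleep(connectionStabilizationDelay)
	return nil
}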