mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-24 17:41:03 +01:00
fix(grpc/client,node/task/handler): add RetryWithBackoff, stabilize reconnection, and retry gRPC ops
- add RetryWithBackoff helper to grpc client for retry with exponential backoff and reconnection-aware handling
- increase reconnectionClientTimeout to 90s and introduce connectionStabilizationDelay; wait briefly after reconnection to avoid immediate flapping
- refresh reconnection flag while waiting for client registration and improve cancellation message
- replace direct heartbeat RPC with RetryWithBackoff in WorkerService (use extended timeout)
- use RetryWithBackoff for worker node status updates in task handler and propagate errors
This commit is contained in:
@@ -338,18 +338,25 @@ func (svc *WorkerService) subscribe() {
|
||||
}
|
||||
|
||||
// sendHeartbeat sends a heartbeat RPC carrying this node's key
// (svc.cfgSvc.GetNodeKey()) through the node gRPC client and logs any failure
// with svc.Errorf; it returns nothing to the caller.
//
// NOTE(review): this span is a rendered diff hunk, not a compilable function.
// Lines from the old and the new version are interleaved without +/- markers,
// so statements that look duplicated (two ctx declarations, two SendHeartbeat
// calls, two error logs) presumably belong to different versions — confirm
// against the repository before relying on any single line.
func (svc *WorkerService) sendHeartbeat() {
|
||||
// NOTE(review): presumably the removed line — the timeout here is the
// heartbeat interval, which the declaration below replaces with a fixed 30s.
ctx, cancel := context.WithTimeout(svc.ctx, svc.heartbeatInterval)
|
||||
// Use extended timeout to allow for reconnection scenarios
|
||||
ctx, cancel := context.WithTimeout(svc.ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
nodeClient, err := client.GetGrpcClient().GetNodeClient()
|
||||
|
||||
// Retry up to 3 times with exponential backoff for reconnection scenarios
// The closure fetches the node client again each time it is invoked and uses
// the outer ctx for the RPC, so all invocations share the one timeout above.
// NOTE(review): 3 is presumably the maximum attempt count and "heartbeat" a
// label for log output — verify against the RetryWithBackoff signature.
|
||||
err := client.RetryWithBackoff(ctx, func() error {
|
||||
nodeClient, err := client.GetGrpcClient().GetNodeClient()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = nodeClient.SendHeartbeat(ctx, &grpc.NodeServiceSendHeartbeatRequest{
|
||||
NodeKey: svc.cfgSvc.GetNodeKey(),
|
||||
})
|
||||
return err
|
||||
}, 3, svc.Logger, "heartbeat")
|
||||
|
||||
// NOTE(review): from here the hunk mixes the old direct-call path (separate
// "failed to get node client" / "failed to send heartbeat to master" logs)
// with the new single "failed to send heartbeat" log — presumably only the
// latter remains after the commit; confirm.
if err != nil {
|
||||
svc.Errorf("failed to get node client: %v", err)
|
||||
return
|
||||
}
|
||||
_, err = nodeClient.SendHeartbeat(ctx, &grpc.NodeServiceSendHeartbeatRequest{
|
||||
NodeKey: svc.cfgSvc.GetNodeKey(),
|
||||
})
|
||||
if err != nil {
|
||||
svc.Errorf("failed to send heartbeat to master: %v", err)
|
||||
svc.Errorf("failed to send heartbeat: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user