feat: enhance gRPC client handling with improved reconnection logic and monitoring

Author: Marvin Zhang
Date: 2025-09-12 18:16:52 +08:00
parent 07bb7f8ba9
commit e221e3c640
3 changed files with 177 additions and 15 deletions


@@ -8,6 +8,7 @@ import (
"github.com/apex/log"
"github.com/cenkalti/backoff/v4"
"github.com/crawlab-team/crawlab/core/constants"
"github.com/crawlab-team/crawlab/core/grpc/client"
"github.com/crawlab-team/crawlab/core/grpc/server"
"github.com/crawlab-team/crawlab/core/interfaces"
"github.com/crawlab-team/crawlab/core/models/common"
@@ -108,12 +109,15 @@ func (svc *MasterService) startMonitoring() {
ticker := time.NewTicker(svc.monitorInterval)
for {
- // monitor
+ // monitor worker nodes
err := svc.monitor()
if err != nil {
svc.Errorf("master[%s] monitor error: %v", svc.cfgSvc.GetNodeKey(), err)
}
// monitor gRPC client health on master
svc.monitorGrpcClientHealth()
// wait
<-ticker.C
}
@@ -207,6 +211,9 @@ func (svc *MasterService) monitor() (err error) {
return
}
// if both subscribe and ping succeed, ensure node is marked as online
go svc.setWorkerNodeOnline(n)
// handle reconnection - reconcile disconnected tasks
go svc.taskReconciliationSvc.HandleNodeReconnection(n)
@@ -236,6 +243,32 @@ func (svc *MasterService) setWorkerNodeOffline(node *models.Node) {
svc.sendNotification(node)
}
func (svc *MasterService) setWorkerNodeOnline(node *models.Node) {
// Only update if the node is currently offline
if node.Status == constants.NodeStatusOnline {
return
}
oldStatus := node.Status
node.Status = constants.NodeStatusOnline
node.Active = true
node.ActiveAt = time.Now()
err := backoff.Retry(func() error {
return service.NewModelService[models.Node]().ReplaceById(node.Id, *node)
}, backoff.WithMaxRetries(backoff.NewConstantBackOff(1*time.Second), 3))
if err != nil {
svc.Errorf("failed to set worker node[%s] online: %v", node.Key, err)
return
}
svc.Infof("worker node[%s] status changed from '%s' to 'online'", node.Key, oldStatus)
// send notification if status changed
if utils.IsPro() && oldStatus != constants.NodeStatusOnline {
svc.sendNotification(node)
}
}
func (svc *MasterService) subscribeNode(n *models.Node) (ok bool) {
_, ok = svc.server.NodeSvr.GetSubscribeStream(n.Id)
return ok
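The status write above goes through the cenkalti/backoff/v4 helpers this file already imports: a constant one-second pause and at most three retries after the initial attempt, with backoff.Retry returning the last error if every attempt fails, which is what feeds the "failed to set worker node online" log. A minimal, self-contained sketch of the same policy, where the save callback is only a placeholder for the real ReplaceById call:

package main

import (
    "errors"
    "fmt"
    "time"

    "github.com/cenkalti/backoff/v4"
)

// saveWithRetry applies the same policy as setWorkerNodeOnline: a fixed
// one-second pause between attempts and at most three retries after the
// first try.
func saveWithRetry(save func() error) error {
    policy := backoff.WithMaxRetries(backoff.NewConstantBackOff(1*time.Second), 3)
    return backoff.Retry(save, policy)
}

func main() {
    attempts := 0
    err := saveWithRetry(func() error {
        attempts++
        if attempts < 3 {
            return errors.New("transient db error") // triggers a retry
        }
        return nil // succeeds on the third attempt
    })
    fmt.Println(attempts, err) // 3 <nil>
}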
@@ -277,6 +310,18 @@ func (svc *MasterService) sendMasterStatusNotification(oldStatus, newStatus stri
go notification.GetNotificationService().SendNodeNotification(node)
}
func (svc *MasterService) monitorGrpcClientHealth() {
grpcClient := client.GetGrpcClient()
// Check if gRPC client is in a bad state
if !grpcClient.IsReady() && grpcClient.IsClosed() {
svc.Warnf("master node gRPC client is in SHUTDOWN state, forcing FULL RESET")
// Reset the gRPC client to get a fresh instance
client.ResetGrpcClient()
svc.Infof("master node gRPC client has been reset")
}
}
func newMasterService() *MasterService {
cfgSvc := config.GetNodeConfigService()
server := server.GetGrpcServer()
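monitorGrpcClientHealth above leans on the Crawlab client's own IsReady/IsClosed/ResetGrpcClient helpers. The underlying rule in grpc-go is that a connection which has reached the SHUTDOWN state is terminal and can only be replaced, not revived. A rough sketch of that check against the plain grpc-go connectivity API, with a placeholder target and insecure credentials standing in for Crawlab's actual dial options:

package grpchealth

import (
    "log"

    "google.golang.org/grpc"
    "google.golang.org/grpc/connectivity"
    "google.golang.org/grpc/credentials/insecure"
)

// ensureConnUsable rebuilds the connection only when it has reached SHUTDOWN,
// which is terminal in grpc-go; every other state (IDLE, CONNECTING,
// TRANSIENT_FAILURE, READY) can still recover on its own.
func ensureConnUsable(conn *grpc.ClientConn, target string) (*grpc.ClientConn, error) {
    if conn.GetState() != connectivity.Shutdown {
        return conn, nil
    }
    log.Printf("gRPC connection to %s is shut down, dialing a fresh one", target)
    _ = conn.Close() // best-effort; the connection is already closed
    return grpc.NewClient(target, grpc.WithTransportCredentials(insecure.NewCredentials()))
}

grpc.NewClient assumes a reasonably recent grpc-go; on older versions grpc.Dial fills the same role.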


@@ -217,8 +217,8 @@ func (svc *WorkerService) subscribe() {
// Configure exponential backoff
b := backoff.NewExponentialBackOff()
b.InitialInterval = 1 * time.Second
- b.MaxInterval = 1 * time.Minute
- b.MaxElapsedTime = 10 * time.Minute
+ b.MaxInterval = 30 * time.Second // Reduced from 1 minute
+ b.MaxElapsedTime = 0 * time.Minute // Never give up
b.Multiplier = 2.0
for {
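Setting MaxElapsedTime to 0 tells cenkalti/backoff/v4 that the policy itself never expires, so retrying only stops when the caller does; in the loop below that job falls to svc.ctx. A small standalone sketch of the same policy, wrapped in a context purely for the demo (the 3-second timeout and the always-failing operation are illustrative, not part of the commit):

package main

import (
    "context"
    "errors"
    "fmt"
    "time"

    "github.com/cenkalti/backoff/v4"
)

func main() {
    // Same shape as the worker's policy: 1s initial delay, doubling each
    // time, capped at 30s, and never expiring on its own.
    b := backoff.NewExponentialBackOff()
    b.InitialInterval = 1 * time.Second
    b.MaxInterval = 30 * time.Second
    b.MaxElapsedTime = 0 // 0 disables the overall deadline
    b.Multiplier = 2.0

    // With an unbounded policy, the caller's context is the only way out.
    ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
    defer cancel()

    err := backoff.Retry(func() error {
        return errors.New("master not reachable yet") // always fails in this demo
    }, backoff.WithContext(b, ctx))
    fmt.Println(err) // stops once the context expires
}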
@@ -232,7 +232,43 @@ func (svc *WorkerService) subscribe() {
// Use backoff for connection attempts
operation := func() error {
svc.Debugf("attempting to subscribe to master")
nodeClient, err := client.GetGrpcClient().GetNodeClient()
// Wait for gRPC client to be ready and registered after reconnection
grpcClient := client.GetGrpcClient()
waitStart := time.Now()
checkCount := 0
for !grpcClient.IsReadyAndRegistered() {
select {
case <-svc.ctx.Done():
return svc.ctx.Err()
case <-time.After(500 * time.Millisecond):
checkCount++
// Log periodically while waiting
if checkCount%20 == 0 { // Every 10 seconds
svc.Warnf("still waiting for gRPC client (%.1fs)", time.Since(waitStart).Seconds())
// Force a reconnection attempt if we've been waiting too long
if time.Since(waitStart) > 15*time.Second {
svc.Warnf("forcing gRPC client reset due to prolonged wait")
grpcClient = client.ResetGrpcClient()
waitStart = time.Now()
checkCount = 0
}
// Check if client is in SHUTDOWN state and force restart
if !grpcClient.IsReady() && grpcClient.IsClosed() {
svc.Warnf("gRPC client is in SHUTDOWN state, forcing reset")
grpcClient = client.ResetGrpcClient()
waitStart = time.Now()
checkCount = 0
}
}
}
}
svc.Debugf("gRPC client is ready and registered after %.1fs", time.Since(waitStart).Seconds())
nodeClient, err := grpcClient.GetNodeClient()
if err != nil {
svc.Errorf("failed to get node client: %v", err)
return err
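The wait above polls IsReadyAndRegistered every 500 ms, warns every 10 seconds, and forces a client reset after 15 seconds. For comparison only, grpc-go can also push state changes instead of being polled; a sketch of that event-driven wait follows. The registration half of IsReadyAndRegistered is Crawlab application state that grpc-go cannot see, so it would still need its own check, and waitForReady is a hypothetical helper, not part of this commit.

package grpcwait

import (
    "context"
    "errors"

    "google.golang.org/grpc"
    "google.golang.org/grpc/connectivity"
)

// waitForReady blocks until the connection reports READY or ctx ends,
// using grpc-go's state-change notifications instead of periodic polling.
func waitForReady(ctx context.Context, conn *grpc.ClientConn) error {
    for {
        state := conn.GetState()
        switch state {
        case connectivity.Ready:
            return nil
        case connectivity.Shutdown:
            return errors.New("connection is shut down; caller must rebuild it")
        case connectivity.Idle:
            conn.Connect() // nudge an idle connection into connecting
        }
        // Returns false only when ctx is done before the state changes.
        if !conn.WaitForStateChange(ctx, state) {
            return ctx.Err()
        }
    }
}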
@@ -246,7 +282,7 @@ func (svc *WorkerService) subscribe() {
svc.Errorf("failed to subscribe to master: %v", err)
return err
}
svc.Debugf("subscribed to master")
svc.Infof("successfully subscribed to master")
// Handle messages
for {
@@ -284,17 +320,21 @@ func (svc *WorkerService) subscribe() {
if err != nil {
if svc.ctx.Err() != nil {
// Context was cancelled, exit gracefully
svc.Debugf("subscription retry cancelled due to context")
svc.Infof("subscription retry cancelled due to context")
return
}
svc.Errorf("subscription failed after max retries: %v", err)
svc.Errorf("subscription attempt failed: %v", err)
// Reset backoff for next attempt
b.Reset()
} else {
svc.Debugf("subscription completed successfully")
}
// Wait before attempting to reconnect, but respect context cancellation
select {
case <-svc.ctx.Done():
return
- case <-time.After(time.Second):
+ case <-time.After(2 * time.Second):
}
}
}
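On each failed pass the loop calls b.Reset(), which returns the exponential policy to its initial interval, so a fresh subscription attempt starts again near one second instead of at the 30-second cap. A quick illustration of that library behaviour (intervals are approximate because of the default randomization factor):

package main

import (
    "fmt"
    "time"

    "github.com/cenkalti/backoff/v4"
)

func main() {
    b := backoff.NewExponentialBackOff()
    b.InitialInterval = 1 * time.Second
    b.Multiplier = 2.0
    fmt.Println(b.NextBackOff()) // roughly 1s (jittered)
    fmt.Println(b.NextBackOff()) // roughly 2s
    b.Reset()
    fmt.Println(b.NextBackOff()) // back to roughly 1s
}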