feat: enhance gRPC client handling with improved reconnection logic and monitoring
@@ -28,7 +28,8 @@ const (
     registrationCheckInterval = 100 * time.Millisecond
     idleGracePeriod = 2 * time.Minute // Increased from 30 seconds
     connectionTimeout = 30 * time.Second
-    defaultClientTimeout = 5 * time.Second
+    defaultClientTimeout = 15 * time.Second // Increased from 5s for better reconnection handling
+    reconnectionClientTimeout = 60 * time.Second // Extended timeout during reconnection scenarios
 )

 // Circuit breaker states
@@ -257,7 +258,7 @@ func (c *GrpcClient) monitorState() {

     var (
         idleStartTime = time.Time{}
-        ticker = time.NewTicker(stateMonitorInterval) // Reduce frequency
+        ticker = time.NewTicker(stateMonitorInterval)
     )
     defer ticker.Stop()

@@ -310,9 +311,12 @@ func (c *GrpcClient) checkAndHandleStateChange(idleStartTime *time.Time) {

     case connectivity.Shutdown:
         c.setRegistered(false)
+        c.Warnf("connection state changed to SHUTDOWN - stopped flag: %v", c.stopped)
         if !c.stopped {
             c.Errorf("connection shutdown unexpectedly")
             c.triggerReconnection(fmt.Sprintf("state change to %s", current))
+        } else {
+            c.Debugf("connection shutdown expected (client stopped)")
         }

     case connectivity.Idle:
@@ -418,35 +422,75 @@ func (c *GrpcClient) WaitForRegistered() {
 // These methods will wait for registration to complete or return an error if the client is stopped

 func (c *GrpcClient) GetNodeClient() (grpc2.NodeServiceClient, error) {
-    ctx, cancel := context.WithTimeout(context.Background(), defaultClientTimeout)
+    // Use longer timeout during reconnection scenarios
+    timeout := defaultClientTimeout
+    c.reconnectMux.Lock()
+    if c.reconnecting {
+        timeout = reconnectionClientTimeout
+    }
+    c.reconnectMux.Unlock()
+
+    ctx, cancel := context.WithTimeout(context.Background(), timeout)
     defer cancel()

     return c.GetNodeClientWithContext(ctx)
 }

 func (c *GrpcClient) GetTaskClient() (grpc2.TaskServiceClient, error) {
-    ctx, cancel := context.WithTimeout(context.Background(), defaultClientTimeout)
+    // Use longer timeout during reconnection scenarios
+    timeout := defaultClientTimeout
+    c.reconnectMux.Lock()
+    if c.reconnecting {
+        timeout = reconnectionClientTimeout
+    }
+    c.reconnectMux.Unlock()
+
+    ctx, cancel := context.WithTimeout(context.Background(), timeout)
     defer cancel()

     return c.GetTaskClientWithContext(ctx)
 }

 func (c *GrpcClient) GetModelBaseServiceClient() (grpc2.ModelBaseServiceClient, error) {
-    ctx, cancel := context.WithTimeout(context.Background(), defaultClientTimeout)
+    // Use longer timeout during reconnection scenarios
+    timeout := defaultClientTimeout
+    c.reconnectMux.Lock()
+    if c.reconnecting {
+        timeout = reconnectionClientTimeout
+    }
+    c.reconnectMux.Unlock()
+
+    ctx, cancel := context.WithTimeout(context.Background(), timeout)
     defer cancel()

     return c.GetModelBaseServiceClientWithContext(ctx)
 }

 func (c *GrpcClient) GetDependencyClient() (grpc2.DependencyServiceClient, error) {
-    ctx, cancel := context.WithTimeout(context.Background(), defaultClientTimeout)
+    // Use longer timeout during reconnection scenarios
+    timeout := defaultClientTimeout
+    c.reconnectMux.Lock()
+    if c.reconnecting {
+        timeout = reconnectionClientTimeout
+    }
+    c.reconnectMux.Unlock()
+
+    ctx, cancel := context.WithTimeout(context.Background(), timeout)
     defer cancel()

     return c.GetDependencyClientWithContext(ctx)
 }

 func (c *GrpcClient) GetMetricClient() (grpc2.MetricServiceClient, error) {
-    ctx, cancel := context.WithTimeout(context.Background(), defaultClientTimeout)
+    // Use longer timeout during reconnection scenarios
+    timeout := defaultClientTimeout
+    c.reconnectMux.Lock()
+    if c.reconnecting {
+        timeout = reconnectionClientTimeout
+    }
+    c.reconnectMux.Unlock()
+
+    ctx, cancel := context.WithTimeout(context.Background(), timeout)
     defer cancel()

     return c.GetMetricClientWithContext(ctx)
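The same timeout-selection preamble now repeats in all five getters. As a hedged sketch only (not part of this commit; it assumes the package's existing reconnectMux, reconnecting, and timeout constants shown above), the choice could be factored into one helper:

// clientContext builds the per-call context, extending the deadline while a
// reconnection is in progress. Hypothetical helper, same package as GrpcClient.
func (c *GrpcClient) clientContext() (context.Context, context.CancelFunc) {
    timeout := defaultClientTimeout
    c.reconnectMux.Lock()
    if c.reconnecting {
        timeout = reconnectionClientTimeout
    }
    c.reconnectMux.Unlock()
    return context.WithTimeout(context.Background(), timeout)
}

// Each getter would then shrink to a three-liner, e.g.:
func (c *GrpcClient) getNodeClientSketch() (grpc2.NodeServiceClient, error) {
    ctx, cancel := c.clientContext()
    defer cancel()
    return c.GetNodeClientWithContext(ctx)
}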
@@ -586,6 +630,11 @@ func (c *GrpcClient) getClientWithContext(ctx context.Context, getter func() int
         return getter(), nil
     }

+    // Check if we're reconnecting to provide better error context
+    c.reconnectMux.Lock()
+    isReconnecting := c.reconnecting
+    c.reconnectMux.Unlock()
+
     // Wait for registration with context
     ticker := time.NewTicker(registrationCheckInterval)
     defer ticker.Stop()
@@ -593,6 +642,9 @@ func (c *GrpcClient) getClientWithContext(ctx context.Context, getter func() int
     for {
         select {
         case <-ctx.Done():
+            if isReconnecting {
+                return nil, fmt.Errorf("context cancelled while waiting for %s client registration during reconnection (this is normal during network restoration)", clientType)
+            }
             return nil, fmt.Errorf("context cancelled while waiting for %s client registration", clientType)
         case <-c.stop:
             return nil, fmt.Errorf("client stopped while waiting for %s client registration", clientType)
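Both branches return plain fmt.Errorf strings, so a caller can only recognize the reconnection case by matching message text. A hypothetical alternative (not in this commit; it would also need the errors import in this file) wraps a sentinel so callers can use errors.Is:

// ErrRegistrationWaitCancelled is a hypothetical sentinel for the cancelled-wait case.
var ErrRegistrationWaitCancelled = errors.New("client registration wait cancelled")

// registrationCancelledErr keeps the human-readable message but makes the
// condition testable with errors.Is(err, ErrRegistrationWaitCancelled).
func registrationCancelledErr(clientType string, reconnecting bool) error {
    if reconnecting {
        return fmt.Errorf("%w: %s client registration interrupted during reconnection (normal during network restoration)",
            ErrRegistrationWaitCancelled, clientType)
    }
    return fmt.Errorf("%w: %s client registration interrupted", ErrRegistrationWaitCancelled, clientType)
}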
@@ -881,6 +933,7 @@ func newGrpcClient() (c *GrpcClient) {

 var _client *GrpcClient
 var _clientOnce sync.Once
+var _clientMux sync.Mutex

 func GetGrpcClient() *GrpcClient {
     _clientOnce.Do(func() {
@@ -889,3 +942,27 @@ func GetGrpcClient() *GrpcClient {
     })
     return _client
 }
+
+// ResetGrpcClient creates a completely new gRPC client instance
+// This is needed when the client gets stuck and needs to be fully restarted
+func ResetGrpcClient() *GrpcClient {
+    _clientMux.Lock()
+    defer _clientMux.Unlock()
+
+    // Stop the old client if it exists
+    if _client != nil {
+        _client.Stop()
+    }
+
+    // Reset the sync.Once so we can create a new client
+    _clientOnce = sync.Once{}
+    _client = nil
+
+    // Create and start the new client
+    _clientOnce.Do(func() {
+        _client = newGrpcClient()
+        go _client.Start()
+    })
+
+    return _client
+}
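A minimal usage sketch of the new reset path (hypothetical caller; IsReady and IsClosed are the same checks this commit adds to the master and worker services):

// ensureUsableClient swaps in a fresh client when the shared one has shut down,
// instead of retrying calls against a dead instance.
func ensureUsableClient() *client.GrpcClient {
    c := client.GetGrpcClient()
    if !c.IsReady() && c.IsClosed() {
        c = client.ResetGrpcClient()
    }
    return c
}

One design note: GetGrpcClient does not take _clientMux, so a caller racing with ResetGrpcClient can still observe the old, already-stopped instance (or race on the reassigned sync.Once); the reset is best treated as a last-resort recovery, which is how both services use it below.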
@@ -8,6 +8,7 @@ import (
     "github.com/apex/log"
     "github.com/cenkalti/backoff/v4"
+    "github.com/crawlab-team/crawlab/core/constants"
     "github.com/crawlab-team/crawlab/core/grpc/client"
     "github.com/crawlab-team/crawlab/core/grpc/server"
     "github.com/crawlab-team/crawlab/core/interfaces"
     "github.com/crawlab-team/crawlab/core/models/common"
@@ -108,12 +109,15 @@ func (svc *MasterService) startMonitoring() {
     ticker := time.NewTicker(svc.monitorInterval)

     for {
-        // monitor
+        // monitor worker nodes
         err := svc.monitor()
         if err != nil {
             svc.Errorf("master[%s] monitor error: %v", svc.cfgSvc.GetNodeKey(), err)
         }

+        // monitor gRPC client health on master
+        svc.monitorGrpcClientHealth()
+
         // wait
         <-ticker.C
     }
@@ -207,6 +211,9 @@ func (svc *MasterService) monitor() (err error) {
             return
         }

+        // if both subscribe and ping succeed, ensure node is marked as online
+        go svc.setWorkerNodeOnline(n)
+
         // handle reconnection - reconcile disconnected tasks
         go svc.taskReconciliationSvc.HandleNodeReconnection(n)

@@ -236,6 +243,32 @@ func (svc *MasterService) setWorkerNodeOffline(node *models.Node) {
     svc.sendNotification(node)
 }

+func (svc *MasterService) setWorkerNodeOnline(node *models.Node) {
+    // Only update if the node is currently offline
+    if node.Status == constants.NodeStatusOnline {
+        return
+    }
+
+    oldStatus := node.Status
+    node.Status = constants.NodeStatusOnline
+    node.Active = true
+    node.ActiveAt = time.Now()
+    err := backoff.Retry(func() error {
+        return service.NewModelService[models.Node]().ReplaceById(node.Id, *node)
+    }, backoff.WithMaxRetries(backoff.NewConstantBackOff(1*time.Second), 3))
+    if err != nil {
+        svc.Errorf("failed to set worker node[%s] online: %v", node.Key, err)
+        return
+    }
+
+    svc.Infof("worker node[%s] status changed from '%s' to 'online'", node.Key, oldStatus)
+
+    // send notification if status changed
+    if utils.IsPro() && oldStatus != constants.NodeStatusOnline {
+        svc.sendNotification(node)
+    }
+}
+
 func (svc *MasterService) subscribeNode(n *models.Node) (ok bool) {
     _, ok = svc.server.NodeSvr.GetSubscribeStream(n.Id)
     return ok
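For reference, the status write above is retried with a constant one-second delay and a retry cap rather than exponential backoff. A standalone sketch of that policy (the flaky operation below is only a stand-in for the ReplaceById call):

package main

import (
    "errors"
    "fmt"
    "time"

    "github.com/cenkalti/backoff/v4"
)

func main() {
    attempts := 0
    op := func() error {
        attempts++
        fmt.Printf("attempt %d at %s\n", attempts, time.Now().Format("15:04:05.000"))
        if attempts < 3 {
            return errors.New("transient model service error") // stand-in failure
        }
        return nil // succeeds on the third attempt
    }

    // Same policy as setWorkerNodeOnline: constant 1s delay, at most 3 retries
    // after the initial attempt before Retry gives up and returns the error.
    err := backoff.Retry(op, backoff.WithMaxRetries(backoff.NewConstantBackOff(1*time.Second), 3))
    fmt.Println("result:", err)
}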
@@ -277,6 +310,18 @@ func (svc *MasterService) sendMasterStatusNotification(oldStatus, newStatus stri
     go notification.GetNotificationService().SendNodeNotification(node)
 }

+func (svc *MasterService) monitorGrpcClientHealth() {
+    grpcClient := client.GetGrpcClient()
+
+    // Check if gRPC client is in a bad state
+    if !grpcClient.IsReady() && grpcClient.IsClosed() {
+        svc.Warnf("master node gRPC client is in SHUTDOWN state, forcing FULL RESET")
+        // Reset the gRPC client to get a fresh instance
+        client.ResetGrpcClient()
+        svc.Infof("master node gRPC client has been reset")
+    }
+}
+
 func newMasterService() *MasterService {
     cfgSvc := config.GetNodeConfigService()
     server := server.GetGrpcServer()
@@ -217,8 +217,8 @@ func (svc *WorkerService) subscribe() {
     // Configure exponential backoff
     b := backoff.NewExponentialBackOff()
     b.InitialInterval = 1 * time.Second
-    b.MaxInterval = 1 * time.Minute
-    b.MaxElapsedTime = 10 * time.Minute
+    b.MaxInterval = 30 * time.Second // Reduced from 1 minute
+    b.MaxElapsedTime = 0 * time.Minute // Never give up
     b.Multiplier = 2.0

     for {
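The new settings drop the 10-minute give-up in favour of retrying indefinitely with delays capped at 30 seconds. A standalone sketch of the schedule these values produce (jitter disabled here so the output is deterministic; the real configuration keeps the library's default randomization):

package main

import (
    "fmt"
    "time"

    "github.com/cenkalti/backoff/v4"
)

func main() {
    b := backoff.NewExponentialBackOff()
    b.InitialInterval = 1 * time.Second
    b.MaxInterval = 30 * time.Second
    b.MaxElapsedTime = 0 // never give up
    b.Multiplier = 2.0
    b.RandomizationFactor = 0 // disable jitter for a readable schedule

    // Prints 1s, 2s, 4s, 8s, 16s, then 30s from there on
    // (the delay is capped, not the number of retries).
    for i := 0; i < 10; i++ {
        fmt.Println(b.NextBackOff())
    }
}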
@@ -232,7 +232,43 @@ func (svc *WorkerService) subscribe() {
         // Use backoff for connection attempts
         operation := func() error {
             svc.Debugf("attempting to subscribe to master")
-            nodeClient, err := client.GetGrpcClient().GetNodeClient()
+
+            // Wait for gRPC client to be ready and registered after reconnection
+            grpcClient := client.GetGrpcClient()
+
+            waitStart := time.Now()
+            checkCount := 0
+            for !grpcClient.IsReadyAndRegistered() {
+                select {
+                case <-svc.ctx.Done():
+                    return svc.ctx.Err()
+                case <-time.After(500 * time.Millisecond):
+                    checkCount++
+                    // Log periodically while waiting
+                    if checkCount%20 == 0 { // Every 10 seconds
+                        svc.Warnf("still waiting for gRPC client (%.1fs)", time.Since(waitStart).Seconds())
+
+                        // Force a reconnection attempt if we've been waiting too long
+                        if time.Since(waitStart) > 15*time.Second {
+                            svc.Warnf("forcing gRPC client reset due to prolonged wait")
+                            grpcClient = client.ResetGrpcClient()
+                            waitStart = time.Now()
+                            checkCount = 0
+                        }
+
+                        // Check if client is in SHUTDOWN state and force restart
+                        if !grpcClient.IsReady() && grpcClient.IsClosed() {
+                            svc.Warnf("gRPC client is in SHUTDOWN state, forcing reset")
+                            grpcClient = client.ResetGrpcClient()
+                            waitStart = time.Now()
+                            checkCount = 0
+                        }
+                    }
+                }
+            }
+            svc.Debugf("gRPC client is ready and registered after %.1fs", time.Since(waitStart).Seconds())
+
+            nodeClient, err := grpcClient.GetNodeClient()
             if err != nil {
                 svc.Errorf("failed to get node client: %v", err)
                 return err
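The readiness wait is inlined in the subscribe operation. A condensed sketch of the same polling-and-reset idea as a helper (hypothetical, not part of this commit; it assumes the client-package methods used above and folds the stuck and shutdown checks into one condition):

// waitForReadyClient polls until the shared client reports ready and registered,
// replacing it via ResetGrpcClient when it has shut down or been stuck too long.
func waitForReadyClient(ctx context.Context, stuckAfter time.Duration) (*client.GrpcClient, error) {
    c := client.GetGrpcClient()
    start := time.Now()
    for !c.IsReadyAndRegistered() {
        select {
        case <-ctx.Done():
            return nil, ctx.Err()
        case <-time.After(500 * time.Millisecond):
            if (!c.IsReady() && c.IsClosed()) || time.Since(start) > stuckAfter {
                c = client.ResetGrpcClient()
                start = time.Now()
            }
        }
    }
    return c, nil
}

The operation above would then begin with something like grpcClient, err := waitForReadyClient(svc.ctx, 15*time.Second) before fetching the node client.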
@@ -246,7 +282,7 @@ func (svc *WorkerService) subscribe() {
                 svc.Errorf("failed to subscribe to master: %v", err)
                 return err
             }
-            svc.Debugf("subscribed to master")
+            svc.Infof("successfully subscribed to master")

             // Handle messages
             for {
@@ -284,17 +320,21 @@ func (svc *WorkerService) subscribe() {
         if err != nil {
             if svc.ctx.Err() != nil {
                 // Context was cancelled, exit gracefully
-                svc.Debugf("subscription retry cancelled due to context")
+                svc.Infof("subscription retry cancelled due to context")
                 return
             }
-            svc.Errorf("subscription failed after max retries: %v", err)
+            svc.Errorf("subscription attempt failed: %v", err)
+            // Reset backoff for next attempt
+            b.Reset()
+        } else {
+            svc.Debugf("subscription completed successfully")
         }

         // Wait before attempting to reconnect, but respect context cancellation
         select {
         case <-svc.ctx.Done():
             return
-        case <-time.After(time.Second):
+        case <-time.After(2 * time.Second):
         }
     }
 }