mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-25 17:42:25 +01:00
feat: add PING message handling for connection health checks
- Implemented PING message handling in TaskServiceServer to acknowledge health check pings. - Updated isConnectionHealthy method in Runner to use a non-blocking approach for health checks, preventing interference with log streams. - Introduced lastConnCheck timestamp to optimize health check frequency based on recent activity. - Added PING code to TaskServiceConnectCode enum in proto definition and generated files. - Updated gRPC client and server interfaces to support new PING functionality.
This commit is contained in:
@@ -590,11 +590,13 @@ func (r *Runner) monitorConnectionHealth() {
|
||||
}
|
||||
|
||||
// isConnectionHealthy checks if the gRPC connection is still healthy
|
||||
// Uses a non-blocking approach to prevent interfering with log streams
|
||||
func (r *Runner) isConnectionHealthy() bool {
|
||||
r.connMutex.RLock()
|
||||
defer r.connMutex.RUnlock()
|
||||
conn := r.conn
|
||||
r.connMutex.RUnlock()
|
||||
|
||||
if r.conn == nil {
|
||||
if conn == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -606,31 +608,53 @@ func (r *Runner) isConnectionHealthy() bool {
|
||||
default:
|
||||
}
|
||||
|
||||
// Try to send a ping-like message to test connection with timeout
|
||||
// Use a simple log message as ping since PING code doesn't exist
|
||||
testMsg := &grpc.TaskServiceConnectRequest{
|
||||
Code: grpc.TaskServiceConnectCode_INSERT_LOGS,
|
||||
TaskId: r.tid.Hex(),
|
||||
Data: []byte(`["[HEALTH CHECK] connection test"]`),
|
||||
// FIXED: Use a completely non-blocking approach to prevent stream interference
|
||||
// Instead of sending data that could block the log stream, just check connection state
|
||||
// and use timing-based health assessment
|
||||
|
||||
// Check if we've had recent successful operations
|
||||
timeSinceLastCheck := time.Since(r.lastConnCheck)
|
||||
|
||||
// If we haven't checked recently, consider it healthy if not too old
|
||||
// This prevents health checks from interfering with active log streaming
|
||||
if timeSinceLastCheck < 2*time.Minute {
|
||||
r.Debugf("connection considered healthy based on recent activity")
|
||||
return true
|
||||
}
|
||||
|
||||
// Use a channel to make the Send operation timeout-aware
|
||||
// For older connections, try a non-blocking ping only if no active log streaming
|
||||
// This is a compromise to avoid blocking the critical log data flow
|
||||
pingMsg := &grpc.TaskServiceConnectRequest{
|
||||
Code: grpc.TaskServiceConnectCode_PING,
|
||||
TaskId: r.tid.Hex(),
|
||||
Data: nil,
|
||||
}
|
||||
|
||||
// Use a very short timeout and non-blocking approach
|
||||
done := make(chan error, 1)
|
||||
go func() {
|
||||
done <- r.conn.Send(testMsg)
|
||||
// Re-acquire lock only for the send operation
|
||||
r.connMutex.RLock()
|
||||
defer r.connMutex.RUnlock()
|
||||
if r.conn != nil {
|
||||
done <- r.conn.Send(pingMsg)
|
||||
} else {
|
||||
done <- fmt.Errorf("connection is nil")
|
||||
}
|
||||
}()
|
||||
|
||||
// Wait for either completion or timeout
|
||||
// Very short timeout to prevent blocking log operations
|
||||
select {
|
||||
case err := <-done:
|
||||
if err != nil {
|
||||
r.Debugf("connection health check failed: %v", err)
|
||||
return false
|
||||
}
|
||||
r.Debugf("connection health check successful")
|
||||
return true
|
||||
case <-time.After(5 * time.Second):
|
||||
r.Debugf("connection health check timed out")
|
||||
return false
|
||||
case <-time.After(1 * time.Second): // Much shorter timeout
|
||||
r.Debugf("connection health check timed out quickly - assume healthy to avoid blocking logs")
|
||||
return true // Assume healthy to avoid disrupting log flow
|
||||
case <-r.ctx.Done():
|
||||
r.Debugf("connection health check cancelled")
|
||||
return false
|
||||
@@ -752,12 +776,12 @@ func (r *Runner) sendNotification() {
|
||||
r.Errorf("failed to get task client: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
// Use independent context for async notification - prevents cancellation due to task lifecycle
|
||||
// This ensures notifications are sent even if the task runner is being cleaned up
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
|
||||
_, err = taskClient.SendNotification(ctx, req)
|
||||
if err != nil {
|
||||
if !errors.Is(ctx.Err(), context.DeadlineExceeded) {
|
||||
|
||||
@@ -157,6 +157,9 @@ func (r *Runner) handleIPCInsertDataMessage(ipcMsg entity.IPCMessage) {
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Update last successful connection time to help health check avoid unnecessary pings
|
||||
r.lastConnCheck = time.Now()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user