feat: add PING message handling for connection health checks

- Implemented PING message handling in TaskServiceServer to acknowledge health check pings. - Updated isConnectionHealthy method in Runner to use a non-blocking approach for health checks, preventing interference with log streams. - Introduced lastConnCheck timestamp to optimize health check frequency based on recent activity. - Added PING code to TaskServiceConnectCode enum in proto definition and generated files. - Updated gRPC client and server interfaces to support new PING functionality.
2026-01-25 17:42:25 +01:00 · 2025-09-12 13:58:16 +08:00
parent 333dfd44c0 3edd2a1210
commit d39c265483
6 changed files with 309 additions and 128 deletions
--- a/core/task/handler/runner.go
+++ b/core/task/handler/runner.go
@@ -590,11 +590,13 @@ func (r *Runner) monitorConnectionHealth() {
 }

 // isConnectionHealthy checks if the gRPC connection is still healthy
+// Uses a non-blocking approach to prevent interfering with log streams
 func (r *Runner) isConnectionHealthy() bool {
 	r.connMutex.RLock()
-	defer r.connMutex.RUnlock()
+	conn := r.conn
+	r.connMutex.RUnlock()

-	if r.conn == nil {
+	if conn == nil {
 		return false
 	}

@@ -606,31 +608,53 @@ func (r *Runner) isConnectionHealthy() bool {
 	default:
 	}

-	// Try to send a ping-like message to test connection with timeout
-	// Use a simple log message as ping since PING code doesn't exist
-	testMsg := &grpc.TaskServiceConnectRequest{
-		Code:   grpc.TaskServiceConnectCode_INSERT_LOGS,
-		TaskId: r.tid.Hex(),
-		Data:   []byte(`["[HEALTH CHECK] connection test"]`),
+	// FIXED: Use a completely non-blocking approach to prevent stream interference
+	// Instead of sending data that could block the log stream, just check connection state
+	// and use timing-based health assessment
+
+	// Check if we've had recent successful operations
+	timeSinceLastCheck := time.Since(r.lastConnCheck)
+
+	// If we haven't checked recently, consider it healthy if not too old
+	// This prevents health checks from interfering with active log streaming
+	if timeSinceLastCheck < 2*time.Minute {
+		r.Debugf("connection considered healthy based on recent activity")
+		return true
 	}

-	// Use a channel to make the Send operation timeout-aware
+	// For older connections, try a non-blocking ping only if no active log streaming
+	// This is a compromise to avoid blocking the critical log data flow
+	pingMsg := &grpc.TaskServiceConnectRequest{
+		Code:   grpc.TaskServiceConnectCode_PING,
+		TaskId: r.tid.Hex(),
+		Data:   nil,
+	}
+
+	// Use a very short timeout and non-blocking approach
 	done := make(chan error, 1)
 	go func() {
-		done <- r.conn.Send(testMsg)
+		// Re-acquire lock only for the send operation
+		r.connMutex.RLock()
+		defer r.connMutex.RUnlock()
+		if r.conn != nil {
+			done <- r.conn.Send(pingMsg)
+		} else {
+			done <- fmt.Errorf("connection is nil")
+		}
 	}()

-	// Wait for either completion or timeout
+	// Very short timeout to prevent blocking log operations
 	select {
 	case err := <-done:
 		if err != nil {
 			r.Debugf("connection health check failed: %v", err)
 			return false
 		}
+		r.Debugf("connection health check successful")
 		return true
-	case <-time.After(5 * time.Second):
-		r.Debugf("connection health check timed out")
-		return false
+	case <-time.After(1 * time.Second): // Much shorter timeout
+		r.Debugf("connection health check timed out quickly - assume healthy to avoid blocking logs")
+		return true // Assume healthy to avoid disrupting log flow
 	case <-r.ctx.Done():
 		r.Debugf("connection health check cancelled")
 		return false
@@ -752,12 +776,12 @@ func (r *Runner) sendNotification() {
 		r.Errorf("failed to get task client: %v", err)
 		return
 	}
-	
+
 	// Use independent context for async notification - prevents cancellation due to task lifecycle
 	// This ensures notifications are sent even if the task runner is being cleaned up
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
-	
+
 	_, err = taskClient.SendNotification(ctx, req)
 	if err != nil {
 		if !errors.Is(ctx.Err(), context.DeadlineExceeded) {
--- a/core/task/handler/runner_ipc.go
+++ b/core/task/handler/runner_ipc.go
@@ -157,6 +157,9 @@ func (r *Runner) handleIPCInsertDataMessage(ipcMsg entity.IPCMessage) {
 			}
 			return
 		}
+
+		// Update last successful connection time to help health check avoid unnecessary pings
+		r.lastConnCheck = time.Now()
 	}
 }