fix(node/master): add failure counter and grace period to node health checks; increase monitor interval

2026-01-27 17:50:53 +01:00 · 2025-10-23 14:49:30 +08:00
parent ef70312430
commit b8e62c7b6b
1 changed files with 39 additions and 13 deletions
--- a/core/node/service/master_service.go
+++ b/core/node/service/master_service.go
@@ -42,6 +42,10 @@ type MasterService struct {
 	// settings
 	monitorInterval time.Duration

+	// node health tracking - grace period before marking offline
+	nodeFailureCount map[primitive.ObjectID]int
+	nodeFailureMux   sync.RWMutex
+
 	// internals
 	interfaces.Logger
 }
@@ -200,21 +204,27 @@ func (svc *MasterService) monitor() (err error) {
 		go func(n *models.Node) {
 			defer wg.Done()

-			// subscribe
-			ok := svc.subscribeNode(n)
-			if !ok {
-				go svc.setWorkerNodeOffline(n)
+			// Check node health: subscribe and ping
+			subscribeOk := svc.subscribeNode(n)
+			pingOk := svc.pingNodeClient(n)
+			healthCheckPassed := subscribeOk && pingOk
+
+			if !healthCheckPassed {
+				// Health check failed - increment failure counter
+				failures := svc.incrementNodeFailureCount(n.Id)
+				svc.Debugf("worker node[%s] health check failed (failures: %d)", n.Key, failures)
+
+				// Only mark offline after consecutive failures (grace period)
+				// This prevents flapping during brief reconnection windows
+				if failures >= 2 {
+					svc.Infof("worker node[%s] failed %d consecutive health checks, marking offline", n.Key, failures)
+					go svc.setWorkerNodeOffline(n)
+				}
 				return
 			}

-			// ping client
-			ok = svc.pingNodeClient(n)
-			if !ok {
-				go svc.setWorkerNodeOffline(n)
-				return
-			}
-
-			// if both subscribe and ping succeed, ensure node is marked as online
+			// Health check passed - reset failure counter and mark online
+			svc.resetNodeFailureCount(n.Id)
 			go svc.setWorkerNodeOnline(n)

 			// handle reconnection - reconcile disconnected tasks
@@ -329,13 +339,29 @@ func (svc *MasterService) monitorGrpcClientHealth() {
 	}
 }

+// incrementNodeFailureCount increments the failure counter for a node and returns the new count
+func (svc *MasterService) incrementNodeFailureCount(nodeId primitive.ObjectID) int {
+	svc.nodeFailureMux.Lock()
+	defer svc.nodeFailureMux.Unlock()
+	svc.nodeFailureCount[nodeId]++
+	return svc.nodeFailureCount[nodeId]
+}
+
+// resetNodeFailureCount resets the failure counter for a node to zero
+func (svc *MasterService) resetNodeFailureCount(nodeId primitive.ObjectID) {
+	svc.nodeFailureMux.Lock()
+	defer svc.nodeFailureMux.Unlock()
+	delete(svc.nodeFailureCount, nodeId)
+}
+
 func newMasterService() *MasterService {
 	cfgSvc := config.GetNodeConfigService()
 	server := server.GetGrpcServer()

 	return &MasterService{
 		cfgSvc:                cfgSvc,
-		monitorInterval:       15 * time.Second,
+		monitorInterval:       20 * time.Second, // Increased from 15s to give more reconnection time
+		nodeFailureCount:      make(map[primitive.ObjectID]int),
 		server:                server,
 		taskSchedulerSvc:      scheduler.GetTaskSchedulerService(),
 		taskHandlerSvc:        handler.GetTaskHandlerService(),