Repository: https://github.com/crawlab-team/crawlab

feat: add node disconnection handling and update task statuses accordingly
@@ -1,13 +1,14 @@
 package constants
 
 const (
-    TaskStatusPending   = "pending"
-    TaskStatusAssigned  = "assigned"
-    TaskStatusRunning   = "running"
-    TaskStatusFinished  = "finished"
-    TaskStatusError     = "error"
-    TaskStatusCancelled = "cancelled"
-    TaskStatusAbnormal  = "abnormal"
+    TaskStatusPending          = "pending"
+    TaskStatusAssigned         = "assigned"
+    TaskStatusRunning          = "running"
+    TaskStatusFinished         = "finished"
+    TaskStatusError            = "error"
+    TaskStatusCancelled        = "cancelled"
+    TaskStatusAbnormal         = "abnormal"
+    TaskStatusNodeDisconnected = "node_disconnected"
 )
 
 const (
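The hunk above only introduces the new node_disconnected constant. As a rough, standalone illustration of how a consumer might treat the new state as recoverable rather than terminal, here is a minimal Go sketch; the isTerminal helper is hypothetical and not part of the commit:

package main

import "fmt"

// Status values mirrored from the constants hunk above; isTerminal is an
// illustrative helper (not part of the commit) that treats node_disconnected
// as a recoverable, non-final state eligible for later reconciliation.
const (
    TaskStatusRunning          = "running"
    TaskStatusFinished         = "finished"
    TaskStatusError            = "error"
    TaskStatusCancelled        = "cancelled"
    TaskStatusAbnormal         = "abnormal"
    TaskStatusNodeDisconnected = "node_disconnected"
)

func isTerminal(status string) bool {
    switch status {
    case TaskStatusFinished, TaskStatusError, TaskStatusCancelled, TaskStatusAbnormal:
        return true
    default:
        return false
    }
}

func main() {
    fmt.Println(isTerminal(TaskStatusNodeDisconnected)) // false: still eligible for reconciliation
    fmt.Println(isTerminal(TaskStatusFinished))         // true: no further handling needed
}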
@@ -28,13 +28,15 @@ import (
 
 type MasterService struct {
     // dependencies
-    cfgSvc           interfaces.NodeConfigService
-    server           *server.GrpcServer
-    taskSchedulerSvc *scheduler.Service
-    taskHandlerSvc   *handler.Service
-    scheduleSvc      *schedule.Service
-    systemSvc        *system.Service
-    healthSvc        *HealthService
+    cfgSvc                interfaces.NodeConfigService
+    server                *server.GrpcServer
+    taskSchedulerSvc      *scheduler.Service
+    taskHandlerSvc        *handler.Service
+    scheduleSvc           *schedule.Service
+    systemSvc             *system.Service
+    healthSvc             *HealthService
+    nodeMonitoringSvc     *NodeMonitoringService
+    taskReconciliationSvc *TaskReconciliationService
 
     // settings
     monitorInterval time.Duration
@@ -55,9 +57,9 @@ func (svc *MasterService) Start() {
     // start health service
     go svc.healthSvc.Start(func() bool {
         // Master-specific health check: verify gRPC server and core services are running
         return svc.server != nil &&
             svc.taskSchedulerSvc != nil &&
             svc.taskHandlerSvc != nil &&
             svc.scheduleSvc != nil
     })
 
@@ -165,15 +167,21 @@ func (svc *MasterService) Register() (err error) {
 
 func (svc *MasterService) monitor() (err error) {
     // update master node status in db
-    if err := svc.updateMasterNodeStatus(); err != nil {
+    oldStatus, newStatus, err := svc.nodeMonitoringSvc.UpdateMasterNodeStatus()
+    if err != nil {
         if errors.Is(err, mongo2.ErrNoDocuments) {
             return nil
         }
         return err
     }
 
+    // send notification if status changed
+    if utils.IsPro() && oldStatus != newStatus {
+        go svc.sendMasterStatusNotification(oldStatus, newStatus)
+    }
+
     // all worker nodes
-    workerNodes, err := svc.getAllWorkerNodes()
+    workerNodes, err := svc.nodeMonitoringSvc.GetAllWorkerNodes()
     if err != nil {
         return err
     }
@@ -199,8 +207,11 @@ func (svc *MasterService) monitor() (err error) {
                 return
             }
 
+            // handle reconnection - reconcile disconnected tasks
+            go svc.taskReconciliationSvc.HandleNodeReconnection(n)
+
             // update node available runners
-            _ = svc.updateNodeRunners(n)
+            _ = svc.nodeMonitoringSvc.UpdateNodeRunners(n)
         }(&n)
     }
 
@@ -209,49 +220,6 @@ func (svc *MasterService) monitor() (err error) {
     return nil
 }
 
-func (svc *MasterService) getAllWorkerNodes() (nodes []models.Node, err error) {
-    query := bson.M{
-        "key":    bson.M{"$ne": svc.cfgSvc.GetNodeKey()}, // not self
-        "active": true,                                   // active
-    }
-    nodes, err = service.NewModelService[models.Node]().GetMany(query, nil)
-    if err != nil {
-        if errors.Is(err, mongo2.ErrNoDocuments) {
-            return nil, nil
-        }
-        svc.Errorf("get all worker nodes error: %v", err)
-        return nil, err
-    }
-    return nodes, nil
-}
-
-func (svc *MasterService) updateMasterNodeStatus() (err error) {
-    nodeKey := svc.cfgSvc.GetNodeKey()
-    node, err := service.NewModelService[models.Node]().GetOne(bson.M{"key": nodeKey}, nil)
-    if err != nil {
-        return err
-    }
-    oldStatus := node.Status
-
-    node.Status = constants.NodeStatusOnline
-    node.Active = true
-    node.ActiveAt = time.Now()
-    newStatus := node.Status
-
-    err = service.NewModelService[models.Node]().ReplaceById(node.Id, *node)
-    if err != nil {
-        return err
-    }
-
-    if utils.IsPro() {
-        if oldStatus != newStatus {
-            go svc.sendNotification(node)
-        }
-    }
-
-    return nil
-}
-
 func (svc *MasterService) setWorkerNodeOffline(node *models.Node) {
     node.Status = constants.NodeStatusOffline
     node.Active = false
@@ -261,6 +229,10 @@ func (svc *MasterService) setWorkerNodeOffline(node *models.Node) {
     if err != nil {
         log.Errorf("failed to set worker node[%s] offline: %v", node.Key, err)
     }
 
+    // Update running tasks on the offline node to abnormal status
+    svc.taskReconciliationSvc.HandleTasksForOfflineNode(node)
+
     svc.sendNotification(node)
 }
@@ -285,25 +257,6 @@ func (svc *MasterService) pingNodeClient(n *models.Node) (ok bool) {
     return true
 }
 
-func (svc *MasterService) updateNodeRunners(node *models.Node) (err error) {
-    query := bson.M{
-        "node_id": node.Id,
-        "status":  constants.TaskStatusRunning,
-    }
-    runningTasksCount, err := service.NewModelService[models.Task]().Count(query)
-    if err != nil {
-        svc.Errorf("failed to count running tasks for node[%s]: %v", node.Key, err)
-        return err
-    }
-    node.CurrentRunners = runningTasksCount
-    err = service.NewModelService[models.Node]().ReplaceById(node.Id, *node)
-    if err != nil {
-        svc.Errorf("failed to update node runners for node[%s]: %v", node.Key, err)
-        return err
-    }
-    return nil
-}
-
 func (svc *MasterService) sendNotification(node *models.Node) {
     if !utils.IsPro() {
         return
@@ -311,17 +264,35 @@ func (svc *MasterService) sendNotification(node *models.Node) {
     go notification.GetNotificationService().SendNodeNotification(node)
 }
 
+func (svc *MasterService) sendMasterStatusNotification(oldStatus, newStatus string) {
+    if !utils.IsPro() {
+        return
+    }
+    nodeKey := svc.cfgSvc.GetNodeKey()
+    node, err := service.NewModelService[models.Node]().GetOne(bson.M{"key": nodeKey}, nil)
+    if err != nil {
+        svc.Errorf("failed to get master node for notification: %v", err)
+        return
+    }
+    go notification.GetNotificationService().SendNodeNotification(node)
+}
+
 func newMasterService() *MasterService {
+    cfgSvc := config.GetNodeConfigService()
+    server := server.GetGrpcServer()
+
     return &MasterService{
-        cfgSvc:           config.GetNodeConfigService(),
-        monitorInterval:  15 * time.Second,
-        server:           server.GetGrpcServer(),
-        taskSchedulerSvc: scheduler.GetTaskSchedulerService(),
-        taskHandlerSvc:   handler.GetTaskHandlerService(),
-        scheduleSvc:      schedule.GetScheduleService(),
-        systemSvc:        system.GetSystemService(),
-        healthSvc:        GetHealthService(),
-        Logger:           utils.NewLogger("MasterService"),
+        cfgSvc:                cfgSvc,
+        monitorInterval:       15 * time.Second,
+        server:                server,
+        taskSchedulerSvc:      scheduler.GetTaskSchedulerService(),
+        taskHandlerSvc:        handler.GetTaskHandlerService(),
+        scheduleSvc:           schedule.GetScheduleService(),
+        systemSvc:             system.GetSystemService(),
+        healthSvc:             GetHealthService(),
+        nodeMonitoringSvc:     NewNodeMonitoringService(cfgSvc),
+        taskReconciliationSvc: NewTaskReconciliationService(server),
+        Logger:                utils.NewLogger("MasterService"),
     }
 }
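Taken together, the master service hunks change the monitor loop so that a worker failing its ping gets its running tasks parked as node_disconnected (HandleTasksForOfflineNode), and a worker that answers again gets those tasks reconciled (HandleNodeReconnection). The following self-contained Go sketch is only an illustration of that policy; workerState, monitorOnce, and the naive "assume finished" reconciliation are hypothetical and not Crawlab code:

package main

import "fmt"

// workerState stands in for a worker node and its tasks; monitorOnce mimics one
// pass of the monitor loop introduced by this commit.
type workerState struct {
    key       string
    reachable bool
    tasks     map[string]string // task id -> status
}

func monitorOnce(w *workerState) {
    if !w.reachable {
        // offline: park running tasks instead of failing them outright
        for id, status := range w.tasks {
            if status == "running" {
                w.tasks[id] = "node_disconnected"
            }
        }
        return
    }
    // reachable again: reconcile parked tasks (naively assume they finished)
    for id, status := range w.tasks {
        if status == "node_disconnected" {
            w.tasks[id] = "finished"
        }
    }
}

func main() {
    w := &workerState{key: "worker-1", reachable: true, tasks: map[string]string{"t1": "running"}}

    w.reachable = false
    monitorOnce(w)
    fmt.Println(w.tasks["t1"]) // node_disconnected

    w.reachable = true
    monitorOnce(w)
    fmt.Println(w.tasks["t1"]) // finished
}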
core/node/service/node_monitoring_service.go (new file, 98 lines)
@@ -0,0 +1,98 @@
package service

import (
    "errors"
    "sync"
    "time"

    "github.com/crawlab-team/crawlab/core/constants"
    "github.com/crawlab-team/crawlab/core/interfaces"
    "github.com/crawlab-team/crawlab/core/models/models"
    "github.com/crawlab-team/crawlab/core/models/service"
    "github.com/crawlab-team/crawlab/core/utils"
    "go.mongodb.org/mongo-driver/bson"
    mongo2 "go.mongodb.org/mongo-driver/mongo"
)

// NodeMonitoringService handles monitoring of worker nodes
type NodeMonitoringService struct {
    cfgSvc interfaces.NodeConfigService
    interfaces.Logger
}

// GetAllWorkerNodes returns all active worker nodes (excluding the master node)
func (svc *NodeMonitoringService) GetAllWorkerNodes() (nodes []models.Node, err error) {
    query := bson.M{
        "key":    bson.M{"$ne": svc.cfgSvc.GetNodeKey()}, // not self
        "active": true,                                   // active
    }
    nodes, err = service.NewModelService[models.Node]().GetMany(query, nil)
    if err != nil {
        if errors.Is(err, mongo2.ErrNoDocuments) {
            return nil, nil
        }
        svc.Errorf("get all worker nodes error: %v", err)
        return nil, err
    }
    return nodes, nil
}

// UpdateMasterNodeStatus updates the master node status in the database
func (svc *NodeMonitoringService) UpdateMasterNodeStatus() (oldStatus, newStatus string, err error) {
    nodeKey := svc.cfgSvc.GetNodeKey()
    node, err := service.NewModelService[models.Node]().GetOne(bson.M{"key": nodeKey}, nil)
    if err != nil {
        return "", "", err
    }
    oldStatus = node.Status

    node.Status = constants.NodeStatusOnline
    node.Active = true
    node.ActiveAt = time.Now()
    newStatus = node.Status

    err = service.NewModelService[models.Node]().ReplaceById(node.Id, *node)
    if err != nil {
        return oldStatus, newStatus, err
    }

    return oldStatus, newStatus, nil
}

// UpdateNodeRunners updates the current runners count for a specific node
func (svc *NodeMonitoringService) UpdateNodeRunners(node *models.Node) (err error) {
    query := bson.M{
        "node_id": node.Id,
        "status":  constants.TaskStatusRunning,
    }
    runningTasksCount, err := service.NewModelService[models.Task]().Count(query)
    if err != nil {
        svc.Errorf("failed to count running tasks for node[%s]: %v", node.Key, err)
        return err
    }
    node.CurrentRunners = runningTasksCount
    err = service.NewModelService[models.Node]().ReplaceById(node.Id, *node)
    if err != nil {
        svc.Errorf("failed to update node runners for node[%s]: %v", node.Key, err)
        return err
    }
    return nil
}

func NewNodeMonitoringService(cfgSvc interfaces.NodeConfigService) *NodeMonitoringService {
    return &NodeMonitoringService{
        cfgSvc: cfgSvc,
        Logger: utils.NewLogger("NodeMonitoringService"),
    }
}

// Singleton pattern
var nodeMonitoringService *NodeMonitoringService
var nodeMonitoringServiceOnce sync.Once

func GetNodeMonitoringService() *NodeMonitoringService {
    nodeMonitoringServiceOnce.Do(func() {
        nodeMonitoringService = NewNodeMonitoringService(nil) // Will be set by the master service
    })
    return nodeMonitoringService
}
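A side note on the getter at the end of this new file: GetNodeMonitoringService lazily builds the singleton with a nil config service, and the commit instead wires the dependency explicitly through NewNodeMonitoringService(cfgSvc) in newMasterService. Here is a minimal standalone sketch of that sync.Once pattern; the names are illustrative and not Crawlab code:

package main

import (
    "fmt"
    "sync"
)

// A stripped-down version of the lazily-initialized singleton used above. The
// package-level getter builds the instance without its dependency; callers that
// need the dependency construct the service directly instead.
type monitor struct{ cfg string }

var (
    instance *monitor
    once     sync.Once
)

func getMonitor() *monitor {
    once.Do(func() {
        instance = &monitor{cfg: ""} // dependency left empty, to be supplied by the caller
    })
    return instance
}

func main() {
    m1, m2 := getMonitor(), getMonitor()
    fmt.Println(m1 == m2) // true: the same instance is returned every time
}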
core/node/service/task_reconciliation_service.go (new file, 258 lines)
@@ -0,0 +1,258 @@
package service

import (
    "context"
    "sync"
    "time"

    "github.com/cenkalti/backoff/v4"
    "github.com/crawlab-team/crawlab/core/constants"
    "github.com/crawlab-team/crawlab/core/grpc/server"
    "github.com/crawlab-team/crawlab/core/interfaces"
    "github.com/crawlab-team/crawlab/core/models/models"
    "github.com/crawlab-team/crawlab/core/models/service"
    "github.com/crawlab-team/crawlab/core/utils"
    "github.com/crawlab-team/crawlab/grpc"
    "go.mongodb.org/mongo-driver/bson"
    "go.mongodb.org/mongo-driver/bson/primitive"
)

// TaskReconciliationService handles task status reconciliation for node disconnection scenarios
type TaskReconciliationService struct {
    server *server.GrpcServer
    interfaces.Logger
}

// HandleTasksForOfflineNode updates all running tasks on an offline node to abnormal status
func (svc *TaskReconciliationService) HandleTasksForOfflineNode(node *models.Node) {
    // Find all running tasks on the offline node
    query := bson.M{
        "node_id": node.Id,
        "status":  constants.TaskStatusRunning,
    }

    runningTasks, err := service.NewModelService[models.Task]().GetMany(query, nil)
    if err != nil {
        svc.Errorf("failed to get running tasks for offline node[%s]: %v", node.Key, err)
        return
    }

    if len(runningTasks) == 0 {
        svc.Debugf("no running tasks found for offline node[%s]", node.Key)
        return
    }

    svc.Infof("updating %d running tasks to abnormal status for offline node[%s]", len(runningTasks), node.Key)

    // Update each task status to node_disconnected (recoverable)
    for _, task := range runningTasks {
        task.Status = constants.TaskStatusNodeDisconnected
        task.Error = "Task temporarily disconnected due to worker node offline"

        // Update the task in database
        err := backoff.Retry(func() error {
            return service.NewModelService[models.Task]().ReplaceById(task.Id, task)
        }, backoff.WithMaxRetries(backoff.NewConstantBackOff(500*time.Millisecond), 3))

        if err != nil {
            svc.Errorf("failed to update task[%s] status for offline node[%s]: %v", task.Id.Hex(), node.Key, err)
        } else {
            svc.Debugf("updated task[%s] status to abnormal for offline node[%s]", task.Id.Hex(), node.Key)
        }
    }
}

// HandleNodeReconnection reconciles tasks that were marked as disconnected when the node comes back online
func (svc *TaskReconciliationService) HandleNodeReconnection(node *models.Node) {
    // Find all disconnected tasks on this node
    query := bson.M{
        "node_id": node.Id,
        "status":  constants.TaskStatusNodeDisconnected,
    }

    disconnectedTasks, err := service.NewModelService[models.Task]().GetMany(query, nil)
    if err != nil {
        svc.Errorf("failed to get disconnected tasks for reconnected node[%s]: %v", node.Key, err)
        return
    }

    if len(disconnectedTasks) == 0 {
        svc.Debugf("no disconnected tasks found for reconnected node[%s]", node.Key)
        return
    }

    svc.Infof("reconciling %d disconnected tasks for reconnected node[%s]", len(disconnectedTasks), node.Key)

    // For each disconnected task, try to get its actual status from the worker node
    for _, task := range disconnectedTasks {
        actualStatus, err := svc.GetActualTaskStatusFromWorker(node, &task)
        if err != nil {
            svc.Warnf("failed to get actual status for task[%s] from reconnected node[%s]: %v", task.Id.Hex(), node.Key, err)
            // If we can't determine the actual status, mark as abnormal after reconnection failure
            task.Status = constants.TaskStatusAbnormal
            task.Error = "Could not reconcile task status after node reconnection"
        } else {
            // Update with actual status from worker
            task.Status = actualStatus
            switch actualStatus {
            case constants.TaskStatusFinished:
                task.Error = "" // Clear error message for successfully completed tasks
            case constants.TaskStatusError:
                task.Error = "Task encountered an error during node disconnection"
            }
        }

        // Update the task in database
        err = backoff.Retry(func() error {
            return service.NewModelService[models.Task]().ReplaceById(task.Id, task)
        }, backoff.WithMaxRetries(backoff.NewConstantBackOff(500*time.Millisecond), 3))

        if err != nil {
            svc.Errorf("failed to update reconciled task[%s] status for node[%s]: %v", task.Id.Hex(), node.Key, err)
        } else {
            svc.Infof("reconciled task[%s] status from 'node_disconnected' to '%s' for node[%s]", task.Id.Hex(), task.Status, node.Key)
        }
    }
}

// GetActualTaskStatusFromWorker queries the worker node to get the actual status of a task
func (svc *TaskReconciliationService) GetActualTaskStatusFromWorker(node *models.Node, task *models.Task) (status string, err error) {
    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
    defer cancel()

    // Check if there's an active stream for this task
    _, hasActiveStream := svc.server.TaskSvr.GetSubscribeStream(task.Id)

    // Check if the node is still connected via subscription
    nodeStream, nodeConnected := svc.server.NodeSvr.GetSubscribeStream(node.Id)
    if !nodeConnected {
        svc.Warnf("node[%s] is not connected, using fallback detection for task[%s]", node.Key, task.Id.Hex())
        return svc.inferTaskStatusFromStream(task.Id, hasActiveStream), nil
    }

    // Try to get more accurate status by checking recent task activity
    actualStatus, err := svc.detectTaskStatusFromActivity(task, hasActiveStream)
    if err != nil {
        svc.Warnf("failed to detect task status from activity for task[%s]: %v", task.Id.Hex(), err)
        return svc.inferTaskStatusFromStream(task.Id, hasActiveStream), nil
    }

    // Ping the node to verify it's responsive
    if nodeStream != nil {
        select {
        case <-ctx.Done():
            svc.Warnf("timeout while pinging node[%s] for task[%s]", node.Key, task.Id.Hex())
            return svc.inferTaskStatusFromStream(task.Id, hasActiveStream), nil
        default:
            // Send a heartbeat to verify node responsiveness
            err := nodeStream.Send(&grpc.NodeServiceSubscribeResponse{
                Code: grpc.NodeServiceSubscribeCode_HEARTBEAT,
            })
            if err != nil {
                svc.Warnf("failed to ping node[%s] for task status check: %v", node.Key, err)
                return svc.inferTaskStatusFromStream(task.Id, hasActiveStream), nil
            }
        }
    }

    return actualStatus, nil
}

// detectTaskStatusFromActivity analyzes task activity to determine its actual status
func (svc *TaskReconciliationService) detectTaskStatusFromActivity(task *models.Task, hasActiveStream bool) (string, error) {
    // Check if task has been updated recently (within last 30 seconds)
    if time.Since(task.UpdatedAt) < 30*time.Second {
        // Task was recently updated, likely still active
        if hasActiveStream {
            return constants.TaskStatusRunning, nil
        }
        // Recently updated but no stream - check if it finished
        return svc.checkTaskCompletion(task), nil
    }

    // Task hasn't been updated recently
    if !hasActiveStream {
        // No stream and no recent activity - likely finished or failed
        return svc.checkTaskCompletion(task), nil
    }

    // Has stream but no recent updates - might be stuck
    return constants.TaskStatusRunning, nil
}

// checkTaskCompletion determines if a task completed successfully or failed
func (svc *TaskReconciliationService) checkTaskCompletion(task *models.Task) string {
    // Refresh task from database to get latest status
    latestTask, err := service.NewModelService[models.Task]().GetById(task.Id)
    if err != nil {
        svc.Warnf("failed to refresh task[%s] from database: %v", task.Id.Hex(), err)
        return constants.TaskStatusError
    }

    // If task status was already updated to a final state, return that
    switch latestTask.Status {
    case constants.TaskStatusFinished, constants.TaskStatusError, constants.TaskStatusCancelled:
        return latestTask.Status
    case constants.TaskStatusRunning:
        // Task still shows as running but has no active stream - likely finished
        if latestTask.Error != "" {
            return constants.TaskStatusError
        }
        return constants.TaskStatusFinished
    default:
        // Unknown or intermediate status
        return constants.TaskStatusError
    }
}

// inferTaskStatusFromStream provides a fallback status inference based on stream presence
func (svc *TaskReconciliationService) inferTaskStatusFromStream(taskId primitive.ObjectID, hasActiveStream bool) string {
    if !hasActiveStream {
        // No active stream could mean:
        // 1. Task finished successfully
        // 2. Task failed and stream was closed
        // 3. Worker disconnected ungracefully
        //
        // To determine which, we should check the task in the database
        task, err := service.NewModelService[models.Task]().GetById(taskId)
        if err != nil {
            // If we can't find the task, assume it's in an error state
            return constants.TaskStatusError
        }

        // If the task was last seen running and now has no stream,
        // it likely finished or errored
        switch task.Status {
        case constants.TaskStatusRunning:
            // Task was running but stream is gone - likely finished
            return constants.TaskStatusFinished
        case constants.TaskStatusPending, constants.TaskStatusAssigned:
            // Task never started running - likely error
            return constants.TaskStatusError
        default:
            // Return the last known status
            return task.Status
        }
    }

    // Stream exists, so task is likely still running
    return constants.TaskStatusRunning
}

func NewTaskReconciliationService(server *server.GrpcServer) *TaskReconciliationService {
    return &TaskReconciliationService{
        server: server,
        Logger: utils.NewLogger("TaskReconciliationService"),
    }
}

// Singleton pattern
var taskReconciliationService *TaskReconciliationService
var taskReconciliationServiceOnce sync.Once

func GetTaskReconciliationService() *TaskReconciliationService {
    taskReconciliationServiceOnce.Do(func() {
        taskReconciliationService = NewTaskReconciliationService(nil) // Will be set by the master service
    })
    return taskReconciliationService
}
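The reconciliation heuristics above boil down to a small decision table over "recently updated" and "has an active stream". The following standalone Go sketch restates that table as a pure function for clarity; inferStatus and its inputs are hypothetical simplifications, not the commit's code:

package main

import (
    "fmt"
    "time"
)

// inferStatus mirrors the combined behavior of detectTaskStatusFromActivity,
// checkTaskCompletion, and inferTaskStatusFromStream in a simplified form.
func inferStatus(updatedAt time.Time, hasActiveStream bool, storedStatus string) string {
    recentlyUpdated := time.Since(updatedAt) < 30*time.Second

    // Recent activity with an open stream, or an open stream with stale updates,
    // both resolve to "running" in the service above.
    if hasActiveStream && recentlyUpdated {
        return "running"
    }
    if hasActiveStream {
        return "running" // stream open but stale: benefit of the doubt
    }

    // No stream: fall back to the last stored state.
    switch storedStatus {
    case "finished", "error", "cancelled":
        return storedStatus
    case "running":
        return "finished" // was running, stream gone: assume it completed
    default:
        return "error" // pending/assigned or unknown
    }
}

func main() {
    fmt.Println(inferStatus(time.Now(), true, "running"))                      // running
    fmt.Println(inferStatus(time.Now().Add(-5*time.Minute), false, "running")) // finished
    fmt.Println(inferStatus(time.Now().Add(-5*time.Minute), false, "pending")) // error
}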
@@ -6,6 +6,7 @@ import {
   TASK_STATUS_CANCELLED,
   TASK_STATUS_ERROR,
   TASK_STATUS_FINISHED,
+  TASK_STATUS_NODE_DISCONNECTED,
   TASK_STATUS_PENDING,
   TASK_STATUS_RUNNING,
 } from '@/constants/task';
@@ -81,6 +82,14 @@ const data = computed<TagProps>(() => {
         type: 'info',
         icon: ['fa', 'exclamation'],
       };
+    case TASK_STATUS_NODE_DISCONNECTED:
+      return {
+        label: t('components.task.status.label.nodeDisconnected'),
+        tooltip: t('components.task.status.tooltip.nodeDisconnected'),
+        type: 'warning',
+        icon: ['fa', 'unlink'],
+        spinning: true,
+      };
     default:
       return {
         label: t('components.task.status.label.unknown'),
@@ -5,6 +5,7 @@ export const TASK_STATUS_FINISHED = 'finished';
 export const TASK_STATUS_ERROR = 'error';
 export const TASK_STATUS_CANCELLED = 'cancelled';
 export const TASK_STATUS_ABNORMAL = 'abnormal';
+export const TASK_STATUS_NODE_DISCONNECTED = 'node_disconnected';
 
 export const TASK_MODE_RANDOM = 'random';
 export const TASK_MODE_ALL_NODES = 'all-nodes';
@@ -38,6 +38,7 @@ const task: LComponentsTask = {
       error: 'Error',
       cancelled: 'Cancelled',
       abnormal: 'Abnormal',
+      nodeDisconnected: 'Node Disconnected',
       unknown: 'Unknown',
     },
     tooltip: {
@@ -48,6 +49,7 @@ const task: LComponentsTask = {
       error: 'Task ended with an error:',
       cancelled: 'Task has been cancelled',
       abnormal: 'Task ended abnormally',
+      nodeDisconnected: 'Task temporarily disconnected due to worker node offline',
       unknown: 'Unknown task status',
     },
   },
@@ -38,6 +38,7 @@ const task: LComponentsTask = {
      error: '错误',
      cancelled: '已取消',
      abnormal: '异常',
+      nodeDisconnected: '节点断开',
      unknown: '未知',
     },
     tooltip: {
@@ -48,6 +49,7 @@ const task: LComponentsTask = {
      error: '任务发生错误:',
      cancelled: '任务已被取消',
      abnormal: '任务异常终止',
+      nodeDisconnected: '任务因工作节点离线而临时断开',
      unknown: '未知任务状态',
     },
   },
@@ -38,6 +38,7 @@ interface LComponentsTask {
       error: string;
       cancelled: string;
       abnormal: string;
+      nodeDisconnected: string;
       unknown: string;
     };
     tooltip: {
@@ -48,6 +49,7 @@ interface LComponentsTask {
       error: string;
       cancelled: string;
       abnormal: string;
+      nodeDisconnected: string;
       unknown: string;
     };
   };
@@ -50,6 +50,7 @@ export declare global {
     | 'cancelled'
     | 'error'
     | 'finished'
+    | 'node_disconnected'
     | 'running'
     | 'assigned'
     | 'pending';
@@ -7,6 +7,7 @@ import {
   TASK_STATUS_CANCELLED,
   TASK_STATUS_ERROR,
   TASK_STATUS_FINISHED,
+  TASK_STATUS_NODE_DISCONNECTED,
   TASK_STATUS_PENDING,
   TASK_STATUS_RUNNING,
 } from '@/constants/task';
@@ -117,6 +118,10 @@ export const getStatusOptions = (): SelectOption[] => {
       label: t('components.task.status.label.abnormal'),
       value: TASK_STATUS_ABNORMAL,
     },
+    {
+      label: t('components.task.status.label.nodeDisconnected'),
+      value: TASK_STATUS_NODE_DISCONNECTED,
+    },
   ];
 };