feat: implement zombie process prevention and cleanup mechanisms in task runner

This commit is contained in:
Marvin Zhang
2025-06-23 13:54:43 +08:00
parent 1008886715
commit 89514b0154
4 changed files with 840 additions and 85 deletions

View File

@@ -15,11 +15,13 @@ import (
"runtime"
"strings"
"sync"
"syscall"
"time"
"github.com/crawlab-team/crawlab/core/dependency"
"github.com/crawlab-team/crawlab/core/fs"
"github.com/hashicorp/go-multierror"
"github.com/shirou/gopsutil/process"
"github.com/crawlab-team/crawlab/core/models/models"
@@ -219,6 +221,9 @@ func (r *Runner) Run() (err error) {
// Start IPC handler
go r.handleIPC()
// ZOMBIE PREVENTION: Start zombie process monitor
go r.startZombieMonitor()
// Ensure cleanup when Run() exits
defer func() {
// 1. Signal all goroutines to stop
@@ -336,6 +341,15 @@ func (r *Runner) configureCmd() (err error) {
// set working directory
r.cmd.Dir = r.cwd
// ZOMBIE PREVENTION: Set process group to enable proper cleanup of child processes
if runtime.GOOS != "windows" {
// Create new process group on Unix systems to ensure child processes can be killed together
r.cmd.SysProcAttr = &syscall.SysProcAttr{
Setpgid: true, // Create new process group
Pgid: 0, // Use process ID as process group ID
}
}
// Configure pipes for IPC and logs
r.stdinPipe, err = r.cmd.StdinPipe()
if err != nil {
@@ -727,6 +741,8 @@ func (r *Runner) wait() (err error) {
case constants.TaskSignalLost:
err = constants.ErrTaskLost
status = constants.TaskStatusError
// ZOMBIE PREVENTION: Clean up any remaining processes when task is lost
go r.cleanupOrphanedProcesses()
default:
err = constants.ErrInvalidSignal
status = constants.TaskStatusError
@@ -1492,3 +1508,151 @@ func (r *Runner) GetConnectionStats() map[string]interface{} {
"connection_exists": r.conn != nil,
}
}
// ZOMBIE PROCESS PREVENTION METHODS
// cleanupOrphanedProcesses makes a best-effort attempt to terminate every
// process that still belongs to this task. It works in three stages:
//
//  1. on non-Windows platforms, signal the task's process group;
//  2. if the main PID is still reported as existing, kill it through r.cmd;
//  3. sweep the process list for leftovers tagged with this task's ID.
//
// Failures are logged and never returned, so the method is safe to launch
// in its own goroutine.
func (r *Runner) cleanupOrphanedProcesses() {
	r.Warnf("cleaning up orphaned processes for task %s (PID: %d)", r.tid.Hex(), r.pid)

	// Without a recorded PID there is nothing to target.
	if r.pid <= 0 {
		r.Debugf("no PID to clean up")
		return
	}

	// Stage 1: process groups are only used on non-Windows platforms.
	switch runtime.GOOS {
	case "windows":
		// No process-group signalling on Windows.
	default:
		r.killProcessGroup()
	}

	// Stage 2: the main process may have survived the group signal.
	if stillAlive := utils.ProcessIdExists(r.pid); stillAlive {
		r.Warnf("forcefully killing remaining process %d", r.pid)
		haveHandle := r.cmd != nil && r.cmd.Process != nil
		if haveHandle {
			// The second argument presumably requests a forced kill — confirm in utils.
			killErr := utils.KillProcess(r.cmd, true)
			if killErr != nil {
				r.Errorf("failed to force kill process: %v", killErr)
			}
		}
	}

	// Stage 3: catch processes that are not reachable via the main PID.
	r.scanAndKillChildProcesses()
}
// killProcessGroup asks the process group whose ID equals r.pid to terminate
// (Unix only). It sends SIGTERM to the group and escalates to SIGKILL only
// when SIGTERM could not be delivered for a reason other than the group
// having already disappeared.
//
// The method does not wait for the group to exit and reports nothing to the
// caller; every outcome is written to the debug log.
func (r *Runner) killProcessGroup() {
	// PIDs <= 0 mean "no process". PID 1 is rejected as well: negating it
	// yields -1, which kill(2) interprets as "every process the caller is
	// allowed to signal" rather than a single group.
	if r.pid <= 1 {
		return
	}

	r.Debugf("attempting to kill process group for PID %d", r.pid)

	// A negative PID addresses the whole process group.
	pgid := -r.pid

	err := syscall.Kill(pgid, syscall.SIGTERM)
	if err == nil {
		r.Debugf("successfully sent SIGTERM to process group %d", r.pid)
		return
	}

	// ESRCH means the group no longer exists. Retrying with SIGKILL would
	// fail the same way, so treat it as "nothing left to kill".
	// syscall.Kill returns a bare syscall.Errno, so direct comparison works.
	if err == syscall.ESRCH {
		r.Debugf("process group %d no longer exists, nothing to kill", r.pid)
		return
	}

	r.Debugf("failed to send SIGTERM to process group: %v", err)

	// Try SIGKILL as a last resort.
	if err = syscall.Kill(pgid, syscall.SIGKILL); err != nil {
		r.Debugf("failed to send SIGKILL to process group: %v", err)
	}
}
// scanAndKillChildProcesses walks the process list returned by
// utils.GetProcesses and kills every process whose environment carries this
// task's CRAWLAB_TASK_ID entry. Individual kill failures are logged and do
// not stop the sweep; a summary line is logged at the end.
func (r *Runner) scanAndKillChildProcesses() {
	r.Debugf("scanning for orphaned child processes of task %s", r.tid.Hex())

	procs, listErr := utils.GetProcesses()
	if listErr != nil {
		r.Errorf("failed to get process list: %v", listErr)
		return
	}

	// Exact environment entry that marks a process as belonging to this task.
	marker := "CRAWLAB_TASK_ID=" + r.tid.Hex()

	killed := 0
	for _, p := range procs {
		if !r.isTaskRelatedProcess(p, marker) {
			continue
		}

		pid := int(p.Pid)
		r.Warnf("found orphaned task process PID %d, killing it", pid)

		if killErr := p.Kill(); killErr != nil {
			r.Errorf("failed to kill orphaned process %d: %v", pid, killErr)
			continue
		}
		killed++
		r.Infof("successfully killed orphaned process %d", pid)
	}

	if killed == 0 {
		r.Debugf("no orphaned processes found for task %s", r.tid.Hex())
		return
	}
	r.Infof("cleaned up %d orphaned processes for task %s", killed, r.tid.Hex())
}
// isTaskRelatedProcess reports whether proc's environment contains an entry
// exactly equal to taskIdEnv (a full "KEY=value" string). A process whose
// environment cannot be read is treated as unrelated.
func (r *Runner) isTaskRelatedProcess(proc *process.Process, taskIdEnv string) bool {
	vars, err := proc.Environ()
	if err != nil {
		// Unreadable environment: skip this process instead of guessing.
		return false
	}

	matched := false
	for i := 0; i < len(vars) && !matched; i++ {
		matched = vars[i] == taskIdEnv
	}
	return matched
}
// startZombieMonitor registers one unit on r.wg and launches a goroutine that
// calls checkForZombieProcesses every five minutes until r.ctx is cancelled.
// It returns immediately; the goroutine releases the wait-group unit on exit.
//
// NOTE(review): Run invokes this as `go r.startZombieMonitor()`, so the
// r.wg.Add call below executes on a separate goroutine and is not ordered
// before a later r.wg.Wait(). Since this method already spawns its own
// goroutine, calling it synchronously looks sufficient — confirm against
// Run's shutdown path.
func (r *Runner) startZombieMonitor() {
	// Interval between zombie checks.
	const checkInterval = 5 * time.Minute

	monitor := func() {
		defer r.wg.Done()

		tick := time.NewTicker(checkInterval)
		defer tick.Stop()

		for {
			select {
			case <-tick.C:
				r.checkForZombieProcesses()
			case <-r.ctx.Done():
				return
			}
		}
	}

	r.wg.Add(1)
	go monitor()
}
// checkForZombieProcesses performs a single check of the task's main process.
// If the PID is still reported as existing and its status reads "Z" or
// "zombie", it logs a warning and starts cleanupOrphanedProcesses in a new
// goroutine. Lookup or status errors end the check silently.
//
// NOTE(review): a zombie has already exited and only disappears once its
// parent waits on it; confirm that r.cmd is waited on elsewhere, since
// signalling alone does not reap it.
func (r *Runner) checkForZombieProcesses() {
	r.Debugf("checking for zombie processes related to task %s", r.tid.Hex())

	// Nothing to inspect when no PID is recorded or the process is gone.
	if r.pid <= 0 || !utils.ProcessIdExists(r.pid) {
		return
	}

	proc, err := process.NewProcess(int32(r.pid))
	if err != nil {
		return
	}

	status, err := proc.Status()
	if err != nil {
		return
	}

	// Accept both the single-letter and the spelled-out zombie status.
	switch string(status) {
	case "Z", "zombie":
		r.Warnf("detected zombie process %d for task %s", r.pid, r.tid.Hex())
		go r.cleanupOrphanedProcesses()
	}
}