mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-30 18:00:56 +01:00
feat: implement zombie process prevention and cleanup mechanisms in task runner
This commit is contained in:
@@ -15,11 +15,13 @@ import (
|
||||
"runtime"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/crawlab-team/crawlab/core/dependency"
|
||||
"github.com/crawlab-team/crawlab/core/fs"
|
||||
"github.com/hashicorp/go-multierror"
|
||||
"github.com/shirou/gopsutil/process"
|
||||
|
||||
"github.com/crawlab-team/crawlab/core/models/models"
|
||||
|
||||
@@ -219,6 +221,9 @@ func (r *Runner) Run() (err error) {
|
||||
// Start IPC handler
|
||||
go r.handleIPC()
|
||||
|
||||
// ZOMBIE PREVENTION: Start zombie process monitor
|
||||
go r.startZombieMonitor()
|
||||
|
||||
// Ensure cleanup when Run() exits
|
||||
defer func() {
|
||||
// 1. Signal all goroutines to stop
|
||||
@@ -336,6 +341,15 @@ func (r *Runner) configureCmd() (err error) {
|
||||
// set working directory
|
||||
r.cmd.Dir = r.cwd
|
||||
|
||||
// ZOMBIE PREVENTION: Set process group to enable proper cleanup of child processes
|
||||
if runtime.GOOS != "windows" {
|
||||
// Create new process group on Unix systems to ensure child processes can be killed together
|
||||
r.cmd.SysProcAttr = &syscall.SysProcAttr{
|
||||
Setpgid: true, // Create new process group
|
||||
Pgid: 0, // Use process ID as process group ID
|
||||
}
|
||||
}
|
||||
|
||||
// Configure pipes for IPC and logs
|
||||
r.stdinPipe, err = r.cmd.StdinPipe()
|
||||
if err != nil {
|
||||
@@ -727,6 +741,8 @@ func (r *Runner) wait() (err error) {
|
||||
case constants.TaskSignalLost:
|
||||
err = constants.ErrTaskLost
|
||||
status = constants.TaskStatusError
|
||||
// ZOMBIE PREVENTION: Clean up any remaining processes when task is lost
|
||||
go r.cleanupOrphanedProcesses()
|
||||
default:
|
||||
err = constants.ErrInvalidSignal
|
||||
status = constants.TaskStatusError
|
||||
@@ -1492,3 +1508,151 @@ func (r *Runner) GetConnectionStats() map[string]interface{} {
|
||||
"connection_exists": r.conn != nil,
|
||||
}
|
||||
}
|
||||
|
||||
// ZOMBIE PROCESS PREVENTION METHODS
|
||||
|
||||
// cleanupOrphanedProcesses attempts to clean up any orphaned processes related to this task
|
||||
func (r *Runner) cleanupOrphanedProcesses() {
|
||||
r.Warnf("cleaning up orphaned processes for task %s (PID: %d)", r.tid.Hex(), r.pid)
|
||||
|
||||
if r.pid <= 0 {
|
||||
r.Debugf("no PID to clean up")
|
||||
return
|
||||
}
|
||||
|
||||
// Try to kill the process group if it exists
|
||||
if runtime.GOOS != "windows" {
|
||||
r.killProcessGroup()
|
||||
}
|
||||
|
||||
// Force kill the main process if it still exists
|
||||
if utils.ProcessIdExists(r.pid) {
|
||||
r.Warnf("forcefully killing remaining process %d", r.pid)
|
||||
if r.cmd != nil && r.cmd.Process != nil {
|
||||
if err := utils.KillProcess(r.cmd, true); err != nil {
|
||||
r.Errorf("failed to force kill process: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Scan for any remaining child processes and kill them
|
||||
r.scanAndKillChildProcesses()
|
||||
}
|
||||
|
||||
// killProcessGroup kills the entire process group on Unix systems
|
||||
func (r *Runner) killProcessGroup() {
|
||||
if r.pid <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
r.Debugf("attempting to kill process group for PID %d", r.pid)
|
||||
|
||||
// Kill the process group (negative PID kills the group)
|
||||
err := syscall.Kill(-r.pid, syscall.SIGTERM)
|
||||
if err != nil {
|
||||
r.Debugf("failed to send SIGTERM to process group: %v", err)
|
||||
// Try SIGKILL as last resort
|
||||
err = syscall.Kill(-r.pid, syscall.SIGKILL)
|
||||
if err != nil {
|
||||
r.Debugf("failed to send SIGKILL to process group: %v", err)
|
||||
}
|
||||
} else {
|
||||
r.Debugf("successfully sent SIGTERM to process group %d", r.pid)
|
||||
}
|
||||
}
|
||||
|
||||
// scanAndKillChildProcesses scans for and kills any remaining child processes
|
||||
func (r *Runner) scanAndKillChildProcesses() {
|
||||
r.Debugf("scanning for orphaned child processes of task %s", r.tid.Hex())
|
||||
|
||||
processes, err := utils.GetProcesses()
|
||||
if err != nil {
|
||||
r.Errorf("failed to get process list: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
taskIdEnv := "CRAWLAB_TASK_ID=" + r.tid.Hex()
|
||||
killedCount := 0
|
||||
|
||||
for _, proc := range processes {
|
||||
// Check if this process has our task ID in its environment
|
||||
if r.isTaskRelatedProcess(proc, taskIdEnv) {
|
||||
pid := int(proc.Pid)
|
||||
r.Warnf("found orphaned task process PID %d, killing it", pid)
|
||||
|
||||
// Kill the orphaned process
|
||||
if err := proc.Kill(); err != nil {
|
||||
r.Errorf("failed to kill orphaned process %d: %v", pid, err)
|
||||
} else {
|
||||
killedCount++
|
||||
r.Infof("successfully killed orphaned process %d", pid)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if killedCount > 0 {
|
||||
r.Infof("cleaned up %d orphaned processes for task %s", killedCount, r.tid.Hex())
|
||||
} else {
|
||||
r.Debugf("no orphaned processes found for task %s", r.tid.Hex())
|
||||
}
|
||||
}
|
||||
|
||||
// isTaskRelatedProcess checks if a process is related to this task
|
||||
func (r *Runner) isTaskRelatedProcess(proc *process.Process, taskIdEnv string) bool {
|
||||
// Get process environment variables
|
||||
environ, err := proc.Environ()
|
||||
if err != nil {
|
||||
// If we can't read environment, skip this process
|
||||
return false
|
||||
}
|
||||
|
||||
// Check if this process has our task ID
|
||||
for _, env := range environ {
|
||||
if env == taskIdEnv {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// startZombieMonitor starts a background goroutine to monitor for zombie processes
|
||||
func (r *Runner) startZombieMonitor() {
|
||||
r.wg.Add(1)
|
||||
go func() {
|
||||
defer r.wg.Done()
|
||||
|
||||
// Check for zombies every 5 minutes
|
||||
ticker := time.NewTicker(5 * time.Minute)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-r.ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
r.checkForZombieProcesses()
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// checkForZombieProcesses periodically checks for and cleans up zombie processes
|
||||
func (r *Runner) checkForZombieProcesses() {
|
||||
r.Debugf("checking for zombie processes related to task %s", r.tid.Hex())
|
||||
|
||||
// Check if our main process still exists and is in the expected state
|
||||
if r.pid > 0 && utils.ProcessIdExists(r.pid) {
|
||||
// Process exists, check if it's a zombie
|
||||
if proc, err := process.NewProcess(int32(r.pid)); err == nil {
|
||||
if status, err := proc.Status(); err == nil {
|
||||
// Status returns a string, check if it indicates zombie
|
||||
statusStr := string(status)
|
||||
if statusStr == "Z" || statusStr == "zombie" {
|
||||
r.Warnf("detected zombie process %d for task %s", r.pid, r.tid.Hex())
|
||||
go r.cleanupOrphanedProcesses()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user