mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-21 17:21:09 +01:00
- Added service_operations.go for task management including run, cancel, and execution logic. - Introduced stream_manager.go to handle task streams and manage cancellation signals. - Created worker_pool.go to manage a bounded pool of workers for executing tasks concurrently. - Implemented graceful shutdown and cleanup mechanisms for task runners and streams. - Enhanced error handling and logging throughout the task management process.
180 lines
4.6 KiB
Go
180 lines
4.6 KiB
Go
package handler
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"runtime"
|
|
"strings"
|
|
"syscall"
|
|
|
|
"github.com/crawlab-team/crawlab/core/entity"
|
|
"github.com/crawlab-team/crawlab/core/models/client"
|
|
"github.com/crawlab-team/crawlab/core/models/models"
|
|
"github.com/crawlab-team/crawlab/core/utils"
|
|
)
|
|
|
|
// configurePythonPath sets up the Python environment paths, handling both pyenv and default installations
|
|
func (r *Runner) configurePythonPath() {
|
|
// Configure global node_modules path
|
|
pyenvRoot := utils.GetPyenvPath()
|
|
pyenvShimsPath := pyenvRoot + "/shims"
|
|
pyenvBinPath := pyenvRoot + "/bin"
|
|
|
|
// Configure global pyenv path
|
|
_ = os.Setenv("PYENV_ROOT", pyenvRoot)
|
|
_ = os.Setenv("PATH", pyenvShimsPath+":"+os.Getenv("PATH"))
|
|
_ = os.Setenv("PATH", pyenvBinPath+":"+os.Getenv("PATH"))
|
|
}
|
|
|
|
// configureNodePath sets up the Node.js environment paths, handling both nvm and default installations
|
|
func (r *Runner) configureNodePath() {
|
|
// Configure nvm-based Node.js paths
|
|
envPath := os.Getenv("PATH")
|
|
|
|
// Configure global node_modules path
|
|
nodePath := utils.GetNodeModulesPath()
|
|
if !strings.Contains(envPath, nodePath) {
|
|
_ = os.Setenv("PATH", nodePath+":"+envPath)
|
|
}
|
|
_ = os.Setenv("NODE_PATH", nodePath)
|
|
|
|
// Configure global node_bin path
|
|
nodeBinPath := utils.GetNodeBinPath()
|
|
if !strings.Contains(envPath, nodeBinPath) {
|
|
_ = os.Setenv("PATH", nodeBinPath+":"+os.Getenv("PATH"))
|
|
}
|
|
}
|
|
|
|
func (r *Runner) configureGoPath() {
|
|
// Configure global go path
|
|
goPath := utils.GetGoPath()
|
|
if goPath != "" {
|
|
_ = os.Setenv("GOPATH", goPath)
|
|
}
|
|
}
|
|
|
|
// configureEnv sets up the environment variables for the task process, including:
|
|
// - Node.js paths
|
|
// - Crawlab-specific variables
|
|
// - Global environment variables from the system
|
|
func (r *Runner) configureEnv() {
|
|
// Configure Python path
|
|
r.configurePythonPath()
|
|
|
|
// Configure Node.js path
|
|
r.configureNodePath()
|
|
|
|
// Configure Go path
|
|
r.configureGoPath()
|
|
|
|
// Default envs
|
|
r.cmd.Env = os.Environ()
|
|
|
|
// Remove CRAWLAB_ prefixed environment variables
|
|
for i := 0; i < len(r.cmd.Env); i++ {
|
|
env := r.cmd.Env[i]
|
|
if strings.HasPrefix(env, "CRAWLAB_") {
|
|
r.cmd.Env = append(r.cmd.Env[:i], r.cmd.Env[i+1:]...)
|
|
i--
|
|
}
|
|
}
|
|
|
|
// Task-specific environment variables
|
|
r.cmd.Env = append(r.cmd.Env, "CRAWLAB_TASK_ID="+r.tid.Hex())
|
|
|
|
// Global environment variables
|
|
envs, err := client.NewModelService[models.Environment]().GetMany(nil, nil)
|
|
if err != nil {
|
|
r.Errorf("failed to get environments: %v", err)
|
|
}
|
|
for _, env := range envs {
|
|
r.cmd.Env = append(r.cmd.Env, env.Key+"="+env.Value)
|
|
}
|
|
|
|
// Add environment variable for child processes to identify they're running under Crawlab
|
|
r.cmd.Env = append(r.cmd.Env, "CRAWLAB_PARENT_PID="+fmt.Sprint(os.Getpid()))
|
|
}
|
|
|
|
// configureCwd sets the working directory for the task based on the spider's configuration
|
|
func (r *Runner) configureCwd() {
|
|
workspacePath := utils.GetWorkspace()
|
|
if r.s.GitId.IsZero() {
|
|
// not git
|
|
r.cwd = filepath.Join(workspacePath, r.s.Id.Hex())
|
|
} else {
|
|
// git
|
|
r.cwd = filepath.Join(workspacePath, r.s.GitId.Hex(), r.s.GitRootPath)
|
|
}
|
|
}
|
|
|
|
// configureCmd builds and configures the command to be executed, including setting up IPC pipes
|
|
// and processing command parameters
|
|
func (r *Runner) configureCmd() (err error) {
|
|
var cmdStr string
|
|
|
|
// command
|
|
if r.t.Cmd == "" {
|
|
cmdStr = r.s.Cmd
|
|
} else {
|
|
cmdStr = r.t.Cmd
|
|
}
|
|
|
|
// parameters
|
|
if r.t.Param != "" {
|
|
cmdStr += " " + r.t.Param
|
|
} else if r.s.Param != "" {
|
|
cmdStr += " " + r.s.Param
|
|
}
|
|
|
|
// get cmd instance
|
|
r.cmd, err = utils.BuildCmd(cmdStr)
|
|
if err != nil {
|
|
r.Errorf("error building command: %v", err)
|
|
return err
|
|
}
|
|
|
|
// set working directory
|
|
r.cmd.Dir = r.cwd
|
|
|
|
// ZOMBIE PREVENTION: Set process group to enable proper cleanup of child processes
|
|
if runtime.GOOS != "windows" {
|
|
// Create new process group on Unix systems to ensure child processes can be killed together
|
|
r.cmd.SysProcAttr = &syscall.SysProcAttr{
|
|
Setpgid: true, // Create new process group
|
|
Pgid: 0, // Use process ID as process group ID
|
|
}
|
|
}
|
|
|
|
// Configure pipes for IPC and logs
|
|
r.stdinPipe, err = r.cmd.StdinPipe()
|
|
if err != nil {
|
|
r.Errorf("error creating stdin pipe: %v", err)
|
|
return err
|
|
}
|
|
|
|
// Add stdout pipe for IPC and logs
|
|
r.stdoutPipe, err = r.cmd.StdoutPipe()
|
|
if err != nil {
|
|
r.Errorf("error creating stdout pipe: %v", err)
|
|
return err
|
|
}
|
|
|
|
// Add stderr pipe for error logs
|
|
stderrPipe, err := r.cmd.StderrPipe()
|
|
if err != nil {
|
|
r.Errorf("error creating stderr pipe: %v", err)
|
|
return err
|
|
}
|
|
|
|
// Create buffered readers
|
|
r.readerStdout = bufio.NewReader(r.stdoutPipe)
|
|
r.readerStderr = bufio.NewReader(stderrPipe)
|
|
|
|
// Initialize IPC channel
|
|
r.ipcChan = make(chan entity.IPCMessage)
|
|
|
|
return nil
|
|
}
|