fix: runner cancel issue

This commit is contained in:
Marvin Zhang
2024-11-22 13:32:27 +08:00
parent eee10ea08c
commit 8357dc6f30
4 changed files with 110 additions and 34 deletions

View File

@@ -6,8 +6,6 @@ import (
"encoding/json"
"errors"
"fmt"
"github.com/crawlab-team/crawlab/core/fs"
"github.com/hashicorp/go-multierror"
"io"
"net/http"
"os"
@@ -17,6 +15,9 @@ import (
"sync"
"time"
"github.com/crawlab-team/crawlab/core/fs"
"github.com/hashicorp/go-multierror"
"github.com/crawlab-team/crawlab/core/models/models"
"github.com/apex/log"
@@ -187,19 +188,22 @@ func (r *Runner) Cancel(force bool) (err error) {
return err
}
// Wait for process to be killed and goroutines to stop
ticker := time.NewTicker(time.Second)
// Create a context with timeout
ctx, cancel := context.WithTimeout(context.Background(), r.svc.GetCancelTimeout())
defer cancel()
// Wait for process to be killed with context
ticker := time.NewTicker(100 * time.Millisecond)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if utils.ProcessIdExists(r.pid) {
continue
}
return nil
case <-time.After(r.svc.GetCancelTimeout()):
// timeout
case <-ctx.Done():
return fmt.Errorf("timeout waiting for task to stop")
case <-ticker.C:
if !utils.ProcessIdExists(r.pid) {
return nil
}
}
}
}

View File

@@ -1,14 +1,17 @@
package handler
import (
"bufio"
"encoding/json"
"fmt"
"github.com/apex/log"
"github.com/crawlab-team/crawlab/core/utils"
"io"
"runtime"
"testing"
"time"
"github.com/apex/log"
"github.com/crawlab-team/crawlab/core/utils"
"github.com/crawlab-team/crawlab/core/constants"
"github.com/crawlab-team/crawlab/core/models/models"
"github.com/crawlab-team/crawlab/core/models/service"
@@ -38,7 +41,12 @@ func setupTest(t *testing.T) *Runner {
Type: "test",
Mode: "test",
NodeId: primitive.NewObjectID(),
Cmd: "python script.py",
}
switch runtime.GOOS {
case "windows":
task.Cmd = "ping -n 10 127.0.0.1"
default: // linux and darwin (macOS)
task.Cmd = "sleep 10"
}
taskId, err := service.NewModelService[models.Task]().InsertOne(*task)
require.NoError(t, err)
@@ -119,21 +127,50 @@ func TestRunner_Cancel(t *testing.T) {
// Setup
runner := setupTest(t)
// Start a long-running command
runner.t.Cmd = "sleep 10"
// Create pipes for stdout
pr, pw := io.Pipe()
runner.cmd.Stdout = pw
runner.cmd.Stderr = pw
// Start the command
err := runner.cmd.Start()
assert.NoError(t, err)
log.Infof("started process with PID: %d", runner.cmd.Process.Pid)
runner.pid = runner.cmd.Process.Pid
// Read and print command output
go func() {
scanner := bufio.NewScanner(pr)
for scanner.Scan() {
log.Info(scanner.Text())
}
}()
// Wait a bit longer on Windows for the process to start properly
waitTime := 100 * time.Millisecond
if runtime.GOOS == "windows" {
waitTime = 1 * time.Second
}
time.Sleep(waitTime)
// Verify process exists before attempting to cancel
if !utils.ProcessIdExists(runner.pid) {
t.Fatalf("Process with PID %d was not started successfully", runner.pid)
}
// Test cancel
go func() {
err = runner.Cancel(true)
assert.NoError(t, err)
}()
// Verify process was killed
// Wait a short time for the process to be killed
time.Sleep(100 * time.Millisecond)
exists := utils.ProcessIdExists(runner.pid)
assert.False(t, exists)
// Wait for process to be killed, with shorter timeout
deadline := time.Now().Add(5 * time.Second)
for time.Now().Before(deadline) {
if !utils.ProcessIdExists(runner.pid) {
return // Process was killed
}
time.Sleep(100 * time.Millisecond)
}
t.Errorf("Process with PID %d was not killed within timeout", runner.pid)
}