feat: implement synchronization and error handling improvements in task reconciliation and file synchronization

This commit is contained in:
Marvin Zhang
2025-09-28 17:42:23 +08:00
parent e80256aa61
commit 29ef8d67da
7 changed files with 334 additions and 46 deletions

View File

@@ -12,36 +12,38 @@ import (
)
// Built-in default values and well-known names used by the configuration helpers.
const (
DefaultWorkspace = "crawlab_workspace"
DefaultTaskLogPath = "/var/log/crawlab/tasks"
DefaultServerHost = "0.0.0.0"
DefaultServerPort = 8000
DefaultGrpcHost = "localhost"
DefaultGrpcPort = 9666
DefaultGrpcServerHost = "0.0.0.0"
DefaultGrpcServerPort = 9666
DefaultAuthKey = "Crawlab2024!"
DefaultApiEndpoint = "http://localhost:8000"
DefaultApiAllowOrigin = "*"
DefaultApiAllowCredentials = "true"
DefaultApiAllowMethods = "DELETE, POST, OPTIONS, GET, PUT"
DefaultApiAllowHeaders = "Content-Type, Content-Length, Accept-Encoding, X-CSRF-Token, Authorization, accept, origin, Cache-Control, X-Requested-With"
DefaultApiPort = 8080
DefaultApiPath = "/api"
DefaultNodeMaxRunners = 20 // Default max concurrent task runners per node
DefaultTaskQueueSize = 100 // Default task queue size per node
DefaultInstallRoot = "/app/install"
DefaultInstallEnvs = ""
MetadataConfigDirName = ".crawlab"
MetadataConfigName = "config.json"
DefaultPyenvPath = "/root/.pyenv"
DefaultNodeModulesPath = "/usr/lib/node_modules"
DefaultNodeBinPath = "/usr/lib/node_bin"
DefaultGoPath = "/root/go"
DefaultMCPServerHost = "0.0.0.0"
DefaultMCPServerPort = 9777
DefaultMCPClientBaseUrl = "http://localhost:9777/sse"
DefaultOpenAPIUrlPath = "/openapi.json"
DefaultWorkspace = "crawlab_workspace"
DefaultTaskLogPath = "/var/log/crawlab/tasks"
DefaultServerHost = "0.0.0.0"
DefaultServerPort = 8000
DefaultGrpcHost = "localhost"
DefaultGrpcPort = 9666
DefaultGrpcServerHost = "0.0.0.0"
DefaultGrpcServerPort = 9666
DefaultAuthKey = "Crawlab2024!"
DefaultApiEndpoint = "http://localhost:8000"
DefaultApiAllowOrigin = "*"
DefaultApiAllowCredentials = "true"
DefaultApiAllowMethods = "DELETE, POST, OPTIONS, GET, PUT"
DefaultApiAllowHeaders = "Content-Type, Content-Length, Accept-Encoding, X-CSRF-Token, Authorization, accept, origin, Cache-Control, X-Requested-With"
DefaultApiPort = 8080
DefaultApiPath = "/api"
DefaultNodeMaxRunners = 20 // Default max concurrent task runners per node
DefaultTaskQueueSize = 100 // Default task queue size per node
DefaultInstallRoot = "/app/install"
DefaultInstallEnvs = ""
MetadataConfigDirName = ".crawlab"
MetadataConfigName = "config.json"
DefaultPyenvPath = "/root/.pyenv"
DefaultNodeModulesPath = "/usr/lib/node_modules"
DefaultNodeBinPath = "/usr/lib/node_bin"
DefaultGoPath = "/root/go"
DefaultMCPServerHost = "0.0.0.0"
DefaultMCPServerPort = 9777
DefaultMCPClientBaseUrl = "http://localhost:9777/sse"
DefaultOpenAPIUrlPath = "/openapi.json"
// Fallback returned by GetSyncDownloadMaxConcurrency when "sync.download.max_concurrency" is not a positive value.
DefaultSyncDownloadMaxConcurrency = 16
// Fallback returned by GetMinFileDescriptorLimit when "system.fd_min" is zero or unset.
DefaultMinFileDescriptorLimit = 8192
)
func IsDev() bool {
@@ -332,3 +334,17 @@ func GetOpenAPIUrl() string {
}
return GetApiEndpoint() + DefaultOpenAPIUrlPath
}
// GetSyncDownloadMaxConcurrency returns the value configured under
// "sync.download.max_concurrency" when it is positive, and
// DefaultSyncDownloadMaxConcurrency otherwise.
func GetSyncDownloadMaxConcurrency() int64 {
	configured := viper.GetInt("sync.download.max_concurrency")
	if configured <= 0 {
		// Unset, zero or negative: use the built-in default.
		return int64(DefaultSyncDownloadMaxConcurrency)
	}
	return int64(configured)
}
// GetMinFileDescriptorLimit returns the value configured under "system.fd_min"
// when it is non-zero, and DefaultMinFileDescriptorLimit otherwise.
func GetMinFileDescriptorLimit() uint64 {
	configured := viper.GetUint64("system.fd_min")
	if configured == 0 {
		// Unset or zero: use the built-in default.
		return DefaultMinFileDescriptorLimit
	}
	return configured
}

View File

@@ -7,12 +7,16 @@ import (
"fmt"
"io"
"io/fs"
"maps"
"os"
"path"
"path/filepath"
"regexp"
"sync"
"time"
"github.com/crawlab-team/crawlab/core/entity"
"golang.org/x/sync/singleflight"
)
func OpenFile(fileName string) *os.File {
@@ -184,11 +188,54 @@ func GetFileHash(filePath string) (res string, err error) {
}
// IgnoreFileRegexPattern is the regular expression source used to decide which
// scanned paths are ignored. It matches a path that starts with "node_modules/",
// a path containing "__pycache__/", or a path ending in one of the listed
// extensions (.tmp, .temp, .log, .swp, .swo, .bak, .orig, .lock, .pid, .pyc, .pyo).
const IgnoreFileRegexPattern = `(^node_modules|__pycache__)/|\.(tmp|temp|log|swp|swo|bak|orig|lock|pid|pyc|pyo)$`
// scanDirectoryCacheTTL is how long a cached scan result stays valid after it is stored.
const scanDirectoryCacheTTL = 3 * time.Second
func ScanDirectory(dir string) (res entity.FsFileInfoMap, err error) {
var (
// scanDirectoryGroup collapses concurrent ScanDirectory calls for the same
// directory key into a single execution.
scanDirectoryGroup singleflight.Group
// scanDirectoryCache holds recent scan results keyed by directory; the
// embedded RWMutex guards access to items.
scanDirectoryCache = struct {
sync.RWMutex
items map[string]scanDirectoryCacheEntry
}{items: make(map[string]scanDirectoryCacheEntry)}
)
// scanDirectoryCacheEntry is one cached scan result together with its expiry time.
type scanDirectoryCacheEntry struct {
// data is the file map stored for the directory.
data entity.FsFileInfoMap
// expiresAt is the instant after which the entry is treated as a cache miss.
expiresAt time.Time
}
// ScanDirectory returns the file map for dir.
//
// Results are cached per directory for scanDirectoryCacheTTL, and concurrent
// scans of the same directory are collapsed into one scan through
// scanDirectoryGroup. Every caller receives its own copy of the map (made by
// cloneFsFileInfoMap), so adding or removing keys in the returned map does not
// affect the cache or other callers.
//
// It returns an error when the underlying scan fails or when the shared
// singleflight result has an unexpected type.
func ScanDirectory(dir string) (entity.FsFileInfoMap, error) {
	// Fast path: a fresh cache entry avoids entering singleflight at all.
	if res, ok := getScanDirectoryCache(dir); ok {
		return cloneFsFileInfoMap(res), nil
	}

	v, err, _ := scanDirectoryGroup.Do(dir, func() (any, error) {
		// Re-check inside the flight: a previous flight may have filled the
		// cache between the fast-path miss and this call.
		if res, ok := getScanDirectoryCache(dir); ok {
			return res, nil
		}

		files, err := scanDirectoryInternal(dir)
		if err != nil {
			return nil, err
		}
		setScanDirectoryCache(dir, files)

		// The map is returned without cloning: it is only ever read from here
		// on, and each caller clones it below. Cloning here as well would copy
		// the whole map twice per scan.
		return files, nil
	})
	if err != nil {
		return nil, err
	}

	res, ok := v.(entity.FsFileInfoMap)
	if !ok {
		return nil, fmt.Errorf("unexpected cache value type: %T", v)
	}

	// The singleflight result may be shared by several callers; hand each one
	// an independent copy.
	return cloneFsFileInfoMap(res), nil
}
func scanDirectoryInternal(dir string) (entity.FsFileInfoMap, error) {
files := make(entity.FsFileInfoMap)
// Compile the ignore pattern regex
ignoreRegex, err := regexp.Compile(IgnoreFileRegexPattern)
if err != nil {
return nil, fmt.Errorf("failed to compile ignore pattern: %v", err)
@@ -204,7 +251,6 @@ func ScanDirectory(dir string) (res entity.FsFileInfoMap, err error) {
return err
}
// Skip files that match the ignore pattern
if ignoreRegex.MatchString(relPath) {
if info.IsDir() {
return filepath.SkipDir
@@ -239,3 +285,33 @@ func ScanDirectory(dir string) (res entity.FsFileInfoMap, err error) {
return files, nil
}
// getScanDirectoryCache looks up dir in the scan cache under the read lock.
// It returns the stored map and true when an entry exists and has not passed
// its expiry time; otherwise it returns nil and false. The returned map is the
// cached instance itself, not a copy.
func getScanDirectoryCache(dir string) (entity.FsFileInfoMap, bool) {
	scanDirectoryCache.RLock()
	cached, found := scanDirectoryCache.items[dir]
	fresh := found && !time.Now().After(cached.expiresAt)
	scanDirectoryCache.RUnlock()

	if fresh {
		return cached.data, true
	}
	return nil, false
}
// setScanDirectoryCache stores data as the cached scan result for dir, valid
// for scanDirectoryCacheTTL from now.
//
// Before inserting, it removes every entry whose expiry time has passed.
// Lookups never delete stale entries, so without this sweep the cache would
// keep a full file map for every directory ever scanned.
func setScanDirectoryCache(dir string, data entity.FsFileInfoMap) {
	scanDirectoryCache.Lock()
	defer scanDirectoryCache.Unlock()

	now := time.Now()

	// Drop expired entries; deleting during range is safe in Go.
	for key, entry := range scanDirectoryCache.items {
		if now.After(entry.expiresAt) {
			delete(scanDirectoryCache.items, key)
		}
	}

	scanDirectoryCache.items[dir] = scanDirectoryCacheEntry{
		data:      data,
		expiresAt: now.Add(scanDirectoryCacheTTL),
	}
}
// cloneFsFileInfoMap returns a shallow copy of src: a new map holding the same
// keys and values. A nil src yields nil.
func cloneFsFileInfoMap(src entity.FsFileInfoMap) entity.FsFileInfoMap {
	// maps.Clone preserves nil and copies entries by ordinary assignment.
	return maps.Clone(src)
}

View File

@@ -0,0 +1,7 @@
//go:build windows || plan9
package utils
// EnsureFileDescriptorLimit does nothing in this build (windows or plan9); its
// argument is ignored.
func EnsureFileDescriptorLimit(_ uint64) {
// no-op on unsupported platforms
}

30
core/utils/rlimit_unix.go Normal file
View File

@@ -0,0 +1,30 @@
//go:build !windows && !plan9
package utils
import "syscall"
// EnsureFileDescriptorLimit makes a best-effort attempt to raise the process's
// soft RLIMIT_NOFILE limit to at least min.
//
// If the current soft limit already meets min, nothing is changed. Otherwise
// it first tries to set the soft limit to min, raising the hard limit too when
// that is lower. Raising the hard limit normally requires elevated privileges,
// so when that attempt fails and the hard limit is below min, it falls back to
// raising the soft limit as far as the existing hard limit allows.
//
// Failures are logged as warnings and never returned.
func EnsureFileDescriptorLimit(min uint64) {
	var rLimit syscall.Rlimit
	if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &rLimit); err != nil {
		logger.Warnf("failed to get rlimit: %v", err)
		return
	}

	if rLimit.Cur >= min {
		return
	}

	// Preferred outcome: soft limit == min, lifting the hard limit if needed.
	desired := rLimit
	desired.Cur = min
	if desired.Max < min {
		desired.Max = min
	}
	err := syscall.Setrlimit(syscall.RLIMIT_NOFILE, &desired)
	if err == nil {
		logger.Infof("increased file descriptor limit to %d", min)
		return
	}

	// Fallback: the hard limit could not be lifted, but the soft limit can
	// still go up to the existing hard limit without extra privileges.
	if rLimit.Max < min && rLimit.Cur < rLimit.Max {
		fallback := rLimit
		fallback.Cur = rLimit.Max
		if fbErr := syscall.Setrlimit(syscall.RLIMIT_NOFILE, &fallback); fbErr == nil {
			logger.Warnf("failed to raise rlimit to %d: %v; increased file descriptor limit to hard limit %d instead", min, err, fallback.Cur)
			return
		}
	}

	logger.Warnf("failed to raise rlimit to %d: %v", min, err)
}