mirror of https://github.com/crawlab-team/crawlab.git
synced 2026-01-28 17:50:56 +01:00
feat: implement synchronization and error handling improvements in task reconciliation and file synchronization

@@ -12,36 +12,38 @@ import (
)

const (
    DefaultWorkspace = "crawlab_workspace"
    DefaultTaskLogPath = "/var/log/crawlab/tasks"
    DefaultServerHost = "0.0.0.0"
    DefaultServerPort = 8000
    DefaultGrpcHost = "localhost"
    DefaultGrpcPort = 9666
    DefaultGrpcServerHost = "0.0.0.0"
    DefaultGrpcServerPort = 9666
    DefaultAuthKey = "Crawlab2024!"
    DefaultApiEndpoint = "http://localhost:8000"
    DefaultApiAllowOrigin = "*"
    DefaultApiAllowCredentials = "true"
    DefaultApiAllowMethods = "DELETE, POST, OPTIONS, GET, PUT"
    DefaultApiAllowHeaders = "Content-Type, Content-Length, Accept-Encoding, X-CSRF-Token, Authorization, accept, origin, Cache-Control, X-Requested-With"
    DefaultApiPort = 8080
    DefaultApiPath = "/api"
    DefaultNodeMaxRunners = 20 // Default max concurrent task runners per node
    DefaultTaskQueueSize = 100 // Default task queue size per node
    DefaultInstallRoot = "/app/install"
    DefaultInstallEnvs = ""
    MetadataConfigDirName = ".crawlab"
    MetadataConfigName = "config.json"
    DefaultPyenvPath = "/root/.pyenv"
    DefaultNodeModulesPath = "/usr/lib/node_modules"
    DefaultNodeBinPath = "/usr/lib/node_bin"
    DefaultGoPath = "/root/go"
    DefaultMCPServerHost = "0.0.0.0"
    DefaultMCPServerPort = 9777
    DefaultMCPClientBaseUrl = "http://localhost:9777/sse"
    DefaultOpenAPIUrlPath = "/openapi.json"
    DefaultSyncDownloadMaxConcurrency = 16
    DefaultMinFileDescriptorLimit = 8192
)

func IsDev() bool {

@@ -332,3 +334,17 @@ func GetOpenAPIUrl() string {
    }
    return GetApiEndpoint() + DefaultOpenAPIUrlPath
}

func GetSyncDownloadMaxConcurrency() int64 {
    if res := viper.GetInt("sync.download.max_concurrency"); res > 0 {
        return int64(res)
    }
    return int64(DefaultSyncDownloadMaxConcurrency)
}

func GetMinFileDescriptorLimit() uint64 {
    if res := viper.GetUint64("system.fd_min"); res > 0 {
        return res
    }
    return DefaultMinFileDescriptorLimit
}
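
The getters above only expose the configured ceilings; how the downloader consumes them is not part of this diff. Below is a minimal, self-contained sketch of one way to bound concurrent sync downloads with a counting semaphore sized by such a getter. The local getSyncDownloadMaxConcurrency stub (returning the default of 16) and downloadOne are illustrative placeholders, not crawlab code:

package main

import (
    "fmt"
    "sync"
)

// getSyncDownloadMaxConcurrency stands in for the configured getter shown in the diff.
func getSyncDownloadMaxConcurrency() int64 { return 16 }

// downloadOne is a placeholder for fetching a single remote file.
func downloadOne(path string) error {
    fmt.Println("downloading", path)
    return nil
}

func main() {
    paths := []string{"spiders/a.py", "spiders/b.py", "spiders/c.py"}

    // Buffered channel used as a counting semaphore: at most N downloads in flight.
    sem := make(chan struct{}, getSyncDownloadMaxConcurrency())
    var wg sync.WaitGroup

    for _, p := range paths {
        wg.Add(1)
        sem <- struct{}{} // acquire a slot before starting the worker
        go func(p string) {
            defer wg.Done()
            defer func() { <-sem }() // release the slot
            if err := downloadOne(p); err != nil {
                fmt.Println("download failed:", p, err)
            }
        }(p)
    }
    wg.Wait()
}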

@@ -7,12 +7,16 @@ import (
    "fmt"
    "io"
    "io/fs"
    "maps"
    "os"
    "path"
    "path/filepath"
    "regexp"
    "sync"
    "time"

    "github.com/crawlab-team/crawlab/core/entity"
    "golang.org/x/sync/singleflight"
)

func OpenFile(fileName string) *os.File {

@@ -184,11 +188,54 @@ func GetFileHash(filePath string) (res string, err error) {
}

const IgnoreFileRegexPattern = `(^node_modules|__pycache__)/|\.(tmp|temp|log|swp|swo|bak|orig|lock|pid|pyc|pyo)$`
const scanDirectoryCacheTTL = 3 * time.Second

var (
    scanDirectoryGroup singleflight.Group
    scanDirectoryCache = struct {
        sync.RWMutex
        items map[string]scanDirectoryCacheEntry
    }{items: make(map[string]scanDirectoryCacheEntry)}
)

type scanDirectoryCacheEntry struct {
    data      entity.FsFileInfoMap
    expiresAt time.Time
}

func ScanDirectory(dir string) (entity.FsFileInfoMap, error) {
    if res, ok := getScanDirectoryCache(dir); ok {
        return cloneFsFileInfoMap(res), nil
    }

    v, err, _ := scanDirectoryGroup.Do(dir, func() (any, error) {
        if res, ok := getScanDirectoryCache(dir); ok {
            return cloneFsFileInfoMap(res), nil
        }

        files, err := scanDirectoryInternal(dir)
        if err != nil {
            return nil, err
        }

        setScanDirectoryCache(dir, files)
        return cloneFsFileInfoMap(files), nil
    })
    if err != nil {
        return nil, err
    }

    res, ok := v.(entity.FsFileInfoMap)
    if !ok {
        return nil, fmt.Errorf("unexpected cache value type: %T", v)
    }

    return cloneFsFileInfoMap(res), nil
}

func scanDirectoryInternal(dir string) (entity.FsFileInfoMap, error) {
    files := make(entity.FsFileInfoMap)

    // Compile the ignore pattern regex
    ignoreRegex, err := regexp.Compile(IgnoreFileRegexPattern)
    if err != nil {
        return nil, fmt.Errorf("failed to compile ignore pattern: %v", err)

@@ -204,7 +251,6 @@ func ScanDirectory(dir string) (res entity.FsFileInfoMap, err error) {
            return err
        }

        // Skip files that match the ignore pattern
        if ignoreRegex.MatchString(relPath) {
            if info.IsDir() {
                return filepath.SkipDir

@@ -239,3 +285,33 @@ func ScanDirectory(dir string) (res entity.FsFileInfoMap, err error) {

    return files, nil
}

func getScanDirectoryCache(dir string) (entity.FsFileInfoMap, bool) {
    scanDirectoryCache.RLock()
    defer scanDirectoryCache.RUnlock()

    entry, ok := scanDirectoryCache.items[dir]
    if !ok || time.Now().After(entry.expiresAt) {
        return nil, false
    }
    return entry.data, true
}

func setScanDirectoryCache(dir string, data entity.FsFileInfoMap) {
    scanDirectoryCache.Lock()
    defer scanDirectoryCache.Unlock()

    scanDirectoryCache.items[dir] = scanDirectoryCacheEntry{
        data:      data,
        expiresAt: time.Now().Add(scanDirectoryCacheTTL),
    }
}

func cloneFsFileInfoMap(src entity.FsFileInfoMap) entity.FsFileInfoMap {
    if src == nil {
        return nil
    }
    dst := make(entity.FsFileInfoMap, len(src))
    maps.Copy(dst, src)
    return dst
}
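
ScanDirectory above pairs golang.org/x/sync/singleflight with a 3-second TTL cache so that a burst of identical scan requests triggers at most one directory walk, and each caller receives its own copy of the result map (via maps.Copy), so later mutation cannot corrupt the cached entry. A minimal, self-contained sketch of the coalescing half of that pattern follows; slowScan and the key are placeholders, not crawlab identifiers:

package main

import (
    "fmt"
    "sync"
    "sync/atomic"
    "time"

    "golang.org/x/sync/singleflight"
)

var (
    group singleflight.Group
    calls int64 // counts how many times the expensive work actually ran
)

// slowScan stands in for an expensive directory walk.
func slowScan(dir string) string {
    atomic.AddInt64(&calls, 1)
    time.Sleep(100 * time.Millisecond)
    return "scanned " + dir
}

func main() {
    var wg sync.WaitGroup
    for i := 0; i < 8; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            // All goroutines ask for the same key; singleflight runs slowScan
            // once and shares the result with every concurrent waiter.
            v, _, shared := group.Do("/workspace/spider-a", func() (any, error) {
                return slowScan("/workspace/spider-a"), nil
            })
            fmt.Println(v, "shared:", shared)
        }()
    }
    wg.Wait()
    fmt.Println("slowScan executions:", atomic.LoadInt64(&calls)) // typically 1
}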

core/utils/rlimit_stub.go (new file, 7 lines)
@@ -0,0 +1,7 @@
//go:build windows || plan9

package utils

func EnsureFileDescriptorLimit(_ uint64) {
    // no-op on unsupported platforms
}

core/utils/rlimit_unix.go (new file, 30 lines)
@@ -0,0 +1,30 @@
//go:build !windows && !plan9

package utils

import "syscall"

func EnsureFileDescriptorLimit(min uint64) {
    var rLimit syscall.Rlimit
    if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &rLimit); err != nil {
        logger.Warnf("failed to get rlimit: %v", err)
        return
    }

    if rLimit.Cur >= min {
        return
    }

    newLimit := min
    if rLimit.Max < newLimit {
        rLimit.Max = newLimit
    }
    rLimit.Cur = newLimit

    if err := syscall.Setrlimit(syscall.RLIMIT_NOFILE, &rLimit); err != nil {
        logger.Warnf("failed to raise rlimit to %d: %v", newLimit, err)
        return
    }

    logger.Infof("increased file descriptor limit to %d", newLimit)
}
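
EnsureFileDescriptorLimit is presumably meant to be called once at process startup with GetMinFileDescriptorLimit() as its argument; that wiring is not shown in this excerpt. The following self-contained, Unix-only sketch checks the effect by reading RLIMIT_NOFILE before and after raising it, using only the syscall calls seen above; the target value 8192 mirrors DefaultMinFileDescriptorLimit, everything else is illustrative:

//go:build !windows && !plan9

package main

import (
    "fmt"
    "syscall"
)

func main() {
    var before syscall.Rlimit
    if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &before); err != nil {
        fmt.Println("getrlimit failed:", err)
        return
    }
    fmt.Printf("current soft limit: %d, hard limit: %d\n", before.Cur, before.Max)

    // Mirror the logic of EnsureFileDescriptorLimit: only raise, never lower.
    target := uint64(8192)
    if before.Cur >= target {
        fmt.Println("soft limit already sufficient")
        return
    }

    limit := before
    if limit.Max < target {
        limit.Max = target // raising the hard limit may require privileges
    }
    limit.Cur = target
    if err := syscall.Setrlimit(syscall.RLIMIT_NOFILE, &limit); err != nil {
        fmt.Println("setrlimit failed:", err)
        return
    }

    var after syscall.Rlimit
    _ = syscall.Getrlimit(syscall.RLIMIT_NOFILE, &after)
    fmt.Printf("raised soft limit to: %d\n", after.Cur)
}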