mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-21 17:21:09 +01:00
fix(spider): optimize form update logic to watch specific fields for changes; fix(grpc): adjust sync request ID handling for git and regular spiders
249 lines
6.2 KiB
Go
249 lines
6.2 KiB
Go
package handler
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"time"
|
|
|
|
"github.com/crawlab-team/crawlab/core/entity"
|
|
client2 "github.com/crawlab-team/crawlab/core/grpc/client"
|
|
"github.com/crawlab-team/crawlab/core/utils"
|
|
grpc2 "github.com/crawlab-team/crawlab/grpc"
|
|
"google.golang.org/grpc/codes"
|
|
"google.golang.org/grpc/status"
|
|
)
|
|
|
|
// syncFilesGRPC replaces HTTP-based syncFiles() with gRPC streaming
|
|
func (r *Runner) syncFilesGRPC() (err error) {
|
|
r.Infof("starting gRPC file synchronization for spider: %s", r.s.Id.Hex())
|
|
|
|
workingDir := ""
|
|
if !r.s.GitId.IsZero() {
|
|
workingDir = r.s.GitRootPath
|
|
r.Debugf("using git root path: %s", workingDir)
|
|
}
|
|
|
|
// Get sync service client
|
|
syncClient, err := client2.GetGrpcClient().GetSyncClient()
|
|
if err != nil {
|
|
r.Errorf("failed to get sync client: %v", err)
|
|
return err
|
|
}
|
|
|
|
// Determine the ID to use for sync request
|
|
// For git spiders, use GitId (where files are stored)
|
|
// For regular spiders, use SpiderId
|
|
var syncId string
|
|
if r.s.GitId.IsZero() {
|
|
syncId = r.s.Id.Hex()
|
|
} else {
|
|
syncId = r.s.GitId.Hex()
|
|
}
|
|
|
|
// Prepare request
|
|
req := &grpc2.FileSyncRequest{
|
|
SpiderId: syncId,
|
|
Path: workingDir,
|
|
NodeKey: utils.GetNodeKey(),
|
|
}
|
|
|
|
// Create context with timeout
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
|
defer cancel()
|
|
|
|
// Stream file list from master
|
|
r.Infof("fetching file list from master via gRPC")
|
|
stream, err := syncClient.StreamFileScan(ctx, req)
|
|
if err != nil {
|
|
r.Errorf("failed to start file scan stream: %v", err)
|
|
return err
|
|
}
|
|
|
|
// Receive file list in chunks
|
|
masterFilesMap := make(entity.FsFileInfoMap)
|
|
totalFiles := 0
|
|
|
|
for {
|
|
chunk, err := stream.Recv()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
// Check for gRPC-specific errors
|
|
if st, ok := status.FromError(err); ok {
|
|
if st.Code() == codes.Unavailable {
|
|
r.Errorf("gRPC service unavailable, will retry: %v", err)
|
|
return fmt.Errorf("gRPC service unavailable: %w", err)
|
|
}
|
|
}
|
|
r.Errorf("error receiving file scan chunk: %v", err)
|
|
return err
|
|
}
|
|
|
|
// Check for error in chunk
|
|
if chunk.Error != "" {
|
|
r.Errorf("server error during file scan: %s", chunk.Error)
|
|
return fmt.Errorf("server error: %s", chunk.Error)
|
|
}
|
|
|
|
// Process files in chunk
|
|
for _, fileInfo := range chunk.Files {
|
|
fsFileInfo := entity.FsFileInfo{
|
|
Name: fileInfo.Name,
|
|
Path: fileInfo.Path,
|
|
FullPath: fileInfo.FullPath,
|
|
Extension: fileInfo.Extension,
|
|
IsDir: fileInfo.IsDir,
|
|
FileSize: fileInfo.FileSize,
|
|
ModTime: time.Unix(fileInfo.ModTime, 0),
|
|
Mode: os.FileMode(fileInfo.Mode),
|
|
Hash: fileInfo.Hash,
|
|
}
|
|
masterFilesMap[fileInfo.Path] = fsFileInfo
|
|
}
|
|
|
|
if chunk.IsComplete {
|
|
totalFiles = int(chunk.TotalFiles)
|
|
r.Infof("received complete file list: %d files", totalFiles)
|
|
break
|
|
}
|
|
}
|
|
|
|
// Create working directory if not exists
|
|
if _, err := os.Stat(r.cwd); os.IsNotExist(err) {
|
|
if err := os.MkdirAll(r.cwd, os.ModePerm); err != nil {
|
|
r.Errorf("error creating worker directory: %v", err)
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Get file list from worker
|
|
workerFiles, err := utils.ScanDirectory(r.cwd)
|
|
if err != nil {
|
|
r.Errorf("error scanning worker directory: %v", err)
|
|
return err
|
|
}
|
|
|
|
// Delete files that are deleted on master node
|
|
for path, workerFile := range workerFiles {
|
|
if _, exists := masterFilesMap[path]; !exists {
|
|
r.Infof("deleting file: %s", path)
|
|
err := os.Remove(workerFile.FullPath)
|
|
if err != nil {
|
|
r.Errorf("error deleting file: %v", err)
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
// Download new or modified files
|
|
downloadCount := 0
|
|
for path, masterFile := range masterFilesMap {
|
|
// Skip directories
|
|
if masterFile.IsDir {
|
|
continue
|
|
}
|
|
|
|
workerFile, exists := workerFiles[path]
|
|
needsDownload := false
|
|
|
|
if !exists {
|
|
r.Debugf("file not found locally: %s", path)
|
|
needsDownload = true
|
|
} else if workerFile.Hash != masterFile.Hash {
|
|
r.Debugf("file hash mismatch: %s (local: %s, master: %s)", path, workerFile.Hash, masterFile.Hash)
|
|
needsDownload = true
|
|
}
|
|
|
|
if needsDownload {
|
|
// For git spiders with sub-folders, prepend GitRootPath to the download path
|
|
// because the server stores files at workspace/{git_id}/{git_root_path}/...
|
|
downloadPath := path
|
|
if workingDir != "" {
|
|
downloadPath = filepath.Join(workingDir, path)
|
|
}
|
|
if err := r.downloadFileGRPC(syncClient, syncId, downloadPath); err != nil {
|
|
r.Errorf("error downloading file %s: %v", path, err)
|
|
return err
|
|
}
|
|
downloadCount++
|
|
}
|
|
}
|
|
|
|
r.Infof("file synchronization complete: %d files downloaded", downloadCount)
|
|
return nil
|
|
}
|
|
|
|
// downloadFileGRPC downloads a single file from master via gRPC streaming
|
|
func (r *Runner) downloadFileGRPC(client grpc2.SyncServiceClient, spiderId, path string) error {
|
|
r.Debugf("downloading file via gRPC: %s", path)
|
|
|
|
// Prepare request
|
|
req := &grpc2.FileDownloadRequest{
|
|
SpiderId: spiderId,
|
|
Path: path,
|
|
NodeKey: utils.GetNodeKey(),
|
|
}
|
|
|
|
// Create context with timeout
|
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
|
defer cancel()
|
|
|
|
// Stream file download
|
|
stream, err := client.StreamFileDownload(ctx, req)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to start download stream: %w", err)
|
|
}
|
|
|
|
// Create target file
|
|
targetPath := fmt.Sprintf("%s/%s", r.cwd, path)
|
|
|
|
// Create directory if not exists
|
|
targetDir := filepath.Dir(targetPath)
|
|
if err := os.MkdirAll(targetDir, os.ModePerm); err != nil {
|
|
return fmt.Errorf("failed to create directory: %w", err)
|
|
}
|
|
|
|
file, err := os.Create(targetPath)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create file: %w", err)
|
|
}
|
|
defer file.Close()
|
|
|
|
// Receive file content in chunks
|
|
bytesReceived := int64(0)
|
|
for {
|
|
chunk, err := stream.Recv()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return fmt.Errorf("error receiving download chunk: %w", err)
|
|
}
|
|
|
|
// Check for error in chunk
|
|
if chunk.Error != "" {
|
|
return fmt.Errorf("server error during download: %s", chunk.Error)
|
|
}
|
|
|
|
// Write chunk data
|
|
if len(chunk.Data) > 0 {
|
|
n, err := file.Write(chunk.Data)
|
|
if err != nil {
|
|
return fmt.Errorf("error writing file: %w", err)
|
|
}
|
|
bytesReceived += int64(n)
|
|
}
|
|
|
|
if chunk.IsComplete {
|
|
r.Debugf("download complete: %s (%d bytes)", path, bytesReceived)
|
|
break
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|