feat: added modules

This commit is contained in:
Marvin Zhang
2024-06-14 15:42:50 +08:00
parent 4d0adcb6f0
commit c4d795f47f
626 changed files with 60104 additions and 0 deletions

344
core/export/csv_service.go Normal file
View File

@@ -0,0 +1,344 @@
package export
import (
"context"
"encoding/csv"
"errors"
"fmt"
"github.com/ReneKroon/ttlcache"
"github.com/apex/log"
"github.com/crawlab-team/crawlab-db/mongo"
"github.com/crawlab-team/crawlab/core/constants"
"github.com/crawlab-team/crawlab/core/entity"
"github.com/crawlab-team/crawlab/core/interfaces"
"github.com/crawlab-team/crawlab/core/utils"
"github.com/crawlab-team/go-trace"
"github.com/hashicorp/go-uuid"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/bson/primitive"
mongo2 "go.mongodb.org/mongo-driver/mongo"
"os"
"path"
"sort"
"strconv"
"time"
)
// CsvService runs CSV export jobs and tracks them in an in-memory TTL cache.
type CsvService struct {
// cache holds the export entries, keyed by export id.
cache *ttlcache.Cache
}
// GenerateId creates a fresh UUID string used to identify an export job.
// When UUID generation fails, an empty id and the traced error are returned.
func (svc *CsvService) GenerateId() (exportId string, err error) {
	if exportId, err = uuid.GenerateUUID(); err != nil {
		return "", trace.TraceError(err)
	}
	return exportId, nil
}
// Export registers a new CSV export job and starts it in a background goroutine.
//
// exportType and target are stored on the export entry; target is the name of
// the collection that is exported and filter restricts the exported records.
// The returned exportId can be passed to GetExport to poll the job.
//
// An error is returned when no id can be generated or when the export
// directory cannot be prepared.
func (svc *CsvService) Export(exportType, target string, filter interfaces.Filter) (exportId string, err error) {
	// generate export id
	exportId, err = svc.GenerateId()
	if err != nil {
		return "", err
	}

	// Resolve the download path exactly once and derive the file name from it.
	// The file name embeds a timestamp, so computing it twice could produce a
	// FileName that does not match DownloadPath.
	downloadPath := svc.getDownloadPath(exportId)
	if downloadPath == "" {
		// getDownloadPath yields "" only when the export directory is unavailable
		return "", trace.TraceError(errors.New("unable to prepare export directory"))
	}

	// export
	export := &entity.Export{
		Id:           exportId,
		Type:         exportType,
		Target:       target,
		Filter:       filter,
		Status:       constants.TaskStatusRunning,
		StartTs:      time.Now(),
		FileName:     path.Base(downloadPath),
		DownloadPath: downloadPath,
		Limit:        100,
	}

	// save to cache
	svc.cache.Set(exportId, export)

	// execute export
	go svc.export(export)

	return exportId, nil
}
// GetExport looks up the export entry stored in the cache under exportId.
// It returns an "export not found" error when the cache has no such entry.
func (svc *CsvService) GetExport(exportId string) (export interfaces.Export, err error) {
	cached, found := svc.cache.Get(exportId)
	if found {
		return cached.(interfaces.Export), nil
	}
	return nil, trace.TraceError(errors.New("export not found"))
}
// export runs the CSV export job described by export and records the outcome
// (status, end time) on the entry, which is then stored back into the cache.
//
// The CSV file is flushed and closed BEFORE the final status is published, so
// a caller that observes the finished status can read a complete file.
func (svc *CsvService) export(export *entity.Export) {
	// check empty
	if export.Target == "" {
		svc.failExport(export, errors.New("empty target"))
		return
	}

	// mongo query
	query, err := utils.FilterToQuery(export.Filter)
	if err != nil {
		svc.failExport(export, err)
		return
	}

	// csv writer; writer and file are only used once creation has succeeded
	csvWriter, csvFile, err := svc.getCsvWriter(export)
	if err != nil {
		svc.failExport(export, err)
		return
	}

	// write the content, then flush and close, keeping the first error seen
	err = svc.writeCsv(export, query, csvWriter, csvFile)
	csvWriter.Flush()
	if err == nil {
		err = csvWriter.Error()
	}
	if closeErr := csvFile.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		svc.failExport(export, err)
		return
	}

	// finished
	export.Status = constants.TaskStatusFinished
	export.EndTs = time.Now()
	log.Infof("export finished (id: %s)", export.Id)
	svc.cache.Set(export.Id, export)
}

// failExport marks the export as failed with err, logs it and updates the cache.
func (svc *CsvService) failExport(export *entity.Export, err error) {
	export.Status = constants.TaskStatusError
	export.EndTs = time.Now()
	log.Errorf("export error (id: %s): %v", export.Id, err)
	trace.PrintError(err)
	svc.cache.Set(export.Id, export)
}

// writeCsv writes the UTF-8 BOM, the header row and one row per matching
// record to the CSV file. The caller is responsible for the final flush/close.
func (svc *CsvService) writeCsv(export *entity.Export, query bson.M, csvWriter *csv.Writer, csvFile *os.File) error {
	ctx := context.Background()

	// mongo cursor
	cur := mongo.GetMongoCol(export.Target).Find(query, nil).GetCursor()
	if cur == nil {
		// NOTE(review): guards against a missing cursor (e.g. the find could
		// not be started) — confirm against GetCursor's contract.
		return errors.New("unable to open cursor")
	}
	defer func() {
		// best-effort release of the cursor; the export result does not depend on it
		_ = cur.Close(ctx)
	}()

	// write bom directly to the file; nothing has been flushed by the csv writer yet
	if _, err := csvFile.Write([]byte{0xEF, 0xBB, 0xBF}); err != nil {
		return err
	}

	// write csv header row
	columns, err := svc.getColumns(query, export)
	if err != nil {
		return err
	}
	if err := csvWriter.Write(columns); err != nil {
		return err
	}
	csvWriter.Flush()
	if err := csvWriter.Error(); err != nil {
		return err
	}

	// iterate cursor
	pending := 0
	for cur.Next(ctx) {
		// convert raw data to entity
		var data bson.M
		if err := cur.Decode(&data); err != nil {
			return err
		}

		// write csv row cells
		if err := csvWriter.Write(svc.getRowCells(columns, data)); err != nil {
			return err
		}

		// flush if limit reached
		pending++
		if pending >= export.Limit {
			csvWriter.Flush()
			pending = 0
		}
	}

	// Next returns false both at the end of the data and on failure, so the
	// cursor error decides whether the iteration really completed.
	if err := cur.Err(); err != nil && !errors.Is(err, mongo2.ErrNoDocuments) {
		return err
	}
	return nil
}
// getExportDir returns the directory that holds CSV export files
// (<tempDir>/export/csv), creating it when necessary.
//
// os.MkdirAll is a no-op for an existing directory, so no separate existence
// check is needed; it also reports an error when the path exists but is not a
// directory.
func (svc *CsvService) getExportDir() (dir string, err error) {
	exportDir := path.Join(os.TempDir(), "export", "csv")
	if err := os.MkdirAll(exportDir, 0755); err != nil {
		return "", err
	}
	return exportDir, nil
}
// getFileName builds the CSV file name for an export:
// <exportId>_<timestamp>.csv, with the current time formatted as yyyyMMddHHmmss.
func (svc *CsvService) getFileName(exportId string) (fileName string) {
	timestamp := time.Now().Format("20060102150405")
	fileName = exportId + "_" + timestamp + ".csv"
	return fileName
}
// getDownloadPath returns the full path of the CSV file for the export
// format: <tempDir>/export/csv/<exportId>_<timestamp>.csv
// An empty string is returned when the export directory cannot be prepared.
func (svc *CsvService) getDownloadPath(exportId string) (downloadPath string) {
	if exportDir, err := svc.getExportDir(); err == nil {
		return path.Join(exportDir, svc.getFileName(exportId))
	}
	return ""
}
// getCsvWriter creates (or truncates) the file at export.DownloadPath and
// returns a CSV writer on top of it together with the file itself.
// When the file cannot be created, both results are nil and the traced error
// is returned.
func (svc *CsvService) getCsvWriter(export *entity.Export) (csvWriter *csv.Writer, csvFile *os.File, err error) {
	f, createErr := os.Create(export.DownloadPath)
	if createErr != nil {
		return nil, nil, trace.TraceError(createErr)
	}
	return csv.NewWriter(f), f, nil
}
// getColumns derives the CSV header from the data: it loads up to 10 records
// matching query from the export's target collection, collects the union of
// their keys, drops the task key and "_id", and returns the rest sorted
// alphabetically.
func (svc *CsvService) getColumns(query bson.M, export interfaces.Export) (columns []string, err error) {
	// sample up to 10 records from the target collection
	var samples []bson.M
	findErr := mongo.GetMongoCol(export.GetTarget()).Find(query, &mongo.FindOptions{Limit: 10}).All(&samples)
	if findErr != nil {
		return nil, trace.TraceError(findErr)
	}

	// union of all keys seen in the samples
	seen := make(map[string]bool)
	for _, doc := range samples {
		for key := range doc {
			seen[key] = true
		}
	}

	// keep everything except the task key and _id
	columns = make([]string, 0, len(seen))
	for key := range seen {
		if key != constants.TaskKey && key != "_id" {
			columns = append(columns, key)
		}
	}

	// map iteration order is random, so sort for a stable header
	sort.Strings(columns)

	return columns, nil
}
// getRowCells renders one record as CSV cells, one cell per entry of columns
// and in the same order. A column missing from data becomes an empty cell;
// values of types without a dedicated case are rendered with "%v".
func (svc *CsvService) getRowCells(columns []string, data bson.M) (cells []string) {
	for _, name := range columns {
		cell := ""
		if raw, found := data[name]; found {
			switch val := raw.(type) {
			case string:
				cell = val
			case time.Time:
				cell = val.Format("2006-01-02 15:04:05")
			case int:
				cell = strconv.Itoa(val)
			case int32:
				cell = strconv.Itoa(int(val))
			case int64:
				cell = strconv.FormatInt(val, 10)
			case float32:
				cell = strconv.FormatFloat(float64(val), 'f', -1, 32)
			case float64:
				cell = strconv.FormatFloat(val, 'f', -1, 64)
			case bool:
				cell = strconv.FormatBool(val)
			case primitive.ObjectID:
				cell = val.Hex()
			case primitive.DateTime:
				cell = val.Time().Format("2006-01-02 15:04:05")
			default:
				cell = fmt.Sprintf("%v", val)
			}
		}
		cells = append(cells, cell)
	}
	return cells
}
// NewCsvService creates a CsvService whose export cache uses a TTL of five minutes.
func NewCsvService() (svc2 interfaces.ExportService) {
	exportCache := ttlcache.NewCache()
	exportCache.SetTTL(5 * time.Minute)
	return &CsvService{cache: exportCache}
}
// _csvService is the package-level CsvService instance, created on first use.
var _csvService interfaces.ExportService

// GetCsvService returns the package-level CSV export service, creating it on
// the first call. The lazy initialization is not guarded by any lock.
func GetCsvService() (svc interfaces.ExportService) {
	if _csvService != nil {
		return _csvService
	}
	_csvService = NewCsvService()
	return _csvService
}

View File

@@ -0,0 +1,139 @@
package export
import (
"encoding/csv"
"fmt"
"github.com/crawlab-team/crawlab-db/mongo"
"github.com/crawlab-team/crawlab/core/constants"
"github.com/stretchr/testify/require"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/bson/primitive"
"os"
"strconv"
"testing"
"time"
)
// TestCsvService_Export inserts ten records into a test collection, exports
// the collection to CSV and verifies the generated file cell by cell.
func TestCsvService_Export(t *testing.T) {
	// test data rows
	var rows []interface{}
	for i := 0; i < 10; i++ {
		rows = append(rows, bson.M{
			"no":              i,
			"string_field":    "test",
			"int_field":       1,
			"float_field":     1.1,
			"bool_field":      true,
			"time_field":      time.Now(),
			"object_id_field": primitive.NewObjectID(),
		})
	}

	// test mongo collection
	collectionName := "test_collection"
	collection := mongo.GetMongoCol(collectionName)

	// delete records of test mongo collection after test
	t.Cleanup(func() {
		_ = collection.Delete(bson.M{})
	})

	// save test data rows to mongo collection
	_, err := collection.InsertMany(rows)
	require.Nil(t, err)

	// export service
	csvSvc := NewCsvService()

	// export
	exportId, err := csvSvc.Export(collectionName, collectionName, nil)
	require.Nil(t, err)

	// get export
	export, err := csvSvc.GetExport(exportId)
	require.Nil(t, err)
	require.NotNil(t, export)
	require.Equal(t, exportId, export.GetId())
	// the path is a string, so it has to be checked for emptiness, not nil-ness
	require.NotEmpty(t, export.GetDownloadPath())

	// wait for export to finish with timeout of 5 seconds; fail fast on error status
	deadline := time.Now().Add(5 * time.Second)
	for export.GetStatus() != constants.TaskStatusFinished {
		require.NotEqual(t, constants.TaskStatusError, export.GetStatus(), "export failed")
		if time.Now().After(deadline) {
			t.Fatal("export timeout")
		}
		time.Sleep(100 * time.Millisecond)
	}

	// export file path
	exportFilePath := export.GetDownloadPath()
	require.FileExists(t, exportFilePath)

	// csv file
	csvFile, err := os.Open(exportFilePath)
	require.Nil(t, err)
	defer csvFile.Close()

	// csv file rows (header row + one row per record)
	csvFileRows, err := csv.NewReader(csvFile).ReadAll()
	require.Nil(t, err)
	require.Equal(t, len(rows), len(csvFileRows)-1)

	// csv file columns
	csvFileColumns := csvFileRows[0]
	require.NotEmpty(t, csvFileColumns)

	// The file starts with a UTF-8 BOM which the csv reader leaves attached to
	// the first header cell; strip it so that the first column is verified too.
	const bom = "\xEF\xBB\xBF"
	if first := csvFileColumns[0]; len(first) >= len(bom) && first[:len(bom)] == bom {
		csvFileColumns[0] = first[len(bom):]
	}

	// iterate csv file records and compare with test data rows
	for i, row := range rows {
		expected := row.(bson.M)

		// csv file record
		csvFileRecord := csvFileRows[i+1]

		// iterate csv file columns and compare with test data row
		for j, column := range csvFileColumns {
			// csv file column value
			csvFileColumnValue := csvFileRecord[j]

			// compare csv file column value with test data row
			switch column {
			case "no":
				require.Equal(t, strconv.Itoa(expected["no"].(int)), csvFileColumnValue)
			case "string_field":
				require.Equal(t, expected["string_field"], csvFileColumnValue)
			case "int_field":
				require.Equal(t, fmt.Sprintf("%d", expected["int_field"]), csvFileColumnValue)
			case "float_field":
				// convert string to float
				floatValue, err := strconv.ParseFloat(csvFileColumnValue, 64)
				require.Nil(t, err)
				require.Equal(t, expected["float_field"].(float64), floatValue)
			case "bool_field":
				require.Equal(t, fmt.Sprintf("%t", expected["bool_field"]), csvFileColumnValue)
			case "time_field":
				// the export renders times with second precision
				stringValue := expected["time_field"].(time.Time).Format("2006-01-02 15:04:05")
				require.Equal(t, stringValue, csvFileColumnValue)
			case "object_id_field":
				require.Equal(t, expected["object_id_field"].(primitive.ObjectID).Hex(), csvFileColumnValue)
			}
		}
	}
}

223
core/export/json_service.go Normal file
View File

@@ -0,0 +1,223 @@
package export
import (
"context"
"encoding/json"
"errors"
"github.com/ReneKroon/ttlcache"
"github.com/apex/log"
"github.com/crawlab-team/crawlab-db/mongo"
"github.com/crawlab-team/crawlab/core/constants"
"github.com/crawlab-team/crawlab/core/entity"
"github.com/crawlab-team/crawlab/core/interfaces"
"github.com/crawlab-team/crawlab/core/utils"
"github.com/crawlab-team/go-trace"
"github.com/hashicorp/go-uuid"
mongo2 "go.mongodb.org/mongo-driver/mongo"
"os"
"path"
"time"
)
// JsonService runs JSON export jobs and tracks them in an in-memory TTL cache.
type JsonService struct {
// cache holds the export entries, keyed by export id.
cache *ttlcache.Cache
}
// GenerateId produces a new UUID string that identifies an export job.
// If the UUID cannot be generated, the id is empty and the traced error is returned.
func (svc *JsonService) GenerateId() (exportId string, err error) {
	id, genErr := uuid.GenerateUUID()
	if genErr != nil {
		return "", trace.TraceError(genErr)
	}
	return id, nil
}
// Export registers a new JSON export job and starts it in a background goroutine.
//
// exportType and target are stored on the export entry; target is the name of
// the collection that is exported and filter restricts the exported records.
// The returned exportId can be passed to GetExport to poll the job.
//
// An error is returned when no id can be generated or when the export
// directory cannot be prepared.
func (svc *JsonService) Export(exportType, target string, filter interfaces.Filter) (exportId string, err error) {
	// generate export id
	exportId, err = svc.GenerateId()
	if err != nil {
		return "", err
	}

	// Resolve the download path exactly once and derive the file name from it.
	// The file name embeds a timestamp, so computing it twice could produce a
	// FileName that does not match DownloadPath.
	downloadPath := svc.getDownloadPath(exportId)
	if downloadPath == "" {
		// getDownloadPath yields "" only when the export directory is unavailable
		return "", trace.TraceError(errors.New("unable to prepare export directory"))
	}

	// export
	export := &entity.Export{
		Id:           exportId,
		Type:         exportType,
		Target:       target,
		Filter:       filter,
		Status:       constants.TaskStatusRunning,
		StartTs:      time.Now(),
		FileName:     path.Base(downloadPath),
		DownloadPath: downloadPath,
		Limit:        100,
	}

	// save to cache
	svc.cache.Set(exportId, export)

	// execute export
	go svc.export(export)

	return exportId, nil
}
// GetExport fetches the export entry cached under exportId.
// A missing entry yields an "export not found" error.
func (svc *JsonService) GetExport(exportId string) (export interfaces.Export, err error) {
	entry, hit := svc.cache.Get(exportId)
	if !hit {
		err = trace.TraceError(errors.New("export not found"))
		return nil, err
	}
	return entry.(interfaces.Export), nil
}
// export runs the JSON export job described by export: it reads every record
// matching the filter from the target collection, serializes them as one JSON
// array and writes that array to export.DownloadPath.
//
// The outcome (status, end time) is recorded on the entry and stored back into
// the cache. The finished status is only published after the file has been
// written and closed, so a caller that observes it can read a complete file.
func (svc *JsonService) export(export *entity.Export) {
	// check empty
	if export.Target == "" {
		svc.failExport(export, errors.New("empty target"))
		return
	}

	// mongo query
	query, err := utils.FilterToQuery(export.Filter)
	if err != nil {
		svc.failExport(export, err)
		return
	}

	// mongo cursor
	ctx := context.Background()
	cur := mongo.GetMongoCol(export.Target).Find(query, nil).GetCursor()
	if cur == nil {
		// NOTE(review): guards against a missing cursor (e.g. the find could
		// not be started) — confirm against GetCursor's contract.
		svc.failExport(export, errors.New("unable to open cursor"))
		return
	}
	defer func() {
		// best-effort release of the cursor; the export result does not depend on it
		_ = cur.Close(ctx)
	}()

	// data; initialized non-nil so that an empty result is written as [] rather than null
	jsonData := make([]interface{}, 0)

	// iterate cursor
	for cur.Next(ctx) {
		// convert raw data to entity
		var data map[string]interface{}
		if err := cur.Decode(&data); err != nil {
			svc.failExport(export, err)
			return
		}
		jsonData = append(jsonData, data)
	}

	// Next returns false both at the end of the data and on failure, so the
	// cursor error decides whether the iteration really completed.
	if err := cur.Err(); err != nil && !errors.Is(err, mongo2.ErrNoDocuments) {
		svc.failExport(export, err)
		return
	}

	// serialize
	jsonBytes, err := json.Marshal(jsonData)
	if err != nil {
		svc.failExport(export, err)
		return
	}

	// write file
	if err := svc.writeJsonFile(export.DownloadPath, jsonBytes); err != nil {
		svc.failExport(export, err)
		return
	}

	// finished
	export.Status = constants.TaskStatusFinished
	export.EndTs = time.Now()
	log.Infof("export finished (id: %s)", export.Id)
	svc.cache.Set(export.Id, export)
}

// failExport marks the export as failed with err, logs it and updates the cache.
func (svc *JsonService) failExport(export *entity.Export, err error) {
	export.Status = constants.TaskStatusError
	export.EndTs = time.Now()
	log.Errorf("export error (id: %s): %v", export.Id, err)
	trace.PrintError(err)
	svc.cache.Set(export.Id, export)
}

// writeJsonFile writes content to the file at filePath and closes the file,
// returning the first error encountered.
func (svc *JsonService) writeJsonFile(filePath string, content []byte) error {
	// NOTE(review): assumes utils.OpenFile hands back an *os.File-like handle
	// whose methods report an error when the file could not be opened — confirm.
	f := utils.OpenFile(filePath)
	_, err := f.WriteString(string(content))
	if closeErr := f.Close(); err == nil {
		err = closeErr
	}
	return err
}
// getExportDir returns the directory that holds JSON export files
// (<tempDir>/export/json), creating it when necessary.
//
// os.MkdirAll is a no-op for an existing directory, so no separate existence
// check is needed; it also reports an error when the path exists but is not a
// directory.
func (svc *JsonService) getExportDir() (dir string, err error) {
	exportDir := path.Join(os.TempDir(), "export", "json")
	if err := os.MkdirAll(exportDir, 0755); err != nil {
		return "", err
	}
	return exportDir, nil
}
// getFileName builds the JSON file name for an export:
// <exportId>_<timestamp>.json, with the current time formatted as yyyyMMddHHmmss.
func (svc *JsonService) getFileName(exportId string) (fileName string) {
	stamp := time.Now().Format("20060102150405")
	fileName = exportId + "_" + stamp + ".json"
	return fileName
}
// getDownloadPath returns the full path of the JSON file for the export
// format: <tempDir>/export/json/<exportId>_<timestamp>.json
// An empty string is returned when the export directory cannot be prepared.
func (svc *JsonService) getDownloadPath(exportId string) (downloadPath string) {
	if exportDir, err := svc.getExportDir(); err == nil {
		return path.Join(exportDir, svc.getFileName(exportId))
	}
	return ""
}
// NewJsonService creates a JsonService whose export cache uses a TTL of five minutes.
func NewJsonService() (svc2 interfaces.ExportService) {
	exportCache := ttlcache.NewCache()
	exportCache.SetTTL(5 * time.Minute)
	return &JsonService{cache: exportCache}
}
// _jsonService is the package-level JsonService instance, created on first use.
var _jsonService interfaces.ExportService

// GetJsonService returns the package-level JSON export service, creating it on
// the first call. The lazy initialization is not guarded by any lock.
func GetJsonService() (svc interfaces.ExportService) {
	if _jsonService != nil {
		return _jsonService
	}
	_jsonService = NewJsonService()
	return _jsonService
}