feat: added modules

This commit is contained in:
Marvin Zhang
2024-06-14 15:42:50 +08:00
parent f1833fed21
commit 0b67fd9ece
626 changed files with 60104 additions and 0 deletions

16
core/result/options.go Normal file
View File

@@ -0,0 +1,16 @@
package result
import "go.mongodb.org/mongo-driver/bson/primitive"
// Option mutates an Options value; see WithRegistryKey for an example.
type Option func(opts *Options)
// Options holds the settings that Option functions can adjust.
type Options struct {
	registryKey string // key of a result service implementation in the registry
	// NOTE(review): the previous comment on this field read "data source id",
	// which does not match the field name — confirm which id is meant.
	SpiderId primitive.ObjectID
}
// WithRegistryKey returns an Option that sets the registry key on the Options
// it is applied to.
func WithRegistryKey(key string) Option {
	apply := func(o *Options) {
		o.registryKey = key
	}
	return apply
}

89
core/result/service.go Normal file
View File

@@ -0,0 +1,89 @@
package result
import (
"fmt"
"github.com/crawlab-team/crawlab/core/errors"
"github.com/crawlab-team/crawlab/core/interfaces"
"github.com/crawlab-team/crawlab/core/models/models"
"github.com/crawlab-team/crawlab/core/models/service"
"github.com/crawlab-team/go-trace"
"go.mongodb.org/mongo-driver/bson/primitive"
"sync"
)
// NewResultService builds a result service for spider s.
//
// An empty registryKey selects the MongoDB implementation. Any other key is
// looked up in the result service registry; an unknown key yields a result
// error. The chosen factory receives the spider's collection id and data
// source id, and its error (if any) is wrapped with trace.TraceError.
func NewResultService(registryKey string, s *models.Spider) (svc2 interfaces.ResultService, err error) {
	var factory interfaces.ResultServiceRegistryFn
	switch registryKey {
	case "":
		// default implementation
		factory = NewResultServiceMongo
	default:
		// implementation registered under the given key
		factory = GetResultServiceRegistry().Get(registryKey)
		if factory == nil {
			return nil, errors.NewResultError(fmt.Sprintf("%s is not implemented", registryKey))
		}
	}

	created, err := factory(s.ColId, s.DataSourceId)
	if err != nil {
		return nil, trace.TraceError(err)
	}
	return created, nil
}
// store caches result services created by GetResultService, keyed by
// "<collection id hex>:<data source id hex>".
var store = sync.Map{}
// GetResultService returns the result service for the spider identified by
// spiderId, creating and caching it on first use.
//
// Services are cached per (collection id, data source id) pair, so spiders
// sharing both also share one service instance. When the spider's data source
// can be loaded, its Type is used as the registry key; otherwise the key stays
// empty and NewResultService picks its default implementation.
//
// Errors from obtaining the model service or loading the spider are wrapped
// with trace.TraceError; an error from NewResultService is returned as is.
//
// NOTE(review): opts are applied to an Options value that is never read
// afterwards, so WithRegistryKey currently has no effect here — confirm
// whether it is meant to override the data source type.
func GetResultService(spiderId primitive.ObjectID, opts ...Option) (svc2 interfaces.ResultService, err error) {
	// model service
	modelSvc, err := service.GetService()
	if err != nil {
		return nil, trace.TraceError(err)
	}

	// spider
	s, err := modelSvc.GetSpiderById(spiderId)
	if err != nil {
		return nil, trace.TraceError(err)
	}

	// apply options
	_opts := &Options{}
	for _, opt := range opts {
		opt(_opts)
	}

	// store key
	storeKey := s.ColId.Hex() + ":" + s.DataSourceId.Hex()

	// fast path: reuse an already cached service
	if res, found := store.Load(storeKey); found {
		if cached, ok := res.(interfaces.ResultService); ok {
			return cached, nil
		}
	}

	// registry key; a failed data source lookup is tolerated on purpose and
	// simply leaves the key empty
	var registryKey string
	ds, _ := modelSvc.GetDataSourceById(s.DataSourceId)
	if ds != nil {
		registryKey = ds.Type
	}

	// create a new result service
	svc, err := NewResultService(registryKey, s)
	if err != nil {
		return nil, err
	}

	// Publish with LoadOrStore rather than Store: if another goroutine cached a
	// service for the same key in the meantime, every caller must end up with
	// that one instance instead of each holding its own.
	actual, loaded := store.LoadOrStore(storeKey, svc)
	if loaded {
		if cached, ok := actual.(interfaces.ResultService); ok {
			return cached, nil
		}
		// the existing entry is not a result service; replace it
		store.Store(storeKey, svc)
	}
	return svc, nil
}

View File

@@ -0,0 +1,146 @@
package result
import (
"time"
"github.com/crawlab-team/crawlab-db/generic"
"github.com/crawlab-team/crawlab-db/mongo"
"github.com/crawlab-team/crawlab/core/constants"
"github.com/crawlab-team/crawlab/core/interfaces"
"github.com/crawlab-team/crawlab/core/models/models"
"github.com/crawlab-team/crawlab/core/models/service"
"github.com/crawlab-team/crawlab/core/utils"
"github.com/crawlab-team/go-trace"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/bson/primitive"
mongo2 "go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)
// ServiceMongo is the result service implementation created by
// NewResultServiceMongo; it reads and writes results through MongoDB
// collections.
type ServiceMongo struct {
	// dependencies
	modelSvc    service.ModelService        // used to load the data collection record
	modelColSvc interfaces.ModelBaseService // used by List and Count to query results

	// internals
	colId primitive.ObjectID     // _id of models.DataCollection
	dc    *models.DataCollection // models.DataCollection; reassigned periodically by the constructor's goroutine
	t     time.Time              // value exposed through SetTime / GetTime
}
// List converts the generic query and options to their MongoDB form and
// returns the matching results.
func (svc *ServiceMongo) List(query generic.ListQuery, opts *generic.ListOptions) (results []interface{}, err error) {
	return svc.getList(svc.getQuery(query), svc.getOpts(opts))
}
// Count converts the generic query to its MongoDB form and returns the number
// of matching results as reported by the collection's model service.
func (svc *ServiceMongo) Count(query generic.ListQuery) (n int, err error) {
	filter := svc.getQuery(query)
	n, err = svc.modelColSvc.Count(filter)
	return n, err
}
// Insert writes docs into the data collection's MongoDB collection.
//
// Without deduplication (disabled, or no dedup keys configured) all docs are
// written with a single InsertMany. With deduplication each doc gets a hash of
// its dedup keys stored under constants.HashKey and is then handled according
// to the dedup type:
//   - constants.DedupTypeOverwrite: replace the document with the same hash,
//     inserting it when none exists (upsert);
//   - anything else: insert only when no document with that hash exists.
//
// Calling Insert with no docs is a no-op. The first failure aborts the call;
// docs processed before it stay written.
//
// NOTE(review): on the dedup path every doc must implement interfaces.Result;
// the type assertion below panics otherwise.
func (svc *ServiceMongo) Insert(docs ...interface{}) (err error) {
	// The dedup loop is naturally a no-op for empty input; return early so the
	// bulk path agrees instead of handing an empty slice to InsertMany (which
	// the MongoDB driver is expected to reject).
	if len(docs) == 0 {
		return nil
	}

	// Read the data collection once so the whole call works on one snapshot;
	// svc.dc is reassigned periodically by the goroutine started in
	// NewResultServiceMongo.
	dc := svc.dc
	col := mongo.GetMongoCol(dc.Name)

	// plain bulk insert when deduplication is not in effect
	if !dc.Dedup.Enabled || len(dc.Dedup.Keys) == 0 {
		if _, err = col.InsertMany(docs); err != nil {
			return trace.TraceError(err)
		}
		return nil
	}

	upsert := true
	for _, doc := range docs {
		hash, hashErr := utils.GetResultHash(doc, dc.Dedup.Keys)
		if hashErr != nil {
			return hashErr
		}
		doc.(interfaces.Result).SetValue(constants.HashKey, hash)
		query := bson.M{constants.HashKey: hash}

		switch dc.Dedup.Type {
		case constants.DedupTypeOverwrite:
			err = col.ReplaceWithOptions(query, doc, &options.ReplaceOptions{Upsert: &upsert})
			if err != nil {
				return trace.TraceError(err)
			}
		default:
			var existing bson.M
			findErr := col.Find(query, &mongo.FindOptions{Limit: 1}).One(&existing)
			if findErr == nil {
				// exists, ignore
				continue
			}
			if findErr != mongo2.ErrNoDocuments {
				// lookup failed for a reason other than "not found"
				return trace.TraceError(findErr)
			}
			// not exists, insert
			if _, err = col.Insert(doc); err != nil {
				return trace.TraceError(err)
			}
		}
	}
	return nil
}
// Index creates one ascending single-field index per entry in fields on the
// data collection's MongoDB collection. Index creation errors are ignored.
func (svc *ServiceMongo) Index(fields []string) {
	for i := range fields {
		model := mongo2.IndexModel{Keys: bson.M{fields[i]: 1}}
		_ = mongo.GetMongoCol(svc.dc.Name).CreateIndex(model)
	}
}
// SetTime stores t on the service; GetTime returns it.
func (svc *ServiceMongo) SetTime(t time.Time) {
	svc.t = t
}
// GetTime returns the time last set with SetTime, or the construction time
// assigned by NewResultServiceMongo if SetTime was never called.
func (svc *ServiceMongo) GetTime() (t time.Time) {
	return svc.t
}
// getList runs query with opts against the collection's model service and
// returns every returned model that implements interfaces.Result; models that
// do not are skipped. The returned slice is nil when nothing qualifies.
func (svc *ServiceMongo) getList(query bson.M, opts *mongo.FindOptions) (results []interface{}, err error) {
	list, err := svc.modelColSvc.GetList(query, opts)
	if err != nil {
		return nil, err
	}

	for _, item := range list.GetModels() {
		r, isResult := item.(interfaces.Result)
		if !isResult {
			continue
		}
		results = append(results, r)
	}
	return results, nil
}
// getQuery converts a generic list query into a bson.M filter by delegating to
// utils.GetMongoQuery.
func (svc *ServiceMongo) getQuery(query generic.ListQuery) (res bson.M) {
	return utils.GetMongoQuery(query)
}
// getOpts converts generic list options into MongoDB find options by
// delegating to utils.GetMongoOpts.
func (svc *ServiceMongo) getOpts(opts *generic.ListOptions) (res *mongo.FindOptions) {
	return utils.GetMongoOpts(opts)
}
// NewResultServiceMongo creates a MongoDB-backed result service for the data
// collection identified by colId. The second argument (a data source id in the
// registry function signature) is unused by this implementation.
//
// It returns an error when the model service cannot be obtained or the data
// collection cannot be loaded.
//
// The service re-reads its data collection record once per second in a
// background goroutine. NOTE(review): that goroutine has no stop signal and
// writes svc.dc without synchronisation while methods read it — both need a
// change beyond this constructor (e.g. a Close method and a lock) to fix.
func NewResultServiceMongo(colId primitive.ObjectID, _ primitive.ObjectID) (svc2 interfaces.ResultService, err error) {
	// service
	svc := &ServiceMongo{
		colId: colId,
		t:     time.Now(),
	}

	// dependency injection
	svc.modelSvc, err = service.GetService()
	if err != nil {
		return nil, err
	}

	// data collection; fail here instead of dereferencing a missing record
	// below
	svc.dc, err = svc.modelSvc.GetDataCollectionById(colId)
	if err != nil {
		return nil, trace.TraceError(err)
	}

	// data collection model service
	svc.modelColSvc = service.GetBaseServiceByColName(interfaces.ModelIdResult, svc.dc.Name)

	// periodic refresh, started only once construction has succeeded
	go func() {
		for {
			time.Sleep(1 * time.Second)
			dc, refreshErr := svc.modelSvc.GetDataCollectionById(colId)
			if refreshErr != nil || dc == nil {
				// keep the last successfully loaded record rather than
				// replacing it with the result of a failed lookup
				continue
			}
			svc.dc = dc
		}
	}()

	return svc, nil
}

View File

@@ -0,0 +1,48 @@
package result
import (
"github.com/crawlab-team/crawlab/core/interfaces"
"sync"
)
// ServiceRegistry maps registry keys to result service factory functions.
type ServiceRegistry struct {
	// internals
	services sync.Map // key (string) -> interfaces.ResultServiceRegistryFn
}
// Register stores fn under key, replacing any factory previously stored under
// the same key.
func (r *ServiceRegistry) Register(key string, fn interfaces.ResultServiceRegistryFn) {
	r.services.Store(key, fn)
}
// Unregister removes the factory stored under key, if any.
func (r *ServiceRegistry) Unregister(key string) {
	r.services.Delete(key)
}
// Get returns the factory registered under key, or nil when the key is unknown
// or the stored value is not an interfaces.ResultServiceRegistryFn.
func (r *ServiceRegistry) Get(key string) (fn interfaces.ResultServiceRegistryFn) {
	stored, found := r.services.Load(key)
	if !found {
		return nil
	}

	registered, isFn := stored.(interfaces.ResultServiceRegistryFn)
	if !isFn {
		return nil
	}
	return registered
}
// NewResultServiceRegistry returns a new, empty result service registry.
func NewResultServiceRegistry() (r interfaces.ResultServiceRegistry) {
	// the zero value of the embedded sync.Map is an empty, ready-to-use map
	return &ServiceRegistry{}
}
var (
	// _svc is the process-wide registry, created lazily by
	// GetResultServiceRegistry.
	_svc interfaces.ResultServiceRegistry
	// _svcMu guards the lazy initialisation of _svc.
	_svcMu sync.Mutex
)

// GetResultServiceRegistry returns the shared result service registry,
// creating it on first use.
//
// Initialisation is serialised with a mutex so that concurrent first callers
// all receive the same registry; otherwise a factory registered on one
// instance could be invisible to callers holding another.
func GetResultServiceRegistry() (r interfaces.ResultServiceRegistry) {
	_svcMu.Lock()
	defer _svcMu.Unlock()

	if _svc == nil {
		_svc = NewResultServiceRegistry()
	}
	return _svc
}

76
core/result/test/base.go Normal file
View File

@@ -0,0 +1,76 @@
package test
import (
"github.com/crawlab-team/crawlab-db/mongo"
"github.com/crawlab-team/crawlab/core/interfaces"
"github.com/crawlab-team/crawlab/core/models/delegate"
"github.com/crawlab-team/crawlab/core/models/models"
"github.com/crawlab-team/crawlab/core/models/service"
"github.com/crawlab-team/crawlab/core/result"
"go.uber.org/dig"
"testing"
)
// init builds the shared fixture T when the package is loaded. NewTest panics
// on failure, so a broken environment aborts the test binary at start-up.
func init() {
	T = NewTest()
}
// T is the fixture shared by all tests in this package; it is set in init.
var T *Test
// Test bundles the services and test data used by the result service tests.
type Test struct {
	// dependencies
	modelSvc  service.ModelService     // resolved through the dig container in NewTest
	resultSvc interfaces.ResultService // service under test

	// test data
	TestColName string                 // name of the MongoDB collection used for test results
	TestCol     *mongo.Col             // handle on the collection named TestColName
	TestDc      *models.DataCollection // data collection record whose Name is TestColName
}
// Setup registers the fixture's Cleanup to run when test t2 finishes.
func (t *Test) Setup(t2 *testing.T) {
	t2.Cleanup(t.Cleanup)
}
// Cleanup calls DropAll on the model service; any error it returns is ignored.
func (t *Test) Cleanup() {
	_ = t.modelSvc.DropAll()
}
// NewTest builds the test fixture: it resolves the model service through a dig
// container, stores a data collection record named "test_results", opens the
// matching MongoDB collection and obtains the result service under test.
// Any failure panics.
func NewTest() *Test {
	t := &Test{TestColName: "test_results"}

	// dependency injection
	container := dig.New()
	if provideErr := container.Provide(service.NewService); provideErr != nil {
		panic(provideErr)
	}
	inject := func(modelSvc service.ModelService) {
		t.modelSvc = modelSvc
	}
	if invokeErr := container.Invoke(inject); invokeErr != nil {
		panic(invokeErr)
	}

	// data collection
	t.TestDc = &models.DataCollection{Name: t.TestColName}
	if addErr := delegate.NewModelDelegate(t.TestDc).Add(); addErr != nil {
		panic(addErr)
	}
	t.TestCol = mongo.GetMongoCol(t.TestColName)

	// result service
	// NOTE(review): GetResultService takes a spider id, but the id passed here
	// is the data collection's — confirm this is intended.
	resultSvc, err := result.GetResultService(t.TestDc.GetId())
	if err != nil {
		panic(err)
	}
	t.resultSvc = resultSvc

	return t
}

View File

@@ -0,0 +1,67 @@
package test
import (
"github.com/crawlab-team/crawlab/core/models/models"
"github.com/stretchr/testify/require"
"testing"
)
// TestResultService_GetList inserts n documents straight into the test
// collection and checks that List with a nil query and nil options returns all
// of them.
func TestResultService_GetList(t *testing.T) {
	T.Setup(t)

	const n = 1000
	docs := make([]interface{}, n)
	for i := range docs {
		docs[i] = &models.Result{
			"i": i,
		}
	}
	_, err := T.TestCol.InsertMany(docs)
	require.Nil(t, err)

	// get all
	results, err := T.resultSvc.List(nil, nil)
	require.Nil(t, err)
	require.Equal(t, n, len(results))

	// filtered listing, currently disabled:
	//query := bson.M{
	//	"i": bson.M{
	//		"$lt": n / 2,
	//	},
	//}
	//results, err = T.resultSvc.List(query, nil)
	//require.Nil(t, err)
	//require.Equal(t, n/2, len(results))
}
// TestResultService_Count inserts n documents straight into the test
// collection and checks that Count with a nil query reports n.
func TestResultService_Count(t *testing.T) {
	T.Setup(t)

	const n = 1000
	docs := make([]interface{}, n)
	for i := range docs {
		docs[i] = &models.Result{
			"i": i,
		}
	}
	_, err := T.TestCol.InsertMany(docs)
	require.Nil(t, err)

	// get all
	total, err := T.resultSvc.Count(nil)
	require.Nil(t, err)
	require.Equal(t, n, total)

	// filtered count, currently disabled:
	//query := bson.M{
	//	"i": bson.M{
	//		"$lt": n / 2,
	//	},
	//}
	//total, err = T.resultSvc.Count(query)
	//require.Nil(t, err)
	//require.Equal(t, n/2, total)
}