mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-21 17:21:09 +01:00
支持去重
This commit is contained in:
@@ -59,12 +59,17 @@ type Spider struct {
|
||||
// 长任务
|
||||
IsLongTask bool `json:"is_long_task" bson:"is_long_task"` // 是否为长任务
|
||||
|
||||
// 去重
|
||||
IsDedup bool `json:"is_dedup" bson:"is_dedup"` // 是否去重
|
||||
DedupField string `json:"dedup_field" bson:"dedup_field"` // 去重字段
|
||||
DedupMethod string `json:"dedup_method" bson:"dedup_method"` // 去重方式
|
||||
|
||||
// 前端展示
|
||||
LastRunTs time.Time `json:"last_run_ts"` // 最后一次执行时间
|
||||
LastStatus string `json:"last_status"` // 最后执行状态
|
||||
Config entity.ConfigSpiderData `json:"config"` // 可配置爬虫配置
|
||||
LatestTasks []Task `json:"latest_tasks"` // 最近任务列表
|
||||
Username string `json:"username""`
|
||||
Username string `json:"username"` // 用户名称
|
||||
|
||||
// 时间
|
||||
UserId bson.ObjectId `json:"user_id" bson:"user_id"`
|
||||
|
||||
@@ -158,6 +158,19 @@ func PostSpider(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// 获取爬虫
|
||||
spider, err := model.GetSpider(bson.ObjectIdHex(id))
|
||||
if err != nil {
|
||||
HandleError(http.StatusInternalServerError, c, err)
|
||||
return
|
||||
}
|
||||
|
||||
// 去重处理
|
||||
if err := services.UpdateSpiderDedup(spider); err != nil {
|
||||
HandleError(http.StatusInternalServerError, c, err)
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, Response{
|
||||
Status: "ok",
|
||||
Message: "success",
|
||||
|
||||
@@ -433,6 +433,27 @@ func CopySpider(spider model.Spider, newName string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func UpdateSpiderDedup(spider model.Spider) error {
|
||||
s, c := database.GetCol(spider.Col)
|
||||
defer s.Close()
|
||||
|
||||
if !spider.IsDedup {
|
||||
if err := c.DropIndex(spider.DedupField); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := c.EnsureIndex(mgo.Index{
|
||||
Key: []string{spider.DedupField},
|
||||
Unique: true,
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func InitDemoSpiders() {
|
||||
// 添加Demo爬虫
|
||||
templateSpidersDir := "./template/spiders"
|
||||
|
||||
@@ -49,7 +49,9 @@ func (s *SpiderSync) CheckIsScrapy() {
|
||||
return
|
||||
}
|
||||
s.Spider.IsScrapy = utils.Exists(path.Join(s.Spider.Src, "scrapy.cfg"))
|
||||
s.Spider.Cmd = "scrapy crawl"
|
||||
if s.Spider.IsScrapy {
|
||||
s.Spider.Cmd = "scrapy crawl"
|
||||
}
|
||||
if err := s.Spider.Save(); err != nil {
|
||||
log.Errorf(err.Error())
|
||||
debug.PrintStack()
|
||||
|
||||
@@ -107,7 +107,7 @@ func AssignTask(task model.Task) error {
|
||||
}
|
||||
|
||||
// 设置环境变量
|
||||
func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exec.Cmd {
|
||||
func SetEnv(cmd *exec.Cmd, envs []model.Env, task model.Task, spider model.Spider) *exec.Cmd {
|
||||
// 默认把Node.js的全局node_modules加入环境变量
|
||||
envPath := os.Getenv("PATH")
|
||||
homePath := os.Getenv("HOME")
|
||||
@@ -117,8 +117,8 @@ func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exe
|
||||
_ = os.Setenv("NODE_PATH", nodePath)
|
||||
|
||||
// 默认环境变量
|
||||
cmd.Env = append(os.Environ(), "CRAWLAB_TASK_ID="+taskId)
|
||||
cmd.Env = append(cmd.Env, "CRAWLAB_COLLECTION="+dataCol)
|
||||
cmd.Env = append(os.Environ(), "CRAWLAB_TASK_ID="+task.Id)
|
||||
cmd.Env = append(cmd.Env, "CRAWLAB_COLLECTION="+spider.Col)
|
||||
cmd.Env = append(cmd.Env, "CRAWLAB_MONGO_HOST="+viper.GetString("mongo.host"))
|
||||
cmd.Env = append(cmd.Env, "CRAWLAB_MONGO_PORT="+viper.GetString("mongo.port"))
|
||||
if viper.GetString("mongo.db") != "" {
|
||||
@@ -136,6 +136,13 @@ func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exe
|
||||
cmd.Env = append(cmd.Env, "PYTHONUNBUFFERED=0")
|
||||
cmd.Env = append(cmd.Env, "PYTHONIOENCODING=utf-8")
|
||||
cmd.Env = append(cmd.Env, "TZ=Asia/Shanghai")
|
||||
cmd.Env = append(cmd.Env, "CRAWLAB_DEDUP_FIELD="+spider.DedupField)
|
||||
cmd.Env = append(cmd.Env, "CRAWLAB_DEDUP_METHOD="+spider.DedupMethod)
|
||||
if spider.IsDedup {
|
||||
cmd.Env = append(cmd.Env, "CRAWLAB_IS_DEDUP=1")
|
||||
} else {
|
||||
cmd.Env = append(cmd.Env, "CRAWLAB_IS_DEDUP=0")
|
||||
}
|
||||
|
||||
//任务环境变量
|
||||
for _, env := range envs {
|
||||
@@ -270,7 +277,7 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider) (e
|
||||
envs = append(envs, model.Env{Name: "CRAWLAB_SETTING_" + envName, Value: envValue})
|
||||
}
|
||||
}
|
||||
cmd = SetEnv(cmd, envs, t.Id, s.Col)
|
||||
cmd = SetEnv(cmd, envs, t, s)
|
||||
|
||||
// 起一个goroutine来监控进程
|
||||
ch := utils.TaskExecChanMap.ChanBlocked(t.Id)
|
||||
|
||||
@@ -45,7 +45,7 @@
|
||||
/>
|
||||
</el-form-item>
|
||||
</template>
|
||||
<el-form-item :label="$t('Results Collection')" prop="col">
|
||||
<el-form-item :label="$t('Results Collection')" prop="col" required>
|
||||
<el-input
|
||||
v-model="spiderForm.col"
|
||||
:placeholder="$t('Results Collection')"
|
||||
@@ -96,6 +96,32 @@
|
||||
</el-form-item>
|
||||
</el-col>
|
||||
</el-row>
|
||||
<el-form-item v-if="!isView" :label="$t('Is De-Duplicated')" prop="dedup_field" :rules="dedupRules">
|
||||
<div style="display: flex; align-items: center; height: 40px">
|
||||
<el-switch
|
||||
v-model="spiderForm.is_dedup"
|
||||
active-color="#13ce66"
|
||||
:disabled="isView || isPublic"
|
||||
@change="onIsDedupChange"
|
||||
/>
|
||||
<el-select
|
||||
v-if="spiderForm.is_dedup"
|
||||
v-model="spiderForm.dedup_method"
|
||||
active-color="#13ce66"
|
||||
:disabled="isView || isPublic"
|
||||
style="margin-left: 20px; width: 180px"
|
||||
>
|
||||
<el-option value="overwrite" :label="$t('Overwrite')"/>
|
||||
<el-option value="ignore" :label="$t('Ignore')"/>
|
||||
</el-select>
|
||||
<el-input
|
||||
v-if="spiderForm.is_dedup"
|
||||
v-model="spiderForm.dedup_field"
|
||||
:placeholder="$t('Please enter de-duplicated field')"
|
||||
style="margin-left: 20px"
|
||||
/>
|
||||
</div>
|
||||
</el-form-item>
|
||||
<el-row>
|
||||
<el-col :span="6">
|
||||
<el-form-item v-if="!isView" :label="$t('Is Public')" prop="is_public">
|
||||
@@ -167,6 +193,17 @@ export default {
|
||||
}
|
||||
callback()
|
||||
}
|
||||
const dedupValidator = (rule, value, callback) => {
|
||||
if (!this.spiderForm.is_dedup) {
|
||||
return callback()
|
||||
} else {
|
||||
if (value) {
|
||||
return callback()
|
||||
} else {
|
||||
return callback(new Error('dedup field cannot be empty'))
|
||||
}
|
||||
}
|
||||
}
|
||||
return {
|
||||
uploadLoading: false,
|
||||
fileList: [],
|
||||
@@ -176,6 +213,9 @@ export default {
|
||||
],
|
||||
cronRules: [
|
||||
{ validator: cronValidator, trigger: 'blur' }
|
||||
],
|
||||
dedupRules: [
|
||||
{ validator: dedupValidator, trigger: 'blur' }
|
||||
]
|
||||
}
|
||||
},
|
||||
@@ -250,6 +290,11 @@ export default {
|
||||
if (value) {
|
||||
this.spiderForm.cmd = 'scrapy crawl'
|
||||
}
|
||||
},
|
||||
onIsDedupChange (value) {
|
||||
if (value && !this.spiderForm.dedup_method) {
|
||||
this.spiderForm.dedup_method = 'overwrite'
|
||||
}
|
||||
}
|
||||
},
|
||||
async created () {
|
||||
|
||||
@@ -228,6 +228,10 @@ export default {
|
||||
'Is Public': '是否公共',
|
||||
'Owner': '所有者',
|
||||
'Convert to Customized': '转化为自定义',
|
||||
'Is De-Duplicated': '是否去重',
|
||||
'Please enter de-duplicated field': '请输入去重字段',
|
||||
'Overwrite': '覆盖',
|
||||
'Ignore': '忽略',
|
||||
|
||||
// 爬虫列表
|
||||
'Name': '名称',
|
||||
|
||||
Reference in New Issue
Block a user