支持去重

This commit is contained in:
marvzhang
2020-03-29 11:17:38 +08:00
parent af6c741538
commit ea3a17f4bb
7 changed files with 104 additions and 7 deletions

View File

@@ -59,12 +59,17 @@ type Spider struct {
// 长任务
IsLongTask bool `json:"is_long_task" bson:"is_long_task"` // 是否为长任务
// 去重
IsDedup bool `json:"is_dedup" bson:"is_dedup"` // 是否去重
DedupField string `json:"dedup_field" bson:"dedup_field"` // 去重字段
DedupMethod string `json:"dedup_method" bson:"dedup_method"` // 去重方式
// 前端展示
LastRunTs time.Time `json:"last_run_ts"` // 最后一次执行时间
LastStatus string `json:"last_status"` // 最后执行状态
Config entity.ConfigSpiderData `json:"config"` // 可配置爬虫配置
LatestTasks []Task `json:"latest_tasks"` // 最近任务列表
Username string `json:"username""`
Username string `json:"username"` // 用户名称
// 时间
UserId bson.ObjectId `json:"user_id" bson:"user_id"`

View File

@@ -158,6 +158,19 @@ func PostSpider(c *gin.Context) {
return
}
// 获取爬虫
spider, err := model.GetSpider(bson.ObjectIdHex(id))
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// 去重处理
if err := services.UpdateSpiderDedup(spider); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",

View File

@@ -433,6 +433,27 @@ func CopySpider(spider model.Spider, newName string) error {
return nil
}
// UpdateSpiderDedup brings the unique index on the spider's results collection
// in line with the spider's de-duplication settings.
//
// When spider.IsDedup is true, a unique index on spider.DedupField is ensured.
// When it is false, the index on spider.DedupField is dropped if one exists;
// a missing index, a missing collection, or an unset field is not an error,
// so that saving a spider which never used de-duplication succeeds.
//
// Any other error reported by the database is returned unchanged.
func UpdateSpiderDedup(spider model.Spider) error {
	// MongoDB server error codes that mean "there was nothing to drop".
	const (
		codeNamespaceNotFound = 26
		codeIndexNotFound     = 27
	)

	s, c := database.GetCol(spider.Col)
	defer s.Close()

	if spider.IsDedup {
		return c.EnsureIndex(mgo.Index{
			Key:    []string{spider.DedupField},
			Unique: true,
		})
	}

	// No field was ever configured, so there is no index to remove. Asking the
	// driver to drop an index with an empty key would fail as an invalid key.
	if spider.DedupField == "" {
		return nil
	}

	if err := c.DropIndex(spider.DedupField); err != nil {
		// Dropping is meant to be idempotent: if the server says the index or
		// the collection does not exist, the desired end state already holds.
		// NOTE(review): relies on the server including an error code in the
		// command reply; servers that omit it keep the previous behavior.
		if qErr, ok := err.(*mgo.QueryError); ok &&
			(qErr.Code == codeIndexNotFound || qErr.Code == codeNamespaceNotFound) {
			return nil
		}
		return err
	}
	return nil
}
func InitDemoSpiders() {
// 添加Demo爬虫
templateSpidersDir := "./template/spiders"

View File

@@ -49,7 +49,9 @@ func (s *SpiderSync) CheckIsScrapy() {
return
}
s.Spider.IsScrapy = utils.Exists(path.Join(s.Spider.Src, "scrapy.cfg"))
s.Spider.Cmd = "scrapy crawl"
if s.Spider.IsScrapy {
s.Spider.Cmd = "scrapy crawl"
}
if err := s.Spider.Save(); err != nil {
log.Errorf(err.Error())
debug.PrintStack()

View File

@@ -107,7 +107,7 @@ func AssignTask(task model.Task) error {
}
// 设置环境变量
func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exec.Cmd {
func SetEnv(cmd *exec.Cmd, envs []model.Env, task model.Task, spider model.Spider) *exec.Cmd {
// 默认把Node.js的全局node_modules加入环境变量
envPath := os.Getenv("PATH")
homePath := os.Getenv("HOME")
@@ -117,8 +117,8 @@ func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exe
_ = os.Setenv("NODE_PATH", nodePath)
// 默认环境变量
cmd.Env = append(os.Environ(), "CRAWLAB_TASK_ID="+taskId)
cmd.Env = append(cmd.Env, "CRAWLAB_COLLECTION="+dataCol)
cmd.Env = append(os.Environ(), "CRAWLAB_TASK_ID="+task.Id)
cmd.Env = append(cmd.Env, "CRAWLAB_COLLECTION="+spider.Col)
cmd.Env = append(cmd.Env, "CRAWLAB_MONGO_HOST="+viper.GetString("mongo.host"))
cmd.Env = append(cmd.Env, "CRAWLAB_MONGO_PORT="+viper.GetString("mongo.port"))
if viper.GetString("mongo.db") != "" {
@@ -136,6 +136,13 @@ func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exe
cmd.Env = append(cmd.Env, "PYTHONUNBUFFERED=0")
cmd.Env = append(cmd.Env, "PYTHONIOENCODING=utf-8")
cmd.Env = append(cmd.Env, "TZ=Asia/Shanghai")
cmd.Env = append(cmd.Env, "CRAWLAB_DEDUP_FIELD="+spider.DedupField)
cmd.Env = append(cmd.Env, "CRAWLAB_DEDUP_METHOD="+spider.DedupMethod)
if spider.IsDedup {
cmd.Env = append(cmd.Env, "CRAWLAB_IS_DEDUP=1")
} else {
cmd.Env = append(cmd.Env, "CRAWLAB_IS_DEDUP=0")
}
//任务环境变量
for _, env := range envs {
@@ -270,7 +277,7 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider) (e
envs = append(envs, model.Env{Name: "CRAWLAB_SETTING_" + envName, Value: envValue})
}
}
cmd = SetEnv(cmd, envs, t.Id, s.Col)
cmd = SetEnv(cmd, envs, t, s)
// 起一个goroutine来监控进程
ch := utils.TaskExecChanMap.ChanBlocked(t.Id)

View File

@@ -45,7 +45,7 @@
/>
</el-form-item>
</template>
<el-form-item :label="$t('Results Collection')" prop="col">
<el-form-item :label="$t('Results Collection')" prop="col" required>
<el-input
v-model="spiderForm.col"
:placeholder="$t('Results Collection')"
@@ -96,6 +96,32 @@
</el-form-item>
</el-col>
</el-row>
<el-form-item v-if="!isView" :label="$t('Is De-Duplicated')" prop="dedup_field" :rules="dedupRules">
<div style="display: flex; align-items: center; height: 40px">
<el-switch
v-model="spiderForm.is_dedup"
active-color="#13ce66"
:disabled="isView || isPublic"
@change="onIsDedupChange"
/>
<el-select
v-if="spiderForm.is_dedup"
v-model="spiderForm.dedup_method"
active-color="#13ce66"
:disabled="isView || isPublic"
style="margin-left: 20px; width: 180px"
>
<el-option value="overwrite" :label="$t('Overwrite')"/>
<el-option value="ignore" :label="$t('Ignore')"/>
</el-select>
<el-input
v-if="spiderForm.is_dedup"
v-model="spiderForm.dedup_field"
:placeholder="$t('Please enter de-duplicated field')"
style="margin-left: 20px"
/>
</div>
</el-form-item>
<el-row>
<el-col :span="6">
<el-form-item v-if="!isView" :label="$t('Is Public')" prop="is_public">
@@ -167,6 +193,17 @@ export default {
}
callback()
}
// Form validator for the de-duplication field.
// Passes when de-duplication is switched off, or when it is on and a
// non-empty field value has been entered; otherwise reports an error.
const dedupValidator = (rule, value, callback) => {
  // Nothing to check while de-duplication is disabled.
  if (!this.spiderForm.is_dedup) return callback()

  // De-duplication is enabled: the field name is mandatory.
  return value
    ? callback()
    : callback(new Error('dedup field cannot be empty'))
}
return {
uploadLoading: false,
fileList: [],
@@ -176,6 +213,9 @@ export default {
],
cronRules: [
{ validator: cronValidator, trigger: 'blur' }
],
dedupRules: [
{ validator: dedupValidator, trigger: 'blur' }
]
}
},
@@ -250,6 +290,11 @@ export default {
if (value) {
this.spiderForm.cmd = 'scrapy crawl'
}
},
onIsDedupChange (value) {
if (value && !this.spiderForm.dedup_method) {
this.spiderForm.dedup_method = 'overwrite'
}
}
},
async created () {

View File

@@ -228,6 +228,10 @@ export default {
'Is Public': '是否公共',
'Owner': '所有者',
'Convert to Customized': '转化为自定义',
'Is De-Duplicated': '是否去重',
'Please enter de-duplicated field': '请输入去重字段',
'Overwrite': '覆盖',
'Ignore': '忽略',
// 爬虫列表
'Name': '名称',