Merge pull request #588 from crawlab-team/develop

Develop
This commit is contained in:
Marvin Zhang
2020-02-24 09:18:48 +08:00
committed by GitHub
12 changed files with 463 additions and 157 deletions

View File

@@ -11,9 +11,6 @@ on:
tags:
- v*
# Run tests for any PRs.
pull_request:
env:
IMAGE_NAME: tikazyq/crawlab
@@ -54,6 +51,12 @@ jobs:
- name: Deploy
run: |
# Strip git ref prefix from version
VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,')
# Strip "v" prefix from tag name
[[ "${{ github.ref }}" == "refs/tags/"* ]] && VERSION=$(echo $VERSION | sed -e 's/^v//')
# FIX: the original `[ $VERSION == "release"]` is a runtime error — the `]`
# must be a separate word; also quote $VERSION so an empty value cannot
# break the test.
if [ "$VERSION" == "release" ]; then
apt-get install -y curl
curl ${{ secrets.JENKINS_RELEASE_URL }}

View File

@@ -5,6 +5,8 @@
- **长任务支持**. 用户可以添加长任务爬虫,这些爬虫可以跑长期运行的任务. [#425](https://github.com/crawlab-team/crawlab/issues/425)
- **爬虫列表优化**. 分状态任务列数统计,任务列表详情弹出框,图例. [#425](https://github.com/crawlab-team/crawlab/issues/425)
- **版本升级检测**. 检测最新版本通知用户升级.
- **批量操作爬虫**. 允许用户批量运行/停止爬虫任务以及批量删除爬虫.
- **复制爬虫**. 允许用户复制已存在爬虫来创建新爬虫.
### Bug 修复

View File

@@ -5,6 +5,8 @@
- **Long Task Support**. Users can add long-task spiders which is supposed to run without finishing. [#425](https://github.com/crawlab-team/crawlab/issues/425)
- **Spider List Optimization**. Tasks count by status, tasks detail popup, legend. [#425](https://github.com/crawlab-team/crawlab/issues/425)
- **Upgrade Check**. Check latest version and notify users to upgrade.
- **Spiders Batch Operation**. Allow users to run/stop spider tasks and delete spiders in batches.
- **Copy Spiders**. Allow users to copy an existing spider to create a new one.
### Bug Fixes

View File

@@ -164,6 +164,7 @@ func main() {
authGroup.POST("/spiders/:id/upload", routes.UploadSpiderFromId) // 上传爬虫ID
authGroup.DELETE("/spiders", routes.DeleteSelectedSpider) // 删除选择的爬虫
authGroup.DELETE("/spiders/:id", routes.DeleteSpider) // 删除爬虫
authGroup.POST("/spiders/:id/copy", routes.CopySpider) // 拷贝爬虫
authGroup.GET("/spiders/:id/tasks", routes.GetSpiderTasks) // 爬虫任务列表
authGroup.GET("/spiders/:id/file/tree", routes.GetSpiderFileTree) // 爬虫文件目录树读取
authGroup.GET("/spiders/:id/file", routes.GetSpiderFile) // 爬虫文件读取

View File

@@ -26,6 +26,8 @@ import (
"time"
)
// ======== 爬虫管理 ========
func GetSpiderList(c *gin.Context) {
pageNum, _ := c.GetQuery("page_num")
pageSize, _ := c.GetQuery("page_size")
@@ -240,6 +242,50 @@ func PutSpider(c *gin.Context) {
})
}
// CopySpider duplicates an existing spider under a new name.
// Route: POST /spiders/:id/copy, body: {"name": "<new spider name>"}.
// Responds 400 when the id is not a valid ObjectId hex, the body cannot
// be bound, or the requested name is already taken; 500 on lookup/copy
// failure; otherwise a plain ok response.
func CopySpider(c *gin.Context) {
	type ReqBody struct {
		Name string `json:"name"`
	}

	id := c.Param("id")
	if !bson.IsObjectIdHex(id) {
		HandleErrorF(http.StatusBadRequest, c, "invalid id")
		// BUG FIX: without this return, execution continued and
		// bson.ObjectIdHex(id) below panicked on an invalid id.
		return
	}

	var reqBody ReqBody
	if err := c.ShouldBindJSON(&reqBody); err != nil {
		HandleError(http.StatusBadRequest, c, err)
		return
	}

	// Reject the copy when a spider with the requested name already exists.
	s := model.GetSpiderByName(reqBody.Name)
	if s.Name != "" {
		HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider name '%s' already exists", reqBody.Name))
		return
	}

	// Load the spider being copied.
	spider, err := model.GetSpider(bson.ObjectIdHex(id))
	if err != nil {
		HandleError(http.StatusInternalServerError, c, err)
		return
	}

	// Copy the spider (clones its GridFS archive and saves the new record).
	if err := services.CopySpider(spider, reqBody.Name); err != nil {
		HandleError(http.StatusInternalServerError, c, err)
		return
	}

	c.JSON(http.StatusOK, Response{
		Status:  "ok",
		Message: "success",
	})
}
func UploadSpider(c *gin.Context) {
// 从body中获取文件
uploadFile, err := c.FormFile("file")
@@ -647,7 +693,151 @@ func GetSpiderTasks(c *gin.Context) {
})
}
// 爬虫文件管理
// GetSpiderStats returns aggregate task statistics for a spider over the
// last 30 days: totals (task/success/result counts, wait/runtime durations),
// the derived success rate and averages, plus daily task stats.
// Route: GET /spiders/:id/stats. Responds 500 on lookup or aggregation
// failure; an empty window yields a zeroed overview and an empty daily list.
func GetSpiderStats(c *gin.Context) {
	type Overview struct {
		TaskCount            int     `json:"task_count" bson:"task_count"`
		ResultCount          int     `json:"result_count" bson:"result_count"`
		SuccessCount         int     `json:"success_count" bson:"success_count"`
		SuccessRate          float64 `json:"success_rate"`
		TotalWaitDuration    float64 `json:"wait_duration" bson:"wait_duration"`
		TotalRuntimeDuration float64 `json:"runtime_duration" bson:"runtime_duration"`
		AvgWaitDuration      float64 `json:"avg_wait_duration"`
		AvgRuntimeDuration   float64 `json:"avg_runtime_duration"`
	}

	type Data struct {
		Overview Overview              `json:"overview"`
		Daily    []model.TaskDailyItem `json:"daily"`
	}

	id := c.Param("id")

	spider, err := model.GetSpider(bson.ObjectIdHex(id))
	if err != nil {
		log.Errorf(err.Error())
		HandleError(http.StatusInternalServerError, c, err)
		return
	}

	s, col := database.GetCol("tasks")
	defer s.Close()

	// Aggregation window: the last 30 days.
	startDate := time.Now().Add(-time.Hour * 24 * 30)
	endDate := time.Now()

	// match: tasks belonging to this spider created inside the window.
	op1 := bson.M{
		"$match": bson.M{
			"spider_id": spider.Id,
			"create_ts": bson.M{
				"$gte": startDate,
				"$lt":  endDate,
			},
		},
	}

	// project: map each task to a 0/1 success flag plus its counters.
	op2 := bson.M{
		"$project": bson.M{
			"success_count": bson.M{
				"$cond": []interface{}{
					bson.M{
						"$eq": []string{
							"$status",
							constants.StatusFinished,
						},
					},
					1,
					0,
				},
			},
			"result_count":     "$result_count",
			"wait_duration":    "$wait_duration",
			"runtime_duration": "$runtime_duration",
		},
	}

	// group: sum everything into a single overview document.
	op3 := bson.M{
		"$group": bson.M{
			"_id":              nil,
			"task_count":       bson.M{"$sum": 1},
			"success_count":    bson.M{"$sum": "$success_count"},
			"result_count":     bson.M{"$sum": "$result_count"},
			"wait_duration":    bson.M{"$sum": "$wait_duration"},
			"runtime_duration": bson.M{"$sum": "$runtime_duration"},
		},
	}

	// Run the aggregation pipeline.
	var overview Overview
	if err := col.Pipe([]bson.M{op1, op2, op3}).One(&overview); err != nil {
		if err == mgo.ErrNotFound {
			// No tasks in the window: zeroed overview, empty daily list.
			c.JSON(http.StatusOK, Response{
				Status:  "ok",
				Message: "success",
				Data: Data{
					Overview: overview,
					Daily:    []model.TaskDailyItem{},
				},
			})
			return
		}
		log.Errorf(err.Error())
		HandleError(http.StatusInternalServerError, c, err)
		return
	}

	// Derived metrics. IMPROVED: direct float64 conversion replaces the
	// strconv.Itoa/ParseFloat round-trip, and the division is guarded so a
	// zero task count can never produce NaN in the JSON payload.
	taskCount := float64(overview.TaskCount)
	if taskCount > 0 {
		overview.SuccessRate = float64(overview.SuccessCount) / taskCount
		overview.AvgWaitDuration = overview.TotalWaitDuration / taskCount
		overview.AvgRuntimeDuration = overview.TotalRuntimeDuration / taskCount
	}

	items, err := model.GetDailyTaskStats(bson.M{"spider_id": spider.Id})
	if err != nil {
		log.Errorf(err.Error())
		HandleError(http.StatusInternalServerError, c, err)
		return
	}

	c.JSON(http.StatusOK, Response{
		Status:  "ok",
		Message: "success",
		Data: Data{
			Overview: overview,
			Daily:    items,
		},
	})
}
// GetSpiderSchedules responds with every cron schedule bound to the given
// spider. Route: GET /spiders/:id/schedules. Responds 400 for a malformed
// id and 500 when the schedule lookup fails.
func GetSpiderSchedules(c *gin.Context) {
	spiderId := c.Param("id")
	if !bson.IsObjectIdHex(spiderId) {
		HandleErrorF(http.StatusBadRequest, c, "spider_id is invalid")
		return
	}

	// Look up schedules keyed by this spider's id.
	schedules, err := model.GetScheduleList(bson.M{"spider_id": bson.ObjectIdHex(spiderId)})
	if err != nil {
		HandleError(http.StatusInternalServerError, c, err)
		return
	}

	c.JSON(http.StatusOK, Response{
		Status:  "ok",
		Message: "success",
		Data:    schedules,
	})
}
// ======== ./爬虫管理 ========
// ======== 爬虫文件管理 ========
func GetSpiderDir(c *gin.Context) {
// 爬虫ID
@@ -946,147 +1136,9 @@ func RenameSpiderFile(c *gin.Context) {
})
}
func GetSpiderStats(c *gin.Context) {
type Overview struct {
TaskCount int `json:"task_count" bson:"task_count"`
ResultCount int `json:"result_count" bson:"result_count"`
SuccessCount int `json:"success_count" bson:"success_count"`
SuccessRate float64 `json:"success_rate"`
TotalWaitDuration float64 `json:"wait_duration" bson:"wait_duration"`
TotalRuntimeDuration float64 `json:"runtime_duration" bson:"runtime_duration"`
AvgWaitDuration float64 `json:"avg_wait_duration"`
AvgRuntimeDuration float64 `json:"avg_runtime_duration"`
}
// ======== 爬虫文件管理 ========
type Data struct {
Overview Overview `json:"overview"`
Daily []model.TaskDailyItem `json:"daily"`
}
id := c.Param("id")
spider, err := model.GetSpider(bson.ObjectIdHex(id))
if err != nil {
log.Errorf(err.Error())
HandleError(http.StatusInternalServerError, c, err)
return
}
s, col := database.GetCol("tasks")
defer s.Close()
// 起始日期
startDate := time.Now().Add(-time.Hour * 24 * 30)
endDate := time.Now()
// match
op1 := bson.M{
"$match": bson.M{
"spider_id": spider.Id,
"create_ts": bson.M{
"$gte": startDate,
"$lt": endDate,
},
},
}
// project
op2 := bson.M{
"$project": bson.M{
"success_count": bson.M{
"$cond": []interface{}{
bson.M{
"$eq": []string{
"$status",
constants.StatusFinished,
},
},
1,
0,
},
},
"result_count": "$result_count",
"wait_duration": "$wait_duration",
"runtime_duration": "$runtime_duration",
},
}
// group
op3 := bson.M{
"$group": bson.M{
"_id": nil,
"task_count": bson.M{"$sum": 1},
"success_count": bson.M{"$sum": "$success_count"},
"result_count": bson.M{"$sum": "$result_count"},
"wait_duration": bson.M{"$sum": "$wait_duration"},
"runtime_duration": bson.M{"$sum": "$runtime_duration"},
},
}
// run aggregation pipeline
var overview Overview
if err := col.Pipe([]bson.M{op1, op2, op3}).One(&overview); err != nil {
if err == mgo.ErrNotFound {
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
Data: Data{
Overview: overview,
Daily: []model.TaskDailyItem{},
},
})
return
}
log.Errorf(err.Error())
HandleError(http.StatusInternalServerError, c, err)
return
}
// 后续处理
successCount, _ := strconv.ParseFloat(strconv.Itoa(overview.SuccessCount), 64)
taskCount, _ := strconv.ParseFloat(strconv.Itoa(overview.TaskCount), 64)
overview.SuccessRate = successCount / taskCount
overview.AvgWaitDuration = overview.TotalWaitDuration / taskCount
overview.AvgRuntimeDuration = overview.TotalRuntimeDuration / taskCount
items, err := model.GetDailyTaskStats(bson.M{"spider_id": spider.Id})
if err != nil {
log.Errorf(err.Error())
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
Data: Data{
Overview: overview,
Daily: items,
},
})
}
func GetSpiderSchedules(c *gin.Context) {
id := c.Param("id")
if !bson.IsObjectIdHex(id) {
HandleErrorF(http.StatusBadRequest, c, "spider_id is invalid")
return
}
// 获取定时任务
list, err := model.GetScheduleList(bson.M{"spider_id": bson.ObjectIdHex(id)})
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
Data: list,
})
}
// ======== Scrapy 部分 ========
func GetSpiderScrapySpiders(c *gin.Context) {
id := c.Param("id")
@@ -1328,6 +1380,10 @@ func GetSpiderScrapySpiderFilepath(c *gin.Context) {
})
}
// ======== ./Scrapy 部分 ========
// ======== Git 部分 ========
func PostSpiderSyncGit(c *gin.Context) {
id := c.Param("id")
@@ -1377,3 +1433,5 @@ func PostSpiderResetGit(c *gin.Context) {
Message: "success",
})
}
// ======== ./Git 部分 ========

View File

@@ -15,11 +15,13 @@ import (
"github.com/satori/go.uuid"
"github.com/spf13/viper"
"gopkg.in/yaml.v2"
"io"
"io/ioutil"
"os"
"path"
"path/filepath"
"runtime/debug"
"time"
)
type SpiderFileData struct {
@@ -293,6 +295,123 @@ func CancelSpider(id string) error {
return nil
}
// cloneGridFsFile copies the GridFS zip archive of an existing spider and
// creates a new spider record named newName that points at the copy.
// Flow: open the source GridFS file -> copy it to a temp zip on disk ->
// read it back -> write it into a new GridFS file -> save the new spider
// -> publish it to worker nodes.
// NOTE(review): error paths after gf.Create do not clean up the
// partially-created GridFS file or the temp file — confirm that leak is
// acceptable, or add deferred cleanup.
func cloneGridFsFile(spider model.Spider, newName string) (err error) {
	// Build the new spider as a copy of the source with a fresh id, name,
	// source directory and timestamps.
	newSpider := spider
	newSpider.Id = bson.NewObjectId()
	newSpider.Name = newName
	newSpider.DisplayName = newName
	newSpider.Src = path.Join(path.Dir(spider.Src), newName)
	newSpider.CreateTs = time.Now()
	newSpider.UpdateTs = time.Now()

	// GridFS session and handle; the session closes when we return.
	s, gf := database.GetGridFs("files")
	defer s.Close()

	// Open the GridFS file of the spider being cloned.
	f, err := gf.OpenId(spider.FileId)
	if err != nil {
		log.Errorf(err.Error())
		debug.PrintStack()
		return err
	}

	// Create the GridFS file for the new spider.
	fNew, err := gf.Create(newSpider.Name + ".zip")
	if err != nil {
		log.Errorf(err.Error())
		debug.PrintStack()
		return err
	}

	// Unique id used for the temp file name; ensure the temp dir exists.
	randomId := uuid.NewV4()
	tmpPath := viper.GetString("other.tmppath")
	if !utils.Exists(tmpPath) {
		if err := os.MkdirAll(tmpPath, 0777); err != nil {
			log.Errorf("mkdir other.tmppath error: %v", err.Error())
			return err
		}
	}

	// Create the temp file.
	tmpFilePath := filepath.Join(tmpPath, randomId.String()+".zip")
	tmpFile := utils.OpenFile(tmpFilePath)

	// Copy the source GridFS file into the temp file.
	if _, err := io.Copy(tmpFile, f); err != nil {
		log.Errorf(err.Error())
		debug.PrintStack()
		return err
	}

	// Close the temp file so its contents are flushed before re-reading.
	if err := tmpFile.Close(); err != nil {
		log.Errorf(err.Error())
		debug.PrintStack()
		return err
	}

	// Read the archive back into memory.
	fContent, err := ioutil.ReadFile(tmpFilePath)
	if err != nil {
		log.Errorf(err.Error())
		debug.PrintStack()
		return err
	}

	// Write the archive into the new GridFS file.
	if _, err := fNew.Write(fContent); err != nil {
		log.Errorf(err.Error())
		debug.PrintStack()
		return err
	}

	// Close the source GridFS file.
	if err = f.Close(); err != nil {
		log.Errorf(err.Error())
		debug.PrintStack()
		return err
	}

	// Point the new spider at the freshly written GridFS file.
	newSpider.FileId = fNew.Id().(bson.ObjectId)

	// Save the new spider record.
	if err := newSpider.Add(); err != nil {
		return err
	}

	// Close the new GridFS file (finalizes the GridFS write).
	if err := fNew.Close(); err != nil {
		log.Errorf(err.Error())
		debug.PrintStack()
		return err
	}

	// Remove the temp file.
	if err := os.RemoveAll(tmpFilePath); err != nil {
		log.Errorf(err.Error())
		debug.PrintStack()
		return err
	}

	// Publish the new spider so worker nodes sync its files.
	PublishSpider(newSpider)

	return nil
}
// CopySpider creates a copy of the given spider under newName by cloning
// its GridFS archive and registering the new spider record.
func CopySpider(spider model.Spider, newName string) error {
	return cloneGridFsFile(spider, newName)
}
// 启动爬虫服务
func InitSpiderService() error {
// 构造定时任务执行器

View File

@@ -536,7 +536,7 @@ func SpiderFileCheck(t model.Task, spider model.Spider) error {
// 判断爬虫文件是否存在
gfFile := model.GetGridFs(spider.FileId)
if gfFile == nil {
t.Error = "找不到爬虫文件,请重新上传"
t.Error = "cannot find spider files, please re-upload"
t.Status = constants.StatusError
t.FinishTs = time.Now() // 结束时间
t.RuntimeDuration = t.FinishTs.Sub(t.StartTs).Seconds() // 运行时长

View File

@@ -1,29 +1,33 @@
package utils
import (
"sync"
)
var TaskExecChanMap = NewChanMap()
type ChanMap struct {
m map[string]chan string
m sync.Map
}
func NewChanMap() *ChanMap {
return &ChanMap{m: make(map[string]chan string)}
return &ChanMap{m: sync.Map{}}
}
func (cm *ChanMap) Chan(key string) chan string {
if ch, ok := cm.m[key]; ok {
return ch
if ch, ok := cm.m.Load(key); ok {
return ch.(interface{}).(chan string)
}
ch := make(chan string, 10)
cm.m[key] = ch
cm.m.Store(key, ch)
return ch
}
func (cm *ChanMap) ChanBlocked(key string) chan string {
if ch, ok := cm.m[key]; ok {
return ch
if ch, ok := cm.m.Load(key); ok {
return ch.(interface{}).(chan string)
}
ch := make(chan string)
cm.m[key] = ch
cm.m.Store(key, ch)
return ch
}

View File

@@ -78,8 +78,6 @@
<span style="margin-left: 5px">跳转到任务详情页</span>
</div>
</el-form-item>
<el-form-item>
</el-form-item>
</el-form>
<template slot="footer">
<el-button type="plain" size="small" @click="$emit('close')">{{$t('Cancel')}}</el-button>

View File

@@ -0,0 +1,85 @@
<template>
  <!-- Dialog for copying an existing spider under a new name.
       FIX: removed the duplicate ref="form" from el-dialog — it clashed
       with the el-form ref of the same name, so $refs['form'].validate
       could resolve to the dialog component instead of the form. -->
  <el-dialog
    class="copy-spider-dialog"
    :title="$t('Copy Spider')"
    :visible="visible"
    width="580px"
    :before-close="onClose"
  >
    <el-form
      label-width="160px"
      :model="form"
      ref="form"
    >
      <el-form-item
        :label="$t('New Spider Name')"
        required
      >
        <el-input v-model="form.name" :placeholder="$t('New Spider Name')"/>
      </el-form-item>
    </el-form>
    <template slot="footer">
      <el-button type="plain" size="small" @click="$emit('close')">{{$t('Cancel')}}</el-button>
      <el-button
        type="primary"
        size="small"
        :icon="isLoading ? 'el-icon-loading' : ''"
        :disabled="isLoading"
        @click="onConfirm"
      >
        {{$t('Confirm')}}
      </el-button>
    </template>
  </el-dialog>
</template>

<script>
export default {
  name: 'CopySpiderDialog',
  props: {
    // Id of the spider to copy.
    spiderId: {
      type: String,
      default: ''
    },
    // Controls dialog visibility (managed by the parent).
    visible: {
      type: Boolean,
      default: false
    }
  },
  data () {
    return {
      form: {
        name: ''
      },
      isLoading: false
    }
  },
  methods: {
    onClose () {
      this.$emit('close')
    },
    // Validates the form, POSTs the copy request, then notifies the parent.
    onConfirm () {
      this.$refs['form'].validate(async valid => {
        if (!valid) return
        try {
          this.isLoading = true
          const res = await this.$request.post(`/spiders/${this.spiderId}/copy`, this.form)
          if (!res.data.error) {
            // FIX: use the i18n key (added in this PR) instead of a
            // hard-coded English message.
            this.$message.success(this.$t('Copied successfully'))
          }
          this.$emit('confirm')
          this.$emit('close')
          this.$st.sendEv('爬虫复制', '确认提交')
        } finally {
          this.isLoading = false
        }
      })
    }
  }
}
</script>

<style scoped>
</style>

View File

@@ -220,6 +220,8 @@ export default {
'Item Name': 'Item 名称',
'Add Item': '添加 Item',
'Add Variable': '添加变量',
'Copy Spider': '复制爬虫',
'New Spider Name': '新爬虫名称',
// 爬虫列表
'Name': '名称',
@@ -564,6 +566,7 @@ docker run -d --restart always --name crawlab_worker \\
'Are you sure to delete selected items?': '您是否确认删除所选项',
'Are you sure to stop selected items?': '您是否确认停止所选项',
'Sent signals to cancel selected tasks': '已经向所选任务发送取消任务信号',
'Copied successfully': '已成功复制',
// 其他
'Star crawlab-team/crawlab on GitHub': ' GitHub 上为 Crawlab 加星吧'

View File

@@ -290,6 +290,15 @@
/>
<!--./crawl confirm dialog-->
<!--copy dialog-->
<copy-spider-dialog
:visible="copyDialogVisible"
:spider-id="activeSpiderId"
@close="copyDialogVisible = false"
@confirm="onCopyConfirm"
/>
<!--./copy dialog-->
<el-card style="border-radius: 0">
<!--filter-->
<div class="filter">
@@ -566,7 +575,7 @@
>
</el-table-column>
</template>
<el-table-column :label="$t('Action')" align="left" fixed="right" min-width="170px">
<el-table-column :label="$t('Action')" align="left" fixed="right" min-width="220px">
<template slot-scope="scope">
<el-tooltip :content="$t('View')" placement="top">
<el-button type="primary" icon="el-icon-search" size="mini"
@@ -576,6 +585,14 @@
<el-button type="danger" icon="el-icon-delete" size="mini"
@click="onRemove(scope.row, $event)"></el-button>
</el-tooltip>
<el-tooltip :content="$t('Copy')" placement="top">
<el-button
type="info"
icon="el-icon-copy-document"
size="mini"
@click="onCopy(scope.row, $event)"
/>
</el-tooltip>
<el-tooltip v-if="!isShowRun(scope.row)" :content="$t('No command line')" placement="top">
<el-button disabled type="success" icon="fa fa-bug" size="mini"
@click="onCrawl(scope.row, $event)"></el-button>
@@ -619,10 +636,12 @@ import dayjs from 'dayjs'
import CrawlConfirmDialog from '../../components/Common/CrawlConfirmDialog'
import StatusTag from '../../components/Status/StatusTag'
import StatusLegend from '../../components/Status/StatusLegend'
import CopySpiderDialog from '../../components/Spider/CopySpiderDialog'
export default {
name: 'SpiderList',
components: {
CopySpiderDialog,
StatusLegend,
CrawlConfirmDialog,
StatusTag
@@ -784,7 +803,8 @@ export default {
selectedSpiders: [],
isStopLoading: false,
isRemoveLoading: false,
isMultiple: false
isMultiple: false,
copyDialogVisible: false
}
},
computed: {
@@ -966,6 +986,17 @@ export default {
this.getList()
}, 1000)
},
onCopy (row, ev) {
ev.stopPropagation()
this.copyDialogVisible = true
this.activeSpiderId = row._id
this.$st.sendEv('爬虫列表', '点击复制')
},
onCopyConfirm () {
setTimeout(() => {
this.getList()
}, 1000)
},
onView (row, ev) {
ev.stopPropagation()
this.$router.push('/spiders/' + row._id)