Merge pull request #501 from crawlab-team/develop

Develop
Marvin Zhang
2020-02-03 09:25:38 +08:00
committed by GitHub
47 changed files with 1544 additions and 154 deletions

View File

@@ -1,7 +1,10 @@
# 0.4.5 (unknown)
### Features / Enhancements
- **Interactive Tutorial**. Guide users through the main functionalities of Crawlab.
- **Global Environment Variables**. Allow users to set global environment variables, which will be passed into all spider programs.
- **Global Environment Variables**. Allow users to set global environment variables, which will be passed into all spider programs. [#177](https://github.com/crawlab-team/crawlab/issues/177)
- **Project**. Allow users to link spiders to projects. [#316](https://github.com/crawlab-team/crawlab/issues/316)
- **User Admin Optimization**. Restrict privileges of admin users. [#456](https://github.com/crawlab-team/crawlab/issues/456)
- **Setting Page Optimization**.
### Bug Fixes
- **Unable to find spider file error**. [#485](https://github.com/crawlab-team/crawlab/issues/485)

View File

@@ -1,7 +1,10 @@
# 0.4.5 (unknown)
### Features / Enhancement
- **Interactive Tutorial**. Guide users through the main functionalities of Crawlab.
- **Global Environment Variables**. Allow users to set global environment variables, which will be passed into all spider programs.
- **Global Environment Variables**. Allow users to set global environment variables, which will be passed into all spider programs (see the sketch after this list). [#177](https://github.com/crawlab-team/crawlab/issues/177)
- **Project**. Allow users to link spiders to projects. [#316](https://github.com/crawlab-team/crawlab/issues/316)
- **User Admin Optimization**. Restrict privileges of admin users. [#456](https://github.com/crawlab-team/crawlab/issues/456)
- **Setting Page Optimization**.
### Bug Fixes
- **Unable to find spider file error**. [#485](https://github.com/crawlab-team/crawlab/issues/485)
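To make the "Global Environment Variables" entry above concrete, here is a minimal, hypothetical Go sketch of how such variables could be appended to a spider process's environment before it is launched. The `Variable` struct and `runSpider` helper are illustrative only and are not Crawlab's actual implementation.

```go
package main

import (
	"fmt"
	"os"
	"os/exec"
)

// Variable is an illustrative key/value pair standing in for a stored
// global environment variable.
type Variable struct {
	Key   string
	Value string
}

// runSpider starts a spider command with the global variables appended to
// the inherited environment, so the spider process can read them itself.
func runSpider(name string, args []string, vars []Variable) error {
	cmd := exec.Command(name, args...)
	env := os.Environ()
	for _, v := range vars {
		env = append(env, fmt.Sprintf("%s=%s", v.Key, v.Value))
	}
	cmd.Env = env
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	return cmd.Run()
}

func main() {
	vars := []Variable{{Key: "API_TOKEN", Value: "example-token"}}
	if err := runSpider("scrapy", []string{"crawl", "jd_spider"}, vars); err != nil {
		fmt.Fprintln(os.Stderr, "spider failed:", err)
	}
}
```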

View File

@@ -1,12 +1,22 @@
package entity
type ConfigSpiderData struct {
Version string `yaml:"version" json:"version"`
// Common
Name string `yaml:"name" json:"name"`
DisplayName string `yaml:"display_name" json:"display_name"`
Col string `yaml:"col" json:"col"`
Remark string `yaml:"remark" json:"remark"`
Type string `yaml:"type" json:"type"`
// Configurable spider
Engine string `yaml:"engine" json:"engine"`
StartUrl string `yaml:"start_url" json:"start_url"`
StartStage string `yaml:"start_stage" json:"start_stage"`
Stages []Stage `yaml:"stages" json:"stages"`
Settings map[string]string `yaml:"settings" json:"settings"`
// Customized spider
Cmd string `yaml:"cmd" json:"cmd"`
}
type Stage struct {

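The ConfigSpiderData struct above is the YAML target for the Spiderfile definitions added later in this PR (see the spiders/*/Spiderfile entries). Below is a minimal, self-contained sketch of the read-and-unmarshal step, mirroring what InitSpiderService does further down; the struct here is a trimmed copy kept only so the example compiles on its own, and the file path is illustrative.

```go
package main

import (
	"fmt"
	"io/ioutil"
	"log"

	"gopkg.in/yaml.v2"
)

// ConfigSpiderData is a trimmed copy of the entity struct above,
// included only to keep this sketch self-contained.
type ConfigSpiderData struct {
	Name        string            `yaml:"name"`
	DisplayName string            `yaml:"display_name"`
	Type        string            `yaml:"type"`
	Col         string            `yaml:"col"`
	Cmd         string            `yaml:"cmd"`
	Settings    map[string]string `yaml:"settings"`
}

func main() {
	// Read a Spiderfile (path is illustrative) and deserialize it,
	// the same way the template spiders are imported below.
	raw, err := ioutil.ReadFile("spiders/jd/Spiderfile")
	if err != nil {
		log.Fatal(err)
	}
	var data ConfigSpiderData
	if err := yaml.Unmarshal(raw, &data); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s (%s): %s\n", data.Name, data.Type, data.Cmd)
}
```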
View File

@@ -39,7 +39,6 @@ func main() {
log.SetLevelFromString(logLevel)
}
log.Info("initialized log config successfully")
if viper.GetString("log.isDeletePeriodically") == "Y" {
err := services.InitDeleteLogPeriodically()
if err != nil {
@@ -74,8 +73,24 @@ func main() {
debug.PrintStack()
panic(err)
}
log.Info("initialized schedule successfully")
// Initialize user service
if err := services.InitUserService(); err != nil {
log.Error("init user service error:" + err.Error())
debug.PrintStack()
panic(err)
}
log.Info("initialized user service successfully")
// Initialize dependency fetcher
if err := services.InitDepsFetcher(); err != nil {
log.Error("init dependency fetcher error:" + err.Error())
debug.PrintStack()
panic(err)
}
log.Info("initialized dependency fetcher successfully")
}
log.Info("initialized schedule successfully")
// Initialize task executor
if err := services.InitTaskExecutor(); err != nil {
@@ -100,22 +115,6 @@ func main() {
}
log.Info("initialized spider service successfully")
// Initialize user service
if err := services.InitUserService(); err != nil {
log.Error("init user service error:" + err.Error())
debug.PrintStack()
panic(err)
}
log.Info("initialized user service successfully")
// Initialize dependency fetcher
if err := services.InitDepsFetcher(); err != nil {
log.Error("init dependency fetcher error:" + err.Error())
debug.PrintStack()
panic(err)
}
log.Info("initialized dependency fetcher successfully")
// Initialize RPC service
if err := services.InitRpcService(); err != nil {
log.Error("init rpc service error:" + err.Error())
@@ -224,10 +223,18 @@ func main() {
}
// Global variables
{
authGroup.POST("/variable", routes.PostVariable) // create
authGroup.PUT("/variable/:id", routes.PutVariable) // update
authGroup.DELETE("/variable/:id", routes.DeleteVariable) // delete
authGroup.GET("/variables", routes.GetVariableList) // list
authGroup.PUT("/variable", routes.PutVariable) // create
authGroup.POST("/variable/:id", routes.PostVariable) // update
authGroup.DELETE("/variable/:id", routes.DeleteVariable) // delete
}
// Projects
{
authGroup.GET("/projects", routes.GetProjectList) // list
authGroup.GET("/projects/tags", routes.GetProjectTags) // project tags
authGroup.PUT("/projects", routes.PutProject) // create
authGroup.POST("/projects/:id", routes.PostProject) // update
authGroup.DELETE("/projects/:id", routes.DeleteProject) // delete
}
// Stats
authGroup.GET("/stats/home", routes.GetHomeStats) // home page stats

backend/model/project.go (new file, 146 lines)
View File

@@ -0,0 +1,146 @@
package model
import (
"crawlab/constants"
"crawlab/database"
"github.com/apex/log"
"github.com/globalsign/mgo/bson"
"runtime/debug"
"time"
)
type Project struct {
Id bson.ObjectId `json:"_id" bson:"_id"`
Name string `json:"name" bson:"name"`
Description string `json:"description" bson:"description"`
Tags []string `json:"tags" bson:"tags"`
CreateTs time.Time `json:"create_ts" bson:"create_ts"`
UpdateTs time.Time `json:"update_ts" bson:"update_ts"`
// For frontend display
Spiders []Spider `json:"spiders" bson:"spiders"`
}
func (p *Project) Save() error {
s, c := database.GetCol("projects")
defer s.Close()
p.UpdateTs = time.Now()
if err := c.UpdateId(p.Id, p); err != nil {
debug.PrintStack()
return err
}
return nil
}
func (p *Project) Add() error {
s, c := database.GetCol("projects")
defer s.Close()
p.Id = bson.NewObjectId()
p.UpdateTs = time.Now()
p.CreateTs = time.Now()
if err := c.Insert(p); err != nil {
log.Errorf(err.Error())
debug.PrintStack()
return err
}
return nil
}
func (p *Project) GetSpiders() ([]Spider, error) {
s, c := database.GetCol("spiders")
defer s.Close()
var query interface{}
if p.Id.Hex() == constants.ObjectIdNull {
query = bson.M{
"$or": []bson.M{
{"project_id": p.Id},
{"project_id": bson.M{"$exists": false}},
},
}
} else {
query = bson.M{"project_id": p.Id}
}
var spiders []Spider
if err := c.Find(query).All(&spiders); err != nil {
log.Errorf(err.Error())
debug.PrintStack()
return spiders, err
}
return spiders, nil
}
func GetProject(id bson.ObjectId) (Project, error) {
s, c := database.GetCol("projects")
defer s.Close()
var p Project
if err := c.Find(bson.M{"_id": id}).One(&p); err != nil {
log.Errorf(err.Error())
debug.PrintStack()
return p, err
}
return p, nil
}
func GetProjectList(filter interface{}, skip int, sortKey string) ([]Project, error) {
s, c := database.GetCol("projects")
defer s.Close()
var projects []Project
if err := c.Find(filter).Skip(skip).Limit(constants.Infinite).Sort(sortKey).All(&projects); err != nil {
debug.PrintStack()
return projects, err
}
return projects, nil
}
func GetProjectListTotal(filter interface{}) (int, error) {
s, c := database.GetCol("projects")
defer s.Close()
var result int
result, err := c.Find(filter).Count()
if err != nil {
return result, err
}
return result, nil
}
func UpdateProject(id bson.ObjectId, item Project) error {
s, c := database.GetCol("projects")
defer s.Close()
var result Project
if err := c.FindId(id).One(&result); err != nil {
debug.PrintStack()
return err
}
if err := item.Save(); err != nil {
return err
}
return nil
}
func RemoveProject(id bson.ObjectId) error {
s, c := database.GetCol("projects")
defer s.Close()
var result Project
if err := c.FindId(id).One(&result); err != nil {
return err
}
if err := c.RemoveId(id); err != nil {
return err
}
return nil
}

View File

@@ -32,6 +32,7 @@ type Spider struct {
Envs []Env `json:"envs" bson:"envs"` // environment variables
Remark string `json:"remark" bson:"remark"` // remark
Src string `json:"src" bson:"src"` // source code location
ProjectId bson.ObjectId `json:"project_id" bson:"project_id"` // project ID
// Customized spider
Cmd string `json:"cmd" bson:"cmd"` // execute command
@@ -56,6 +57,11 @@ func (spider *Spider) Save() error {
spider.UpdateTs = time.Now()
// Compatibility for spiders without a project ID
if spider.ProjectId.Hex() == "" {
spider.ProjectId = bson.ObjectIdHex(constants.ObjectIdNull)
}
if err := c.UpdateId(spider.Id, spider); err != nil {
debug.PrintStack()
return err
@@ -162,7 +168,7 @@ func GetSpiderByName(name string) Spider {
defer s.Close()
var result Spider
if err := c.Find(bson.M{"name": name}).One(&result); err != nil {
if err := c.Find(bson.M{"name": name}).One(&result); err != nil && err != mgo.ErrNotFound {
log.Errorf("get spider error: %s, spider_name: %s", err.Error(), name)
//debug.PrintStack()
return result

backend/routes/projects.go (new file, 190 lines)
View File

@@ -0,0 +1,190 @@
package routes
import (
"crawlab/constants"
"crawlab/database"
"crawlab/model"
"github.com/gin-gonic/gin"
"github.com/globalsign/mgo/bson"
"net/http"
)
func GetProjectList(c *gin.Context) {
tag := c.Query("tag")
// Filter conditions
query := bson.M{}
if tag != "" {
query["tags"] = tag
}
// Get project list
projects, err := model.GetProjectList(query, 0, "+_id")
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// Get total count
total, err := model.GetProjectListTotal(query)
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// Get the spider list for each project
for i, p := range projects {
spiders, err := p.GetSpiders()
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
projects[i].Spiders = spiders
}
// Get spiders not assigned to any project
if tag == "" {
noProject := model.Project{
Id: bson.ObjectIdHex(constants.ObjectIdNull),
Name: "No Project",
Description: "Not assigned to any project",
}
spiders, err := noProject.GetSpiders()
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
noProject.Spiders = spiders
projects = append(projects, noProject)
}
c.JSON(http.StatusOK, ListResponse{
Status: "ok",
Message: "success",
Data: projects,
Total: total,
})
}
func PutProject(c *gin.Context) {
// Bind request data
var p model.Project
if err := c.ShouldBindJSON(&p); err != nil {
HandleError(http.StatusBadRequest, c, err)
return
}
if err := p.Add(); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
})
}
func PostProject(c *gin.Context) {
id := c.Param("id")
if !bson.IsObjectIdHex(id) {
HandleErrorF(http.StatusBadRequest, c, "invalid id")
return
}
var item model.Project
if err := c.ShouldBindJSON(&item); err != nil {
HandleError(http.StatusBadRequest, c, err)
return
}
if err := model.UpdateProject(bson.ObjectIdHex(id), item); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
})
}
func DeleteProject(c *gin.Context) {
id := c.Param("id")
if !bson.IsObjectIdHex(id) {
HandleErrorF(http.StatusBadRequest, c, "invalid id")
return
}
// Remove the project from the database
if err := model.RemoveProject(bson.ObjectIdHex(id)); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// Get related spiders
var spiders []model.Spider
s, col := database.GetCol("spiders")
defer s.Close()
if err := col.Find(bson.M{"project_id": bson.ObjectIdHex(id)}).All(&spiders); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// Reset the spiders' project ID to null
for _, spider := range spiders {
spider.ProjectId = bson.ObjectIdHex(constants.ObjectIdNull)
if err := spider.Save(); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
})
}
func GetProjectTags(c *gin.Context) {
type Result struct {
Tag string `json:"tag" bson:"tag"`
}
s, col := database.GetCol("projects")
defer s.Close()
pipeline := []bson.M{
{
"$unwind": "$tags",
},
{
"$group": bson.M{
"_id": "$tags",
},
},
{
"$sort": bson.M{
"_id": 1,
},
},
{
"$addFields": bson.M{
"tag": "$_id",
},
},
}
var items []Result
if err := col.Pipe(pipeline).All(&items); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
Data: items,
})
}

View File

@@ -30,6 +30,7 @@ func GetSpiderList(c *gin.Context) {
pageNum, _ := c.GetQuery("page_num")
pageSize, _ := c.GetQuery("page_size")
keyword, _ := c.GetQuery("keyword")
pid, _ := c.GetQuery("project_id")
t, _ := c.GetQuery("type")
sortKey, _ := c.GetQuery("sort_key")
sortDirection, _ := c.GetQuery("sort_direction")
@@ -41,6 +42,16 @@ func GetSpiderList(c *gin.Context) {
if t != "" && t != "all" {
filter["type"] = t
}
if pid == "" {
// do nothing
} else if pid == constants.ObjectIdNull {
filter["$or"] = []bson.M{
{"project_id": bson.ObjectIdHex(pid)},
{"project_id": bson.M{"$exists": false}},
}
} else {
filter["project_id"] = bson.ObjectIdHex(pid)
}
// Sorting
sortStr := "-_id"

View File

@@ -8,7 +8,7 @@ import (
)
// Create
func PostVariable(c *gin.Context) {
func PutVariable(c *gin.Context) {
var variable model.Variable
if err := c.ShouldBindJSON(&variable); err != nil {
HandleError(http.StatusBadRequest, c, err)
@@ -22,7 +22,7 @@ func PostVariable(c *gin.Context) {
}
// Update
func PutVariable(c *gin.Context) {
func PostVariable(c *gin.Context) {
var id = c.Param("id")
var variable model.Variable
if err := c.ShouldBindJSON(&variable); err != nil {

View File

@@ -6,6 +6,7 @@ import (
"crawlab/entity"
"crawlab/model"
"crawlab/model/config_spider"
"crawlab/services/spider_handler"
"crawlab/utils"
"errors"
"fmt"
@@ -227,6 +228,17 @@ func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.Con
spider.FileId = fid
_ = spider.Save()
// Get the spider sync instance
spiderSync := spider_handler.SpiderSync{
Spider: spider,
}
// Get the GridFS file
gfFile2 := model.GetGridFs(spider.FileId)
// Generate the MD5 file
spiderSync.CreateMd5File(gfFile2.Md5)
return nil
}

View File

@@ -14,7 +14,10 @@ import (
"github.com/globalsign/mgo/bson"
"github.com/satori/go.uuid"
"github.com/spf13/viper"
"gopkg.in/yaml.v2"
"io/ioutil"
"os"
"path"
"path/filepath"
"runtime/debug"
)
@@ -264,5 +267,108 @@ func InitSpiderService() error {
// Start the cron scheduler
c.Start()
if model.IsMaster() {
// Add demo spiders
templateSpidersDir := "../spiders"
for _, info := range utils.ListDir(templateSpidersDir) {
if !info.IsDir() {
continue
}
spiderName := info.Name()
// Add the spider if it does not exist in the database
spider := model.GetSpiderByName(spiderName)
if spider.Name != "" {
// A spider with the same name already exists, skip
continue
}
// Copy the spider
templateSpiderPath := path.Join(templateSpidersDir, spiderName)
spiderPath := path.Join(viper.GetString("spider.path"), spiderName)
if utils.Exists(spiderPath) {
utils.RemoveFiles(spiderPath)
}
if err := utils.CopyDir(templateSpiderPath, spiderPath); err != nil {
log.Errorf("copy error: " + err.Error())
debug.PrintStack()
continue
}
// Construct config data
configData := entity.ConfigSpiderData{}
// Read the YAML file
yamlFile, err := ioutil.ReadFile(path.Join(spiderPath, "Spiderfile"))
if err != nil {
log.Errorf("read yaml error: " + err.Error())
//debug.PrintStack()
continue
}
// Deserialize
if err := yaml.Unmarshal(yamlFile, &configData); err != nil {
log.Errorf("unmarshal error: " + err.Error())
debug.PrintStack()
continue
}
if configData.Type == constants.Customized {
// Add the spider to the database
spider = model.Spider{
Id: bson.NewObjectId(),
Name: configData.Name,
DisplayName: configData.DisplayName,
Type: constants.Customized,
Col: configData.Col,
Src: spiderPath,
Remark: configData.Remark,
ProjectId: bson.ObjectIdHex(constants.ObjectIdNull),
FileId: bson.ObjectIdHex(constants.ObjectIdNull),
Cmd: configData.Cmd,
}
if err := spider.Add(); err != nil {
log.Errorf("add spider error: " + err.Error())
debug.PrintStack()
continue
}
// Upload the spider to GridFS
if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
log.Errorf("upload spider error: " + err.Error())
debug.PrintStack()
continue
}
} else if configData.Type == constants.Configurable || configData.Type == "config" {
// Add the spider to the database
spider = model.Spider{
Id: bson.NewObjectId(),
Name: configData.Name,
DisplayName: configData.DisplayName,
Type: constants.Configurable,
Col: configData.Col,
Src: spiderPath,
Remark: configData.Remark,
ProjectId: bson.ObjectIdHex(constants.ObjectIdNull),
FileId: bson.ObjectIdHex(constants.ObjectIdNull),
Config: configData,
}
if err := spider.Add(); err != nil {
log.Errorf("add spider error: " + err.Error())
debug.PrintStack()
continue
}
// Process spider files from the deserialized config data
if err := ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
log.Errorf("add spider error: " + err.Error())
debug.PrintStack()
continue
}
}
}
}
return nil
}

View File

@@ -18,6 +18,20 @@
<el-form-item :label="$t('Spider Name')">
<el-input v-model="spiderForm.display_name" :placeholder="$t('Spider Name')" :disabled="isView"></el-input>
</el-form-item>
<el-form-item :label="$t('Project')" prop="project_id" required>
<el-select
v-model="spiderForm.project_id"
:placeholder="$t('Project')"
filterable
>
<el-option
v-for="p in projectList"
:key="p._id"
:value="p._id"
:label="p.name"
/>
</el-select>
</el-form-item>
<el-form-item :label="$t('Source Folder')">
<el-input v-model="spiderForm.src" :placeholder="$t('Source Folder')" disabled></el-input>
</el-form-item>
@@ -127,6 +141,9 @@ export default {
...mapGetters('user', [
'token'
]),
...mapState('project', [
'projectList'
]),
isShowRun () {
if (this.spiderForm.type === 'customized') {
return !!this.spiderForm.cmd
@@ -180,6 +197,15 @@ export default {
onUploadError () {
this.uploadLoading = false
}
},
async created () {
// fetch project list
await this.$store.dispatch('project/getProjectList')
// Compatibility for project ID
if (!this.spiderForm.project_id) {
this.$set(this.spiderForm, 'project_id', '000000000000000000000000')
}
}
}
</script>

View File

@@ -12,6 +12,7 @@ export default {
'Deploys': '部署',
'Sites': '网站',
'Setting': '设置',
'Project': '项目',
// Tabs
'Overview': '概览',
@@ -71,6 +72,7 @@ export default {
'Create Directory': '新建目录',
'Create File': '新建文件',
'Add Node': '添加节点',
'Add Project': '添加项目',
// Home
'Total Tasks': '总任务数',
@@ -217,6 +219,14 @@ export default {
// Deploys
'Time': '时间',
// Project
'All Tags': '全部标签',
'Project Name': '项目名称',
'Project Description': '项目描述',
'Tags': '标签',
'Enter Tags': '输入标签',
'No Project': '无项目',
// Schedules
'Schedule Name': '定时任务名称',
'Schedule Description': '定时任务描述',
@@ -245,6 +255,9 @@ export default {
'Home Page Response Time (sec)': '首页响应时间(秒)',
'Home Page Response Status Code': '首页响应状态码',
// User
'Super Admin': '超级管理员',
// File
'Choose Folder': '选择文件',
'File': '文件',
@@ -350,7 +363,7 @@ export default {
'Username': '用户名',
'Password': '密码',
'Confirm Password': '确认密码',
'normal': '正常用户',
'normal': '普通用户',
'admin': '管理用户',
'Role': '角色',
'Edit User': '更改用户',

View File

@@ -47,6 +47,25 @@ export const constantRouterMap = [
}
]
},
{
path: '/projects',
component: Layout,
meta: {
title: 'Project',
icon: 'fa fa-gear'
},
children: [
{
path: '',
name: 'Project',
component: () => import('../views/project/ProjectList'),
meta: {
title: 'Project',
icon: 'fa fa-code-fork'
}
}
]
},
{
path: '/spiders',
component: Layout,

View File

@@ -16,6 +16,7 @@ import stats from './modules/stats'
import setting from './modules/setting'
import version from './modules/version'
import tour from './modules/tour'
import project from './modules/project'
import getters from './getters'
Vue.use(Vuex)
@@ -37,6 +38,7 @@ const store = new Vuex.Store({
setting,
version,
tour,
project,
// Stats
stats
},

View File

@@ -0,0 +1,60 @@
import request from '../../api/request'
const state = {
projectForm: {},
projectList: [],
projectTags: []
}
const getters = {}
const mutations = {
SET_PROJECT_FORM: (state, value) => {
state.projectForm = value
},
SET_PROJECT_LIST: (state, value) => {
state.projectList = value
},
SET_PROJECT_TAGS: (state, value) => {
state.projectTags = value
}
}
const actions = {
getProjectList ({ state, commit }, payload) {
return request.get('/projects', payload)
.then(response => {
if (response.data.data) {
commit('SET_PROJECT_LIST', response.data.data.map(d => {
if (!d.spiders) d.spiders = []
return d
}))
}
})
},
getProjectTags ({ state, commit }) {
return request.get('/projects/tags')
.then(response => {
if (response.data.data) {
commit('SET_PROJECT_TAGS', response.data.data.map(d => d.tag))
}
})
},
addProject ({ state }) {
return request.put('/projects', state.projectForm)
},
editProject ({ state }, id) {
return request.post(`/projects/${id}`, state.projectForm)
},
removeProject ({ state }, id) {
return request.delete(`/projects/${id}`)
}
}
export default {
namespaced: true,
state,
getters,
mutations,
actions
}

View File

@@ -156,7 +156,7 @@ const user = {
},
// 新增全局变量
addGlobalVariable ({ commit, state }) {
return request.post(`/variable`, state.globalVariableForm)
return request.put(`/variable`, state.globalVariableForm)
.then(() => {
state.globalVariableForm = {}
})

View File

@@ -101,3 +101,10 @@ export default {
}
}
</script>
<style scoped>
.menu-wrapper >>> .fa {
width: 16px;
text-align: center;
}
</style>

View File

@@ -0,0 +1,330 @@
<template>
<div class="app-container">
<!--add popup-->
<el-dialog
:visible.sync="dialogVisible"
width="640px"
:before-close="onDialogClose">
<el-form label-width="180px"
class="add-form"
:model="projectForm"
:inline-message="true"
ref="projectForm"
label-position="right">
<el-form-item :label="$t('Project Name')" prop="name" required>
<el-input id="name" v-model="projectForm.name" :placeholder="$t('Project Name')"></el-input>
</el-form-item>
<el-form-item :label="$t('Project Description')" prop="description">
<el-input
id="description"
type="textarea"
v-model="projectForm.description"
:placeholder="$t('Project Description')"
/>
</el-form-item>
<el-form-item :label="$t('Tags')" prop="tags">
<el-select
id="tags"
v-model="projectForm.tags"
:placeholder="$t('Enter Tags')"
allow-create
filterable
multiple
>
</el-select>
</el-form-item>
</el-form>
<!--Cancel / Save-->
<span slot="footer" class="dialog-footer">
<el-button size="small" @click="onDialogClose">{{$t('Cancel')}}</el-button>
<el-button id="btn-submit" size="small" type="primary" @click="onAddSubmit">{{$t('Submit')}}</el-button>
</span>
</el-dialog>
<!--./add popup-->
<div class="action-wrapper">
<div class="left">
<el-select
v-model="filter.tag"
size="small"
:placeholder="$t('Select Tag')"
@change="onFilterChange"
>
<el-option value="" :label="$t('All Tags')"/>
<el-option
v-for="tag in projectTags"
:key="tag"
:label="tag"
:value="tag"
/>
</el-select>
</div>
<div class="right">
<el-button
icon="el-icon-plus"
type="primary"
size="small"
@click="onAdd"
>
{{$t('Add Project')}}
</el-button>
</div>
</div>
<div class="content">
<div v-if="projectList.length === 0" class="empty-list">
{{ $t('You have no projects created. You can create a project by clicking the "Add" button.')}}
</div>
<ul v-else class="list">
<li
class="item"
v-for="(item, index) in projectList"
:key="item._id"
@click="onView(item)"
>
<el-card
class="item-card"
>
<i v-if="!isNoProject(item)" class="btn-edit fa fa-edit" @click="onEdit(item)"></i>
<i v-if="!isNoProject(item)" class="btn-close fa fa-trash-o" @click="onRemove(item)"></i>
<el-row>
<h4 v-if="index !== projectList.length - 1" class="title">{{ item.name }}</h4>
<h4 v-else class="title">{{ $t('No Project') }}</h4>
</el-row>
<el-row>
<div class="spider-count">
{{$t('Spider Count')}}: {{ item.spiders.length }}
</div>
</el-row>
<el-row class="description-wrapper">
<div class="description">
{{ item.description }}
</div>
</el-row>
<el-row class="tags-wrapper">
<div class="tags">
<el-tag
v-for="(tag, index) in item.tags"
:key="index"
size="mini"
class="tag"
>
{{ tag }}
</el-tag>
</div>
</el-row>
</el-card>
</li>
</ul>
</div>
</div>
</template>
<script>
import {
mapState
} from 'vuex'
export default {
name: 'ProjectList',
data () {
return {
defaultTags: [],
dialogVisible: false,
isClickAction: false,
filter: {
tag: ''
}
}
},
computed: {
...mapState('project', [
'projectForm',
'projectList',
'projectTags'
])
},
methods: {
onDialogClose () {
this.dialogVisible = false
},
onFilterChange () {
this.$store.dispatch('project/getProjectList', this.filter)
this.$st.sendEv('项目', '筛选项目')
},
onAdd () {
this.isEdit = false
this.dialogVisible = true
this.$store.commit('project/SET_PROJECT_FORM', { tags: [] })
this.$st.sendEv('项目', '添加项目')
},
onAddSubmit () {
this.$refs.projectForm.validate(res => {
if (res) {
const form = JSON.parse(JSON.stringify(this.projectForm))
if (this.isEdit) {
this.$request.post(`/projects/${this.projectForm._id}`, form).then(response => {
if (response.data.error) {
this.$message.error(response.data.error)
return
}
this.dialogVisible = false
this.$store.dispatch('project/getProjectList')
this.$message.success(this.$t('The project has been saved'))
})
} else {
this.$request.put('/projects', form).then(response => {
if (response.data.error) {
this.$message.error(response.data.error)
return
}
this.dialogVisible = false
this.$store.dispatch('project/getProjectList')
this.$message.success(this.$t('The project has been added'))
})
}
}
})
this.$st.sendEv('项目', '提交项目')
},
onEdit (row) {
this.isClickAction = true
setTimeout(() => {
this.isClickAction = false
}, 100)
this.$store.commit('project/SET_PROJECT_FORM', row)
this.dialogVisible = true
this.isEdit = true
this.$st.sendEv('项目', '修改项目')
},
onRemove (row) {
this.isClickAction = true
setTimeout(() => {
this.isClickAction = false
}, 100)
this.$confirm(this.$t('Are you sure to delete the project?'), this.$t('Notification'), {
confirmButtonText: this.$t('Confirm'),
cancelButtonText: this.$t('Cancel'),
type: 'warning'
}).then(() => {
this.$store.dispatch('project/removeProject', row._id)
.then(() => {
setTimeout(() => {
this.$store.dispatch('project/getProjectList')
this.$message.success(this.$t('The project has been removed'))
}, 100)
})
}).catch(() => {
})
this.$st.sendEv('项目', '删除项目')
},
onView (row) {
if (this.isClickAction) return
this.$router.push({
name: 'SpiderList',
params: {
project_id: row._id
}
})
},
isNoProject (row) {
return row._id === '000000000000000000000000'
}
},
async created () {
await this.$store.dispatch('project/getProjectList', this.filter)
await this.$store.dispatch('project/getProjectTags')
}
}
</script>
<style scoped>
.action-wrapper {
display: flex;
justify-content: space-between;
padding-bottom: 10px;
border-bottom: 1px solid #EBEEF5;
}
.list {
margin: 0;
padding: 0;
list-style: none;
display: flex;
flex-wrap: wrap;
}
.list .item {
width: 320px;
margin: 10px;
}
.list .item .item-card {
position: relative;
cursor: pointer;
}
.list .item .item-card .title {
margin: 10px 0 0 0;
}
.list .item .item-card .spider-count {
font-size: 12px;
color: grey;
font-weight: bolder;
}
.list .item .item-card .description-wrapper {
padding-bottom: 5px;
margin-bottom: 0;
border-bottom: 1px solid #EBEEF5;
}
.list .item .item-card .description {
font-size: 12px;
color: grey;
}
.list .item .item-card .tags {
margin-bottom: -5px;
}
.list .item .item-card .tags .tag {
margin: 0 5px 5px 0;
}
.list .item .item-card .el-row {
margin-bottom: 5px;
}
.list .item .item-card .el-row:last-child {
margin-bottom: 0;
}
.list .item .item-card .btn-edit {
z-index: 1;
color: grey;
position: absolute;
top: 11px;
right: 40px;
}
.list .item .item-card .btn-close {
z-index: 1;
color: grey;
position: absolute;
top: 10px;
right: 10px;
}
.empty-list {
font-size: 24px;
display: flex;
align-items: center;
justify-content: center;
height: calc(100vh - 240px);
}
</style>

View File

@@ -58,6 +58,20 @@
<el-form-item :label="$t('Display Name')" prop="display_name" required>
<el-input id="display-name" v-model="spiderForm.display_name" :placeholder="$t('Display Name')"/>
</el-form-item>
<el-form-item :label="$t('Project')" prop="project_id" required>
<el-select
v-model="spiderForm.project_id"
:placeholder="$t('Project')"
filterable
>
<el-option
v-for="p in projectList"
:key="p._id"
:value="p._id"
:label="p.name"
/>
</el-select>
</el-form-item>
<el-form-item :label="$t('Execute Command')" prop="cmd" required>
<el-input id="cmd" v-model="spiderForm.cmd" :placeholder="$t('Execute Command')"/>
</el-form-item>
@@ -104,6 +118,20 @@
<el-form-item :label="$t('Display Name')" prop="display_name" required>
<el-input v-model="spiderForm.display_name" :placeholder="$t('Display Name')"/>
</el-form-item>
<el-form-item :label="$t('Project')" prop="project_id" required>
<el-select
v-model="spiderForm.project_id"
:placeholder="$t('Project')"
filterable
>
<el-option
v-for="p in projectList"
:key="p._id"
:value="p._id"
:label="p.name"
/>
</el-select>
</el-form-item>
<el-form-item :label="$t('Template')" prop="template" required>
<el-select id="template" v-model="spiderForm.template" :value="spiderForm.template"
:placeholder="$t('Template')">
@@ -147,7 +175,29 @@
<!-- </el-select>-->
<!-- </el-form-item>-->
<el-form-item>
<el-input clearable @keyup.enter.native="onSearch" size="small" placeholder="名称" v-model="filter.keyword">
<el-select
v-model="filter.project_id"
size="small"
:placeholder="$t('Project')"
@change="getList"
>
<el-option value="" :label="$t('All Projects')"/>
<el-option
v-for="p in projectList"
:key="p._id"
:value="p._id"
:label="p.name"
/>
</el-select>
</el-form-item>
<el-form-item>
<el-input
v-model="filter.keyword"
size="small"
:placeholder="$t('Spider Name')"
clearable
@keyup.enter.native="onSearch"
>
<i slot="suffix" class="el-input__icon el-icon-search"></i>
</el-input>
</el-form-item>
@@ -335,6 +385,7 @@ export default {
crawlConfirmDialogVisible: false,
activeSpiderId: undefined,
filter: {
project_id: '',
keyword: '',
type: 'all'
},
@@ -491,6 +542,9 @@ export default {
...mapGetters('user', [
'token'
]),
...mapState('project', [
'projectList'
]),
uploadForm () {
return {
name: this.spiderForm.name,
@@ -517,7 +571,12 @@ export default {
this.getList()
},
onAdd () {
let projectId = '000000000000000000000000'
if (this.filter.project_id) {
projectId = this.filter.project_id
}
this.$store.commit('spider/SET_SPIDER_FORM', {
project_id: projectId,
template: this.templateList[0]
})
this.addDialogVisible = true
@@ -737,14 +796,20 @@ export default {
sort_key: this.sort.sortKey,
sort_direction: this.sort.sortDirection,
keyword: this.filter.keyword,
type: this.filter.type
type: this.filter.type,
project_id: this.filter.project_id
}
await this.$store.dispatch('spider/getSpiderList', params)
}
},
async created () {
// fetch spider types
// await this.getTypes()
// fetch project list
await this.$store.dispatch('project/getProjectList')
// project id
if (this.$route.params.project_id) {
this.filter.project_id = this.$route.params.project_id
}
// fetch spider list
await this.getList()

View File

@@ -137,6 +137,7 @@ export default {
},
computed: {
...mapState('task', [
'taskForm',
'taskResultsData',
'taskResultsTotalCount'
]),
@@ -164,6 +165,9 @@ export default {
set (value) {
this.$store.commit('task/SET_RESULTS_PAGE_SIZE', value)
}
},
isRunning () {
return ['pending', 'running'].includes(this.taskForm.status)
}
},
methods: {
@@ -197,6 +201,9 @@ export default {
this.getTaskLog()
this.handle = setInterval(() => {
if (!this.isRunning) return
this.$store.dispatch('task/getTaskData', this.$route.params.id)
this.$store.dispatch('task/getTaskResults', this.$route.params.id)
this.getTaskLog()
}, 5000)
},

View File

@@ -1,7 +1,7 @@
<template>
<div class="app-container">
<!--dialog-->
<el-dialog :visible.sync="dialogVisible" :title="$t('Edit User')">
<el-dialog :visible.sync="dialogVisible" width="640px" :title="$t('Edit User')">
<el-form ref="form" :model="userForm" label-width="80px" :rules="rules" inline-message>
<el-form-item prop="username" :label="$t('Username')" required>
<el-input v-model="userForm.username" :placeholder="$t('Username')" :disabled="!isAdd"></el-input>
@@ -50,7 +50,10 @@
:label="$t('Role')"
>
<template slot-scope="scope">
<el-tag v-if="scope.row.role === 'admin'" type="primary">
<el-tag v-if="scope.row.username === 'admin'" type="success">
{{ $t('Super Admin') }}
</el-tag>
<el-tag v-else-if="scope.row.role === 'admin'" type="primary">
{{ $t(scope.row.role) }}
</el-tag>
<el-tag v-else type="warning">
@@ -71,8 +74,20 @@
fixed="right"
>
<template slot-scope="scope">
<el-button icon="el-icon-edit" type="warning" size="mini" @click="onEdit(scope.row)"></el-button>
<el-button icon="el-icon-delete" type="danger" size="mini" @click="onRemove(scope.row)"></el-button>
<el-button
v-if="isShowEdit(scope.row)"
icon="el-icon-edit"
type="warning"
size="mini"
@click="onEdit(scope.row)"
/>
<el-button
v-if="isShowRemove(scope.row)"
icon="el-icon-delete"
type="danger"
size="mini"
@click="onRemove(scope.row)"
/>
</template>
</el-table-column>
</el-table>
@@ -95,7 +110,8 @@
<script>
import {
mapState
mapState,
mapGetters
} from 'vuex'
import dayjs from 'dayjs'
@@ -133,6 +149,9 @@ export default {
'userForm',
'totalCount'
]),
...mapGetters('user', [
'userInfo'
]),
pageSize: {
get () {
return this.$store.state.user.pageSize
@@ -219,6 +238,15 @@ export default {
this.dialogVisible = true
},
onValidateEmail (value) {
},
isShowEdit (row) {
if (row.username === 'admin') {
return this.userInfo.username === 'admin'
}
return true
},
isShowRemove (row) {
return row.username !== 'admin'
}
},
created () {
@@ -227,23 +255,21 @@ export default {
}
</script>
<style scoped>
<style lang="scss" scoped>
.filter {
display: flex;
justify-content: space-between;
margin-bottom: 8px;
.filter-search {
width: 240px;
}
.filter-search {
width: 240px;
}
.right {
.btn {
margin-left: 10px;
}
}
.right {
.btn {
margin-left: 10px;
}
}
}
.el-table {

View File

@@ -0,0 +1,51 @@
name: "amazon_config"
display_name: "亚马逊中国(可配置)"
remark: "亚马逊中国搜索手机,列表+分页"
type: "configurable"
col: "results_amazon_config"
engine: scrapy
start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2
start_stage: list
stages:
- name: list
is_list: true
list_css: .s-result-item
list_xpath: ""
page_css: .a-last > a
page_xpath: ""
page_attr: href
fields:
- name: title
css: span.a-text-normal
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: .a-link-normal
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: price
css: ""
xpath: .//*[@class="a-price-whole"]
attr: ""
next_stage: ""
remark: ""
- name: price_fraction
css: ""
xpath: .//*[@class="a-price-fraction"]
attr: ""
next_stage: ""
remark: ""
- name: img
css: .s-image-square-aspect > img
xpath: ""
attr: src
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -0,0 +1,57 @@
name: "autohome_config"
display_name: "汽车之家(可配置)"
remark: "汽车之家文章,列表+详情+分页"
type: "configurable"
col: "results_autohome_config"
engine: scrapy
start_url: https://www.autohome.com.cn/all/
start_stage: list
stages:
- name: list
is_list: true
list_css: ul.article > li
list_xpath: ""
page_css: a.page-item-next
page_xpath: ""
page_attr: href
fields:
- name: title
css: li > a > h3
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: li > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: abstract
css: li > a > p
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: time
css: li > a .fn-left
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: views
css: li > a .fn-right > em:first-child
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: comments
css: li > a .fn-right > em:last-child
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -0,0 +1,39 @@
name: "baidu_config"
display_name: "百度搜索(可配置)"
remark: "百度搜索Crawlab列表+分页"
type: "configurable"
col: "results_baidu_config"
engine: scrapy
start_url: http://www.baidu.com/s?wd=crawlab
start_stage: list
stages:
- name: list
is_list: true
list_css: ""
list_xpath: //body
page_css: ""
page_xpath: //body
page_attr: href
fields:
- name: title
css: ""
xpath: .//h3/a
attr: href
next_stage: ""
remark: ""
- name: url
css: ""
xpath: .//h3/a
attr: href
next_stage: ""
remark: ""
- name: abstract
css: ""
xpath: .//*[@class="c-abstract"]
attr: href
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -0,0 +1,5 @@
name: "chinaz"
display_name: "站长之家 (Scrapy)"
col: "results_chinaz"
type: "customized"
cmd: "scrapy crawl chinaz_spider"

View File

@@ -5,24 +5,3 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'
class MongoPipeline(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
db = mongo[MONGO_DB]
col_name = os.environ.get('CRAWLAB_COLLECTION') or 'sites'
col = db[col_name]
def process_item(self, item, spider):
item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
item['_id'] = item['domain']
if self.col.find_one({'_id': item['_id']}) is None:
self.col.save(item)
return item

View File

@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'chinaz.pipelines.MongoPipeline': 300,
'crawlab.pipelines.CrawlabMongoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -0,0 +1,60 @@
name: "csdn_config"
display_name: "CSDN可配置"
remark: "CSDN Crawlab 文章,列表+详情+分页"
type: "configurable"
col: "results_csdn_config"
engine: scrapy
start_url: https://so.csdn.net/so/search/s.do?q=crawlab
start_stage: list
stages:
- name: list
is_list: true
list_css: .search-list-con > .search-list
list_xpath: ""
page_css: a.btn-next
page_xpath: ""
page_attr: href
fields:
- name: url
css: ""
xpath: .//*[@class="limit_width"]/a
attr: href
next_stage: detail
remark: ""
- name: detail
is_list: false
list_css: ""
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: content
css: ""
xpath: .//div[@id="content_views"]
attr: ""
next_stage: ""
remark: ""
- name: views
css: .read-count
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: title
css: .title-article
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: author
css: .follow-nickName
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
AUTOTHROTTLE_ENABLED: "false"
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/79.0.3945.117 Safari/537.36

View File

@@ -0,0 +1,57 @@
name: "douban_config"
display_name: "豆瓣读书(可配置)"
remark: "豆瓣读书新书推荐,列表"
type: "configurable"
col: "results_douban_config"
engine: scrapy
start_url: https://book.douban.com/latest
start_stage: list
stages:
- name: list
is_list: true
list_css: ul.cover-col-4 > li
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: title
css: h2 > a
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: h2 > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: img
css: a.cover img
xpath: ""
attr: src
next_stage: ""
remark: ""
- name: rating
css: p.rating > .color-lightgray
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: abstract
css: p:last-child
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: info
css: .color-gray
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

spiders/jd/Spiderfile (new file, 5 lines)
View File

@@ -0,0 +1,5 @@
name: "jd"
display_name: "京东 (Scrapy)"
col: "results_jd"
type: "customized"
cmd: "scrapy crawl jd_spider"

View File

@@ -12,3 +12,4 @@ class JdItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field()
price = scrapy.Field()
url = scrapy.Field()

View File

@@ -4,14 +4,3 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
class JdPipeline(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
db = mongo[MONGO_DB]
col_name = os.environ.get('CRAWLAB_COLLECTION') or 'jd_products'
col = db[col_name]
def process_item(self, item, spider):
return item

View File

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'jd.spiders'
#USER_AGENT = 'jd (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'jd.pipelines.JdPipeline': 300,
'crawlab.pipelines.CrawlabMongoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -1,11 +1,21 @@
# -*- coding: utf-8 -*-
import scrapy
from jd.items import JdItem
class JdSpiderSpider(scrapy.Spider):
name = 'jd_spider'
allowed_domains = ['jd.com']
start_urls = ['http://jd.com/']
def start_requests(self):
for i in range(1, 50):
yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}')
def parse(self, response):
pass
for el in response.css('.gl-item'):
yield JdItem(
url=el.css('.p-name > a::attr("href")').extract_first(),
name=el.css('.p-name > a::attr("title")').extract_first(),
price=float(el.css('.p-price i::text').extract_first()),
)

View File

@@ -0,0 +1,4 @@
name: "realestate"
display_name: "链家网 (Scrapy)"
col: "results_realestate"
cmd: "scrapy crawl lianjia"

View File

@@ -4,22 +4,3 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'
class MongoPipeline(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
db = mongo[MONGO_DB]
col_name = os.environ.get('CRAWLAB_COLLECTION')
col = db[col_name]
def process_item(self, item, spider):
item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
self.col.save(item)
return item

View File

@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'realestate.pipelines.MongoPipeline': 300,
'crawlab.pipelines.CrawlabMongoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -0,0 +1,5 @@
name: "sinastock"
display_name: "新浪股票 (Scrapy)"
type: "customized"
col: "results_sinastock"
cmd: "scrapy crawl sinastock_spider"

View File

@@ -4,25 +4,3 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
class SinastockPipeline(object):
mongo = MongoClient(
host=os.environ.get('MONGO_HOST') or 'localhost',
port=int(os.environ.get('MONGO_PORT') or 27017)
)
db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')
# create indexes
col.create_index('stocks')
col.create_index('url')
def process_item(self, item, spider):
item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
if self.col.find_one({'url': item['url']}) is None:
self.col.save(item)
return item

View File

@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'sinastock.pipelines.SinastockPipeline': 300,
'crawlab.pipelines.CrawlabMongoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -0,0 +1,54 @@
name: "v2ex_config"
display_name: "V2ex可配置"
remark: "V2ex列表+详情"
type: "configurable"
col: "results_v2ex_config"
engine: scrapy
start_url: https://v2ex.com/
start_stage: list
stages:
- name: list
is_list: true
list_css: .cell.item
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: href
fields:
- name: title
css: a.topic-link
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: a.topic-link
xpath: ""
attr: href
next_stage: detail
remark: ""
- name: replies
css: .count_livid
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: detail
is_list: false
list_css: ""
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: content
css: ""
xpath: .//*[@class="markdown_body"]
attr: ""
next_stage: ""
remark: ""
settings:
AUTOTHROTTLE_ENABLED: "true"
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/79.0.3945.117 Safari/537.36

View File

@@ -0,0 +1,5 @@
name: "xueqiu"
display_name: "雪球网 (Scrapy)"
type: "customized"
col: "results_xueqiu"
cmd: "scrapy crawl xueqiu_spider"

View File

@@ -4,26 +4,3 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
class XueqiuPipeline(object):
mongo = MongoClient(
host=os.environ.get('MONGO_HOST') or 'localhost',
port=int(os.environ.get('MONGO_PORT') or 27017)
)
db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')
# create indexes
col.create_index('stocks')
col.create_index('id')
col.create_index('url')
def process_item(self, item, spider):
item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
if self.col.find_one({'id': item['id']}) is None:
self.col.save(item)
return item

View File

@@ -18,7 +18,7 @@ NEWSPIDER_MODULE = 'xueqiu.spiders'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'xueqiu.pipelines.XueqiuPipeline': 300,
'crawlab.pipelines.CrawlabMongoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -0,0 +1,39 @@
name: "xueqiu_config"
display_name: "雪球网(可配置)"
remark: "雪球网新闻,列表"
type: "configurable"
col: "results_xueqiu_config"
engine: scrapy
start_url: https://xueqiu.com/
start_stage: list
stages:
- name: list
is_list: true
list_css: ""
list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")]
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: title
css: h3 > a
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: h3 > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: abstract
css: p
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -0,0 +1,45 @@
name: "zongheng_config"
display_name: "纵横(可配置)"
remark: "纵横小说网,列表"
type: "configurable"
col: "results_zongheng_config"
engine: scrapy
start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1
start_stage: list
stages:
- name: list
is_list: true
list_css: .rank_d_list
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: href
fields:
- name: title
css: .rank_d_b_name > a
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: .rank_d_b_name > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: abstract
css: body
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: votes
css: .rank_d_b_ticket
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36