Merge pull request #25 from tikazyq/develop

Develop
Marvin Zhang
2019-05-10 21:27:10 +08:00
committed by GitHub
31 changed files with 963 additions and 34 deletions

View File

@@ -9,6 +9,7 @@ from flask import Flask
from flask_cors import CORS
from flask_restful import Api
# from flask_restplus import Api
from routes.sites import SiteApi
from utils.log import other
from constants.node import NodeStatus
from db.manager import db_manager
@@ -68,6 +69,9 @@ api.add_resource(StatsApi,
api.add_resource(ScheduleApi,
'/api/schedules',
'/api/schedules/<string:id>')
api.add_resource(SiteApi,
'/api/sites',
'/api/sites/<string:id>')
def monitor_nodes_status(celery_app):

View File

@@ -13,7 +13,7 @@ class DbManager(object):
"""
def __init__(self):
self.mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
self.mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT, connect=False)
self.db = self.mongo[MONGO_DB]
def save(self, col_name: str, item: dict, **kwargs) -> None:
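A brief note on the connect=False change above: MongoClient connects eagerly by default, and that connection is not fork-safe; deferring it until the first operation lets each forked process (Celery workers, the Flask debug reloader) open its own connection. A minimal sketch, assuming a local MongoDB and the same env-var defaults used elsewhere in this PR:

import os
from pymongo import MongoClient

MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'

# connect=False defers the TCP connection until the first actual operation,
# so a client created before a fork does not share sockets with the parent.
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT, connect=False)
db = mongo[MONGO_DB]
site = db.sites.find_one()  # the connection is established lazily here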

crawlab/routes/sites.py Normal file
View File

@@ -0,0 +1,72 @@
import json
from bson import ObjectId
from pymongo import ASCENDING
from db.manager import db_manager
from routes.base import BaseApi
from utils import jsonify
class SiteApi(BaseApi):
col_name = 'sites'
arguments = (
('keyword', str),
('category', str),
)
def get(self, id: str = None, action: str = None):
# action by id
if action is not None:
if not hasattr(self, action):
return {
'status': 'ok',
'code': 400,
'error': 'action "%s" invalid' % action
}, 400
return getattr(self, action)(id)
elif id is not None:
site = db_manager.get(col_name=self.col_name, id=id)
return jsonify(site)
# list sites
args = self.parser.parse_args()
page_size = args.get('page_size') or 10
page_num = args.get('page_num') or 1
filter_str = args.get('filter')
keyword = args.get('keyword')
filter_ = {}
if filter_str is not None:
filter_ = json.loads(filter_str)
if keyword is not None:
filter_['$or'] = [
{'description': {'$regex': keyword}},
{'name': {'$regex': keyword}},
{'domain': {'$regex': keyword}}
]
items = db_manager.list(
col_name=self.col_name,
cond=filter_,
limit=page_size,
skip=page_size * (page_num - 1),
sort_key='rank',
sort_direction=ASCENDING
)
sites = []
for site in items:
# get spider count
site['spider_count'] = db_manager.count('spiders', {'site': site['_id']})
sites.append(site)
return {
'status': 'ok',
'total_count': db_manager.count(self.col_name, filter_),
'page_num': page_num,
'page_size': page_size,
'items': jsonify(sites)
}
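For reference, a minimal sketch of calling the new endpoint from a client; the base URL and the use of the requests library are assumptions, while the query parameters (keyword, page_num, page_size) and the response fields (status, total_count, items) follow the handler above:

import requests

BASE_URL = 'http://localhost:8000/api'  # assumed dev address of the Flask API

resp = requests.get(BASE_URL + '/sites', params={
    'keyword': 'news',   # matched against name, domain and description via $regex
    'page_num': 1,
    'page_size': 10,
})
data = resp.json()
print(data['status'], data['total_count'])
for site in data['items']:
    # spider_count is computed per site from the spiders collection
    print(site.get('rank'), site.get('name'), site.get('domain'), site.get('spider_count'))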

View File

@@ -61,6 +61,9 @@ class SpiderApi(BaseApi):
# spider schedule cron enabled
('envs', str),
# spider site
('site', str),
)
def get(self, id=None, action=None):
@@ -125,6 +128,12 @@ class SpiderApi(BaseApi):
if last_task is not None:
spider['task_ts'] = last_task['create_ts']
# get site
if spider.get('site') is not None:
site = db_manager.get('sites', spider['site'])
if site is not None:
spider['site_name'] = site['name']
# file stats
stats = get_file_suffix_stats(dir_path)
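The new site field stores a reference to a document in the sites collection, which SpiderApi resolves into site_name as shown above. A hedged pymongo sketch of writing and resolving that reference (the client setup, spider name, and domain are placeholder assumptions):

from pymongo import MongoClient

db = MongoClient('localhost', 27017, connect=False)['crawlab_test']

site = db.sites.find_one({'domain': 'example.com'})    # assumed existing site document
db.spiders.update_one({'name': 'example_spider'},      # assumed spider name
                      {'$set': {'site': site['_id']}})

# resolving the reference, mirroring the lookup in SpiderApi.get:
spider = db.spiders.find_one({'name': 'example_spider'})
if spider and spider.get('site') is not None:
    linked = db.sites.find_one({'_id': spider['site']})
    if linked is not None:
        print(linked['name'])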

View File

@@ -36,7 +36,6 @@ class TaskApi(BaseApi):
'code': 400,
'error': 'action "%s" invalid' % action
}, 400
# other.info(f"到这了{action},{id}")
return getattr(self, action)(id)
elif id is not None:
@@ -78,9 +77,6 @@ class TaskApi(BaseApi):
sort_key='create_ts')
items = []
for task in tasks:
# celery tasks
# _task = db_manager.get('tasks_celery', id=task['_id'])
# get spider
_spider = db_manager.get(col_name='spiders', id=str(task['spider_id']))

View File

@@ -9,7 +9,7 @@ from db.manager import db_manager
class Scheduler(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT, connect=False)
task_col = 'apscheduler_jobs'
# scheduler jobstore

View File

@@ -1,6 +1,6 @@
{
"name": "crawlab",
"version": "0.1.0",
"version": "0.2.0",
"private": true,
"scripts": {
"serve": "cross-env NODE_ENV=development vue-cli-service serve --ip=0.0.0.0",

View File

@@ -23,6 +23,14 @@
<el-input v-model="spiderForm.col" :placeholder="$t('Results Collection')"
:disabled="isView"></el-input>
</el-form-item>
<el-form-item :label="$t('Site')">
<el-autocomplete v-model="spiderForm.site"
:placeholder="$t('Site')"
:fetch-suggestions="fetchSiteSuggestions"
clearable
@select="onSiteSelect">
</el-autocomplete>
</el-form-item>
<el-form-item :label="$t('Spider Type')">
<el-select v-model="spiderForm.type" :placeholder="$t('Spider Type')" :disabled="isView" clearable>
<el-option value="scrapy" label="Scrapy"></el-option>
@@ -38,26 +46,6 @@
<el-option value="go" label="Go"></el-option>
</el-select>
</el-form-item>
<!--<el-form-item :label="$t('Schedule Enabled')">-->
<!--<el-switch v-model="spiderForm.cron_enabled" :disabled="isView">-->
<!--</el-switch>-->
<!--</el-form-item>-->
<!--<el-form-item :label="$t('Schedule Cron')" v-if="spiderForm.cron_enabled"-->
<!--prop="cron"-->
<!--:rules="cronRules"-->
<!--:inline-message="true">-->
<!--<template slot="label">-->
<!--<el-tooltip :content="$t('Cron Format: [second] [minute] [hour] [day of month] [month] [day of week]')"-->
<!--placement="top">-->
<!--<span>-->
<!--{{$t('Schedule Cron')}}-->
<!--<i class="fa fa-exclamation-circle"></i>-->
<!--</span>-->
<!--</el-tooltip>-->
<!--</template>-->
<!--<el-input v-model="spiderForm.cron" :placeholder="$t('Schedule Cron')"-->
<!--:disabled="isView"></el-input>-->
<!--</el-form-item>-->
</el-form>
</el-row>
<el-row class="button-container" v-if="!isView">
@@ -172,6 +160,22 @@ export default {
})
}
})
},
fetchSiteSuggestions (keyword, callback) {
this.$request.get('/sites', {
keyword: keyword,
page_num: 1,
page_size: 100
}).then(response => {
const data = response.data.items.map(d => {
d.value = `${d.name} | ${d.domain}`
return d
})
callback(data)
})
},
onSiteSelect (item) {
this.spiderForm.site = item._id
}
}
}
@@ -187,4 +191,8 @@ export default {
width: 100%;
text-align: right;
}
.el-autocomplete {
width: 100%;
}
</style>

View File

@@ -10,6 +10,7 @@ export default {
'Task Detail': '任务详情',
'Schedules': '定时任务',
'Deploys': '部署',
'Sites': '网站',
// 标签
Overview: '概览',
@@ -70,7 +71,7 @@ export default {
// 节点状态
Online: '在线',
Offline: '离线',
Unavailable: '未知',
// 爬虫
@@ -130,6 +131,15 @@ export default {
'Parameters': '参数',
'Add Schedule': '添加定时任务',
// 网站
'Site': '网站',
'Rank': '排名',
'Domain': '域名',
'Category': '类别',
'Select': '请选择',
'Select Category': '请选择类别',
'Spider Count': '爬虫数',
// 文件
'Choose Folder': '选择文件',

View File

@@ -183,6 +183,26 @@ export const constantRouterMap = [
}
]
},
{
name: 'Site',
path: '/sites',
component: Layout,
meta: {
title: 'Site',
icon: 'fa fa-sitemap'
},
children: [
{
path: '',
name: 'SiteList',
component: () => import('../views/site/SiteList'),
meta: {
title: 'Sites',
icon: 'fa fa-sitemap'
}
}
]
},
{ path: '*', redirect: '/404', hidden: true }
]

View File

@@ -11,6 +11,7 @@ import task from './modules/task'
import file from './modules/file'
import schedule from './modules/schedule'
import lang from './modules/lang'
import site from './modules/site'
import getters from './getters'
Vue.use(Vuex)
@@ -27,7 +28,8 @@ const store = new Vuex.Store({
task,
file,
schedule,
lang
lang,
site
},
getters
})

View File

@@ -0,0 +1,67 @@
import request from '../../api/request'
const state = {
siteList: [],
// filter
filter: {
category: undefined
},
keyword: '',
// pagination
pageNum: 1,
pageSize: 10,
totalCount: 0
}
const getters = {}
const mutations = {
SET_KEYWORD (state, value) {
state.keyword = value
},
SET_SITE_LIST (state, value) {
state.siteList = value
},
SET_PAGE_NUM (state, value) {
state.pageNum = value
},
SET_PAGE_SIZE (state, value) {
state.pageSize = value
},
SET_TOTAL_COUNT (state, value) {
state.totalCount = value
}
}
const actions = {
editSite ({ state, dispatch }, payload) {
const { id, category } = payload
return request.post(`/sites/${id}`, {
category
})
},
getSiteList ({ state, commit }) {
return request.get('/sites', {
page_num: state.pageNum,
page_size: state.pageSize,
keyword: state.keyword || undefined,
filter: {
category: state.filter.category || undefined
}
})
.then(response => {
commit('SET_SITE_LIST', response.data.items)
commit('SET_TOTAL_COUNT', response.data.total_count)
})
}
}
export default {
namespaced: true,
state,
getters,
mutations,
actions
}

View File

@@ -55,7 +55,7 @@ const mutations = {
},
SET_NODE_STATS (state, value) {
state.nodeStats = value
},
}
}
const actions = {
@@ -74,7 +74,8 @@ const actions = {
lang: state.spiderForm.lang,
col: state.spiderForm.col,
cron: state.spiderForm.cron,
cron_enabled: state.spiderForm.cron_enabled ? 1 : 0
cron_enabled: state.spiderForm.cron_enabled ? 1 : 0,
site: state.spiderForm.site
})
.then(() => {
dispatch('getSpiderList')
@@ -89,7 +90,8 @@ const actions = {
lang: state.spiderForm.lang,
col: state.spiderForm.col,
cron: state.spiderForm.cron,
cron_enabled: state.spiderForm.cron_enabled ? 1 : 0
cron_enabled: state.spiderForm.cron_enabled ? 1 : 0,
site: state.spiderForm.site
})
.then(() => {
dispatch('getSpiderList')

View File

@@ -0,0 +1,205 @@
<template>
<div class="app-container">
<!--filter-->
<div class="filter">
<el-input prefix-icon="el-icon-search"
:placeholder="$t('Search')"
class="filter-search"
v-model="keyword">
</el-input>
<el-select v-model="filter.category" class="filter-category" :placeholder="$t('Select Category')" clearable>
<el-option v-for="op in categoryList" :key="op" :value="op" :label="op"></el-option>
</el-select>
<el-button type="success"
icon="el-icon-refresh"
class="btn refresh"
@click="onSearch">
{{$t('Search')}}
</el-button>
</div>
<!--table list-->
<el-table :data="siteList"
class="table"
:header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
border>
<template v-for="col in columns">
<el-table-column v-if="col.name === 'category'"
:key="col.name"
:label="$t(col.label)"
:width="col.width"
:align="col.align">
<template slot-scope="scope">
<el-select v-model="scope.row[col.name]"
:placeholder="$t('Select')"
@change="onRowChange(scope.row)">
<el-option v-for="op in categoryList"
:key="op"
:value="op"
:label="op">
</el-option>
</el-select>
</template>
</el-table-column>
<el-table-column v-else-if="col.name === 'domain'"
:key="col.name"
:label="$t(col.label)"
:width="col.width"
:align="col.align">
<template slot-scope="scope">
<a class="domain" :href="'http://' + scope.row[col.name]" target="_blank">
{{scope.row[col.name]}}
</a>
</template>
</el-table-column>
<el-table-column v-else
:key="col.name"
:property="col.name"
:label="$t(col.label)"
:sortable="col.sortable"
:align="col.align || 'center'"
:width="col.width">
</el-table-column>
</template>
<el-table-column :label="$t('Action')" align="left" width="120">
<template slot-scope="scope">
<el-tooltip :content="$t('View')" placement="top">
<el-button type="primary" icon="el-icon-search" size="mini" @click="onView(scope.row)"></el-button>
</el-tooltip>
<!--<el-tooltip :content="$t('Remove')" placement="top">-->
<!--<el-button type="danger" icon="el-icon-delete" size="mini" @click="onRemove(scope.row)"></el-button>-->
<!--</el-tooltip>-->
</template>
</el-table-column>
</el-table>
<div class="pagination">
<el-pagination
@current-change="onPageChange"
@size-change="onPageChange"
:current-page.sync="pageNum"
:page-sizes="[10, 20, 50, 100]"
:page-size.sync="pageSize"
layout="sizes, prev, pager, next"
:total="totalCount">
</el-pagination>
</div>
</div>
</template>
<script>
import {
mapState
} from 'vuex'
export default {
name: 'SiteList',
data () {
return {
categoryList: [
'新闻',
'搜索引擎',
'综合',
'金融',
'购物',
'社交',
'视频',
'音乐',
'资讯',
'政企官网',
'其他'
],
columns: [
{ name: 'rank', label: 'Rank', align: 'center', width: '80' },
{ name: 'name', label: 'Name', align: 'left', width: '120' },
{ name: 'domain', label: 'Domain', align: 'left', width: '150' },
{ name: 'description', label: 'Description', align: 'left' },
{ name: 'category', label: 'Category', align: 'center', width: '180' },
{ name: 'spider_count', label: 'Spider Count', align: 'center', width: '60' }
]
}
},
computed: {
...mapState('site', [
'filter',
'siteList',
'totalCount'
]),
keyword: {
get () {
return this.$store.state.site.keyword
},
set (value) {
this.$store.commit('site/SET_KEYWORD', value)
}
},
pageNum: {
get () {
return this.$store.state.site.pageNum
},
set (value) {
this.$store.commit('site/SET_PAGE_NUM', value)
}
},
pageSize: {
get () {
return this.$store.state.site.pageSize
},
set (value) {
this.$store.commit('site/SET_PAGE_SIZE', value)
}
}
},
methods: {
onSearch () {
this.$store.dispatch('site/getSiteList')
},
onPageChange () {
this.$store.dispatch('site/getSiteList')
},
onRowChange (row) {
this.$store.dispatch('site/editSite', {
id: row.domain,
category: row.category
})
}
},
created () {
this.$store.dispatch('site/getSiteList')
}
}
</script>
<style scoped>
.filter {
display: flex;
}
.filter .filter-search {
width: 180px;
}
.filter .filter-category {
width: 180px;
margin-left: 20px;
}
.filter .btn {
margin-left: 20px;
}
.table {
margin-top: 20px;
}
.table >>> .el-select .el-input__inner {
height: 32px;
}
.table >>> .el-select .el-select__caret {
line-height: 32px;
}
.table >>> .domain {
text-decoration: underline;
}
</style>

View File

@@ -160,8 +160,9 @@ export default {
// tableData,
columns: [
{ name: 'name', label: 'Name', width: 'auto' },
{ name: 'type', label: 'Spider Type', width: '160', sortable: true },
{ name: 'lang', label: 'Language', width: '160', sortable: true },
{ name: 'site_name', label: 'Site', width: '120' },
{ name: 'type', label: 'Spider Type', width: '120', sortable: true },
{ name: 'lang', label: 'Language', width: '120', sortable: true },
{ name: 'task_ts', label: 'Last Run', width: '160' },
{ name: 'last_7d_tasks', label: 'Last 7-Day Tasks', width: '80' },
{ name: 'last_5_errors', label: 'Last 5-Run Errors', width: '80' }

View File

View File

@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ChinazItem(scrapy.Item):
# define the fields for your item here like:
_id = scrapy.Field()
task_id = scrapy.Field()
name = scrapy.Field()
domain = scrapy.Field()
description = scrapy.Field()
rank = scrapy.Field()

View File

@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ChinazSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class ChinazDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)

View File

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'
class MongoPipeline(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
db = mongo[MONGO_DB]
col_name = os.environ.get('CRAWLAB_COLLECTION') or 'sites'
col = db[col_name]
def process_item(self, item, spider):
item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
item['_id'] = item['domain']
if self.col.find_one({'_id': item['_id']}) is None:
self.col.save(item)
return item
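A note on the pipeline above: documents are keyed by domain (_id = domain) and only inserted when absent, so re-running the spider does not duplicate sites. Collection.save is deprecated in recent pymongo releases; below is a sketch of the same insert-if-missing behaviour using update_one with $setOnInsert (a substitute for the deprecated call, not what the pipeline above uses):

import os
from pymongo import MongoClient

MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'


class MongoUpsertPipeline(object):
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT, connect=False)
    db = mongo[MONGO_DB]
    col = db[os.environ.get('CRAWLAB_COLLECTION') or 'sites']

    def process_item(self, item, spider):
        doc = dict(item)
        doc.pop('_id', None)  # _id is supplied by the filter below (the domain)
        doc['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        # $setOnInsert writes the document only when no site with this _id
        # exists yet; an existing document is left untouched.
        self.col.update_one({'_id': item['domain']},
                            {'$setOnInsert': doc},
                            upsert=True)
        return item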

View File

@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
# Scrapy settings for chinaz project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'chinaz'
SPIDER_MODULES = ['chinaz.spiders']
NEWSPIDER_MODULE = 'chinaz.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'chinaz (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'chinaz.middlewares.ChinazSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'chinaz.middlewares.ChinazDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'chinaz.pipelines.MongoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
import scrapy
from chinaz.items import ChinazItem
class ChinazSpiderSpider(scrapy.Spider):
name = 'chinaz_spider'
allowed_domains = ['chinaz.com']
start_urls = ['http://top.chinaz.com/hangye/']
def parse(self, response):
for item in response.css('.listCentent > li'):
name = item.css('h3.rightTxtHead > a::text').extract_first()
domain = item.css('h3.rightTxtHead > span::text').extract_first()
description = item.css('p.RtCInfo::text').extract_first()
rank = item.css('.RtCRateCent > strong::text').extract_first()
rank = int(rank)
yield ChinazItem(
_id=domain,
name=name,
domain=domain,
description=description,
rank=rank,
)
# pagination
a_list = response.css('.ListPageWrap > a::attr("href")').extract()
url = 'http://top.chinaz.com/hangye/' + a_list[-1]
yield scrapy.Request(url=url)
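Crawlab normally launches this spider as a task; for local testing it can also be driven from Python with Scrapy's CrawlerProcess. A sketch, assuming it runs from the spiders/chinaz project directory so get_project_settings() picks up chinaz.settings, with example values for the env vars the pipeline reads:

import os
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Read by chinaz.pipelines.MongoPipeline; the values here are examples.
os.environ.setdefault('MONGO_HOST', 'localhost')
os.environ.setdefault('CRAWLAB_COLLECTION', 'sites')
os.environ.setdefault('CRAWLAB_TASK_ID', 'local-test')

process = CrawlerProcess(get_project_settings())
process.crawl('chinaz_spider')  # the name defined on ChinazSpiderSpider
process.start()                 # blocks until the crawl finishes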

spiders/chinaz/scrapy.cfg Normal file
View File

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = chinaz.settings
[deploy]
#url = http://localhost:6800/
project = chinaz

View File

spiders/jd/jd/items.py Normal file
View File

@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class JdItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field()
price = scrapy.Field()

View File

@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class JdSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class JdDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)

View File

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'
class JdPipeline(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
db = mongo[MONGO_DB]
col_name = os.environ.get('CRAWLAB_COLLECTION') or 'jd_products'
col = db[col_name]
def process_item(self, item, spider):
return item

spiders/jd/jd/settings.py Normal file
View File

@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
# Scrapy settings for jd project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'jd'
SPIDER_MODULES = ['jd.spiders']
NEWSPIDER_MODULE = 'jd.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jd (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'jd.middlewares.JdSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'jd.middlewares.JdDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'jd.pipelines.JdPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
import scrapy
class JdSpiderSpider(scrapy.Spider):
name = 'jd_spider'
allowed_domains = ['jd.com']
start_urls = ['http://jd.com/']
def parse(self, response):
pass
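The jd spider is still the generated skeleton (its parse method only contains pass). A hedged sketch of a parse method that fills JdItem's name and price fields; the CSS selectors are placeholders, not actual jd.com markup:

# -*- coding: utf-8 -*-
import scrapy
from jd.items import JdItem


class JdSpiderSketch(scrapy.Spider):
    """Illustrative only: the selectors below are hypothetical placeholders."""
    name = 'jd_spider_sketch'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

    def parse(self, response):
        # A real implementation would target jd.com's actual product-list markup.
        for product in response.css('.product-item'):
            yield JdItem(
                name=product.css('.product-name::text').get(),
                price=product.css('.product-price::text').get(),
            )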

spiders/jd/scrapy.cfg Normal file
View File

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = jd.settings
[deploy]
#url = http://localhost:6800/
project = jd