From 647fac1efe3e1acfcb1f18f08f6b28b74fb966a9 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Sat, 9 Mar 2019 14:05:14 +0800
Subject: [PATCH] added cron tasks for spiders

---
 README-zh.md | 2 +-
 README.md | 2 +-
 crawlab/app.py | 76 +++++++++++++
 crawlab/config.py | 2 -
 crawlab/constants/manage.py | 1 +
 crawlab/constants/spider.py | 5 +
 crawlab/manage.py | 15 +++
 crawlab/routes/schedules.py | 18 +++
 crawlab/routes/spiders.py | 6 +
 crawlab/tasks/scheduler.py | 53 +++++++++
 crawlab/tasks/spider.py | 6 +-
 frontend/src/App.vue | 4 +
 .../components/InfoView/SpiderInfoView.vue | 35 ++++++
 frontend/src/router/index.js | 21 ++++
 frontend/src/store/modules/spider.js | 13 ++-
 frontend/src/views/schedule/ScheduleList.vue | 15 +++
 spiders/baidu/baidu/__init__.py | 0
 spiders/baidu/baidu/items.py | 14 ---
 spiders/baidu/baidu/middlewares.py | 103 ------------------
 spiders/baidu/baidu/pipelines.py | 11 --
 spiders/baidu/baidu/settings.py | 91 ----------------
 spiders/baidu/baidu/spiders/__init__.py | 4 -
 spiders/baidu/baidu/spiders/baidu_spider.py | 13 ---
 spiders/baidu/scrapy.cfg | 11 --
 spiders/meitui/app.py | 1 -
 spiders/taobao/dump.rdb | Bin 760 -> 0 bytes
 spiders/taobao/scrapy.cfg | 11 --
 spiders/taobao/taobao/__init__.py | 0
 spiders/taobao/taobao/items.py | 13 ---
 spiders/taobao/taobao/middlewares.py | 103 ------------------
 spiders/taobao/taobao/pipelines.py | 12 --
 spiders/taobao/taobao/settings.py | 91 ----------------
 spiders/taobao/taobao/spiders/__init__.py | 4 -
 .../taobao/taobao/spiders/taobao_spider.py | 15 ---
 spiders/toutiao/toutiao_spider.js | 0
 spiders/weixin/weixin_crawler.py | 0
 36 files changed, 263 insertions(+), 508 deletions(-)
 create mode 100644 crawlab/app.py
 create mode 100644 crawlab/routes/schedules.py
 create mode 100644 crawlab/tasks/scheduler.py
 create mode 100644 frontend/src/views/schedule/ScheduleList.vue
 delete mode 100644 spiders/baidu/baidu/__init__.py
 delete mode 100644 spiders/baidu/baidu/items.py
 delete mode 100644 spiders/baidu/baidu/middlewares.py
 delete mode 100644 spiders/baidu/baidu/pipelines.py
 delete mode 100644 spiders/baidu/baidu/settings.py
 delete mode 100644 spiders/baidu/baidu/spiders/__init__.py
 delete mode 100644 spiders/baidu/baidu/spiders/baidu_spider.py
 delete mode 100644 spiders/baidu/scrapy.cfg
 delete mode 100644 spiders/meitui/app.py
 delete mode 100644 spiders/taobao/dump.rdb
 delete mode 100644 spiders/taobao/scrapy.cfg
 delete mode 100644 spiders/taobao/taobao/__init__.py
 delete mode 100644 spiders/taobao/taobao/items.py
 delete mode 100644 spiders/taobao/taobao/middlewares.py
 delete mode 100644 spiders/taobao/taobao/pipelines.py
 delete mode 100644 spiders/taobao/taobao/settings.py
 delete mode 100644 spiders/taobao/taobao/spiders/__init__.py
 delete mode 100644 spiders/taobao/taobao/spiders/taobao_spider.py
 delete mode 100644 spiders/toutiao/toutiao_spider.js
 delete mode 100644 spiders/weixin/weixin_crawler.py

diff --git a/README-zh.md b/README-zh.md
index 9eff0f97..7223aafd 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -14,7 +14,7 @@
 
 ```bash
 # 安装后台类库
-pip install -r ./crawlab/requirements.txt
+pip install -r requirements.txt
 ```
 
 ```bash
diff --git a/README.md b/README.md
index 820f4dd4..bffad3eb 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Celery-based web crawler admin platform for managing distributed web spiders reg
 
 ```bash
 # install the requirements for backend
-pip install -r ./crawlab/requirements.txt
+pip install -r requirements.txt
 ```
 
 ```bash
diff --git a/crawlab/app.py b/crawlab/app.py
new file mode 100644
index
00000000..d72aa40f --- /dev/null +++ b/crawlab/app.py @@ -0,0 +1,76 @@ +import os +import subprocess +import sys +from multiprocessing import Process + +import click +from flask import Flask +from flask_cors import CORS +from flask_restful import Api + +from routes.schedules import ScheduleApi +from tasks.scheduler import scheduler + +file_dir = os.path.dirname(os.path.realpath(__file__)) +root_path = os.path.abspath(os.path.join(file_dir, '.')) +sys.path.append(root_path) + +from config import FLASK_HOST, FLASK_PORT, PROJECT_LOGS_FOLDER, BROKER_URL +from constants.manage import ActionType +from routes.deploys import DeployApi +from routes.files import FileApi +from routes.nodes import NodeApi +from routes.spiders import SpiderApi, SpiderImportApi, SpiderManageApi +from routes.stats import StatsApi +from routes.tasks import TaskApi +from tasks.celery import celery_app + +# flask app instance +app = Flask(__name__) +app.config.from_object('config') + +# init flask api instance +api = Api(app) + +# cors support +CORS(app, supports_credentials=True) + +# reference api routes +api.add_resource(NodeApi, + '/api/nodes', + '/api/nodes/', + '/api/nodes//') +api.add_resource(SpiderImportApi, + '/api/spiders/import/') +api.add_resource(SpiderManageApi, + '/api/spiders/manage/') +api.add_resource(SpiderApi, + '/api/spiders', + '/api/spiders/', + '/api/spiders//') +api.add_resource(DeployApi, + '/api/deploys', + '/api/deploys/', + '/api/deploys//') +api.add_resource(TaskApi, + '/api/tasks', + '/api/tasks/', + '/api/tasks//' + ) +api.add_resource(FileApi, + '/api/files', + '/api/files/') +api.add_resource(StatsApi, + '/api/stats', + '/api/stats/') +api.add_resource(ScheduleApi, + '/api/schedules', + '/api/schedules/') + +if __name__ == '__main__': + # create folder if it does not exist + if not os.path.exists(PROJECT_LOGS_FOLDER): + os.makedirs(PROJECT_LOGS_FOLDER) + + # run app instance + app.run(host=FLASK_HOST, port=FLASK_PORT, threaded=True) diff --git a/crawlab/config.py b/crawlab/config.py index 6b1575af..5b1dee95 100644 --- a/crawlab/config.py +++ b/crawlab/config.py @@ -1,6 +1,4 @@ # project variables -from celery.schedules import crontab - PROJECT_SOURCE_FILE_FOLDER = '/Users/yeqing/projects/crawlab/spiders' PROJECT_DEPLOY_FILE_FOLDER = '/var/crawlab' PROJECT_LOGS_FOLDER = '/var/logs/crawlab' diff --git a/crawlab/constants/manage.py b/crawlab/constants/manage.py index f5447bf2..1c57837d 100644 --- a/crawlab/constants/manage.py +++ b/crawlab/constants/manage.py @@ -2,4 +2,5 @@ class ActionType: APP = 'app' FLOWER = 'flower' WORKER = 'worker' + SCHEDULER = 'scheduler' RUN_ALL = 'run_all' diff --git a/crawlab/constants/spider.py b/crawlab/constants/spider.py index 7595f79e..685e2b07 100644 --- a/crawlab/constants/spider.py +++ b/crawlab/constants/spider.py @@ -12,6 +12,11 @@ class LangType: OTHER = 'other' +class CronEnabled: + ON = 1 + OFF = 0 + + SUFFIX_IGNORE = [ 'pyc' ] diff --git a/crawlab/manage.py b/crawlab/manage.py index 45f07f17..9ea83d3c 100644 --- a/crawlab/manage.py +++ b/crawlab/manage.py @@ -8,6 +8,9 @@ from flask import Flask from flask_cors import CORS from flask_restful import Api +from routes.schedules import ScheduleApi +from tasks.scheduler import scheduler + file_dir = os.path.dirname(os.path.realpath(__file__)) root_path = os.path.abspath(os.path.join(file_dir, '.')) sys.path.append(root_path) @@ -60,6 +63,9 @@ api.add_resource(FileApi, api.add_resource(StatsApi, '/api/stats', '/api/stats/') +api.add_resource(ScheduleApi, + '/api/schedules', + '/api/schedules/') def 
run_app():
@@ -85,10 +91,15 @@ def run_worker():
     celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
 
 
+def run_scheduler():
+    scheduler.run()
+
+
 @click.command()
 @click.argument('action', type=click.Choice([ActionType.APP,
                                              ActionType.FLOWER,
                                              ActionType.WORKER,
+                                             ActionType.SCHEDULER,
                                              ActionType.RUN_ALL]))
 def main(action):
     if action == ActionType.APP:
@@ -97,6 +108,8 @@ def main(action):
         run_flower()
     elif action == ActionType.WORKER:
         run_worker()
+    elif action == ActionType.SCHEDULER:
+        run_scheduler()
     elif action == ActionType.RUN_ALL:
         p_flower = Process(target=run_flower)
         p_flower.start()
@@ -104,6 +117,8 @@ def main(action):
         p_app.start()
         p_worker = Process(target=run_worker)
         p_worker.start()
+        p_scheduler = Process(target=run_scheduler)
+        p_scheduler.start()
 
 
 if __name__ == '__main__':
diff --git a/crawlab/routes/schedules.py b/crawlab/routes/schedules.py
new file mode 100644
index 00000000..1eceabde
--- /dev/null
+++ b/crawlab/routes/schedules.py
@@ -0,0 +1,18 @@
+import json
+
+import requests
+
+from constants.task import TaskStatus
+from db.manager import db_manager
+from routes.base import BaseApi
+from utils import jsonify
+from utils.spider import get_spider_col_fields
+
+
+class ScheduleApi(BaseApi):
+    col_name = 'schedules'
+
+    arguments = (
+        ('cron', str),
+        ('spider_id', str)
+    )
diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index f8a220e4..aaeca318 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -47,6 +47,12 @@ class SpiderApi(BaseApi):
 
         # spider results collection
         ('col', str),
+
+        # spider schedule cron
+        ('cron', str),
+
+        # spider schedule cron enabled
+        ('cron_enabled', int),
     )
 
     def get(self, id=None, action=None):
diff --git a/crawlab/tasks/scheduler.py b/crawlab/tasks/scheduler.py
new file mode 100644
index 00000000..f227dce8
--- /dev/null
+++ b/crawlab/tasks/scheduler.py
@@ -0,0 +1,53 @@
+import requests
+from apscheduler.schedulers.background import BlockingScheduler
+from apscheduler.jobstores.mongodb import MongoDBJobStore
+from pymongo import MongoClient
+
+from config import MONGO_DB, MONGO_HOST, MONGO_PORT
+from constants.spider import CronEnabled
+from db.manager import db_manager
+
+
+class Scheduler(object):
+    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
+
+    jobstores = {
+        'mongo': MongoDBJobStore(database=MONGO_DB,
+                                 collection='apscheduler_jobs',
+                                 client=mongo)
+    }
+
+    scheduler = BlockingScheduler(jobstores=jobstores)
+
+    def execute_spider(self, id: str):
+        r = requests.get('http://localhost:5000/api/spiders/%s/on_crawl' % id)
+
+    def restart(self):
+        self.scheduler.shutdown()
+        self.scheduler.start()
+
+    def update(self):
+        self.scheduler.remove_all_jobs()
+        spiders = db_manager.list('spiders', {'cron_enabled': CronEnabled.ON})
+        for spider in spiders:
+            cron = spider.get('cron')
+            cron_arr = cron.split(' ')
+            second = cron_arr[0]
+            minute = cron_arr[1]
+            hour = cron_arr[2]
+            day = cron_arr[3]
+            month = cron_arr[4]
+            day_of_week = cron_arr[5]
+            self.scheduler.add_job(func=self.execute_spider, trigger='cron', args=(str(spider['_id']),),
+                                   day_of_week=day_of_week, month=month, day=day, hour=hour, minute=minute,
+                                   second=second)
+
+    def run(self):
+        self.update()
+        self.scheduler.start()
+
+
+scheduler = Scheduler()
+
+if __name__ == '__main__':
+    scheduler.run()
diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py
index 26e0faf5..dd8a1c4e 100644
--- a/crawlab/tasks/spider.py
+++ b/crawlab/tasks/spider.py
@@ -1,10 +1,7 @@
 import os
-import sys
 from datetime import datetime
-import requests from bson import ObjectId -from celery import current_app from celery.utils.log import get_logger from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER @@ -52,7 +49,8 @@ def execute_spider(self, id: str): # execute the command env = os.environ.copy() env['CRAWLAB_TASK_ID'] = task_id - env['CRAWLAB_COLLECTION'] = spider.get('col') + if spider.get('col'): + env['CRAWLAB_COLLECTION'] = spider.get('col') p = subprocess.Popen(command.split(' '), stdout=stdout.fileno(), stderr=stderr.fileno(), diff --git a/frontend/src/App.vue b/frontend/src/App.vue index 7b5c9085..38d6c19d 100644 --- a/frontend/src/App.vue +++ b/frontend/src/App.vue @@ -48,4 +48,8 @@ export default { margin-top: 10px; text-align: right; } + + .el-form .el-form-item { + margin-bottom: 10px; + } diff --git a/frontend/src/components/InfoView/SpiderInfoView.vue b/frontend/src/components/InfoView/SpiderInfoView.vue index 4265e58d..6864acd7 100644 --- a/frontend/src/components/InfoView/SpiderInfoView.vue +++ b/frontend/src/components/InfoView/SpiderInfoView.vue @@ -38,6 +38,23 @@ + + + + + + + + @@ -62,9 +79,27 @@ export default { } }, data () { + const cronValidator = (rule, value, callback) => { + let patArr = [] + for (let i = 0; i < 6; i++) { + patArr.push('[/*,0-9]+') + } + const pat = '^' + patArr.join(' ') + '$' + if (this.spiderForm.cron_enabled) { + if (!value) { + callback(new Error('cron cannot be empty')) + } else if (!value.match(pat)) { + callback(new Error('cron format is invalid')) + } + } + callback() + } return { cmdRule: [ { message: 'Execute Command should not be empty', required: true } + ], + cronRules: [ + { validator: cronValidator, trigger: 'blur' } ] } }, diff --git a/frontend/src/router/index.js b/frontend/src/router/index.js index 7da95e4f..46b1e741 100644 --- a/frontend/src/router/index.js +++ b/frontend/src/router/index.js @@ -132,6 +132,27 @@ export const constantRouterMap = [ } ] }, + { + name: 'Schedule', + path: '/schedules', + component: Layout, + meta: { + title: 'Schedules', + icon: 'fa fa-calendar' + }, + hidden: true, + children: [ + { + path: '', + name: 'ScheduleList', + component: () => import('../views/schedule/ScheduleList'), + meta: { + title: 'Schedules', + icon: 'fa fa-calendar' + } + } + ] + }, { name: 'Deploy', path: '/deploys', diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js index 9514e08f..30fb7a69 100644 --- a/frontend/src/store/modules/spider.js +++ b/frontend/src/store/modules/spider.js @@ -48,7 +48,10 @@ const actions = { src: state.spiderForm.src, cmd: state.spiderForm.cmd, type: state.spiderForm.type, - lang: state.spiderForm.lang + lang: state.spiderForm.lang, + col: state.spiderForm.col, + cron: state.spiderForm.cron, + cron_enabled: state.spiderForm.cron_enabled ? 1 : 0 }) .then(() => { dispatch('getSpiderList') @@ -61,7 +64,9 @@ const actions = { cmd: state.spiderForm.cmd, type: state.spiderForm.type, lang: state.spiderForm.lang, - col: state.spiderForm.col + col: state.spiderForm.col, + cron: state.spiderForm.cron, + cron_enabled: state.spiderForm.cron_enabled ? 
1 : 0 }) .then(() => { dispatch('getSpiderList') @@ -76,7 +81,9 @@ const actions = { getSpiderData ({ state, commit }, id) { return request.get(`/spiders/${id}`) .then(response => { - commit('SET_SPIDER_FORM', response.data) + let data = response.data + data.cron_enabled = !!data.cron_enabled + commit('SET_SPIDER_FORM', data) }) }, deploySpider ({ state, dispatch }, id) { diff --git a/frontend/src/views/schedule/ScheduleList.vue b/frontend/src/views/schedule/ScheduleList.vue new file mode 100644 index 00000000..d1b8a5bb --- /dev/null +++ b/frontend/src/views/schedule/ScheduleList.vue @@ -0,0 +1,15 @@ + + + + + diff --git a/spiders/baidu/baidu/__init__.py b/spiders/baidu/baidu/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/spiders/baidu/baidu/items.py b/spiders/baidu/baidu/items.py deleted file mode 100644 index 26b5888c..00000000 --- a/spiders/baidu/baidu/items.py +++ /dev/null @@ -1,14 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class BaiduItem(scrapy.Item): - # define the fields for your item here like: - title = scrapy.Field() - url = scrapy.Field() diff --git a/spiders/baidu/baidu/middlewares.py b/spiders/baidu/baidu/middlewares.py deleted file mode 100644 index 3911485d..00000000 --- a/spiders/baidu/baidu/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class BaiduSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class BaiduDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. 
- s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/spiders/baidu/baidu/pipelines.py b/spiders/baidu/baidu/pipelines.py deleted file mode 100644 index beae9c24..00000000 --- a/spiders/baidu/baidu/pipelines.py +++ /dev/null @@ -1,11 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html - - -class BaiduPipeline(object): - def process_item(self, item, spider): - return item diff --git a/spiders/baidu/baidu/settings.py b/spiders/baidu/baidu/settings.py deleted file mode 100644 index 667b09ca..00000000 --- a/spiders/baidu/baidu/settings.py +++ /dev/null @@ -1,91 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for baidu project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'baidu' - -SPIDER_MODULES = ['baidu.spiders'] -NEWSPIDER_MODULE = 'baidu.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'baidu (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True -# ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'baidu.middlewares.BaiduSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'baidu.middlewares.BaiduDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -#ITEM_PIPELINES = { -# 'baidu.pipelines.BaiduPipeline': 300, -#} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/spiders/baidu/baidu/spiders/__init__.py b/spiders/baidu/baidu/spiders/__init__.py deleted file mode 100644 index ebd689ac..00000000 --- a/spiders/baidu/baidu/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
diff --git a/spiders/baidu/baidu/spiders/baidu_spider.py b/spiders/baidu/baidu/spiders/baidu_spider.py deleted file mode 100644 index f84ffc8d..00000000 --- a/spiders/baidu/baidu/spiders/baidu_spider.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- -from time import sleep - -import scrapy - - -class BaiduSpiderSpider(scrapy.Spider): - name = 'baidu_spider' - allowed_domains = ['baidu.com'] - start_urls = ['http://baidu.com/s?wd=百度'] - - def parse(self, response): - sleep(30) diff --git a/spiders/baidu/scrapy.cfg b/spiders/baidu/scrapy.cfg deleted file mode 100644 index 492b18d1..00000000 --- a/spiders/baidu/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = baidu.settings - -[deploy] -#url = http://localhost:6800/ -project = baidu diff --git a/spiders/meitui/app.py b/spiders/meitui/app.py deleted file mode 100644 index 493ffc46..00000000 --- a/spiders/meitui/app.py +++ /dev/null @@ -1 +0,0 @@ -# /Users/yeqing/projects/crawlab/spiders diff --git a/spiders/taobao/dump.rdb b/spiders/taobao/dump.rdb deleted file mode 100644 index 48df713409ebce5b6617a84280df170eb7137ad1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 760 zcmZvZv2K+y3`TD|bOMQm@&K^9iWA3)cNo|ZJV0f~4qTO9B%l@SycGkF!N3d;fe?_Y z1JW;+#KU)VJ}1wgJbwBjrNqB?zdk>EmG?)x-R|zyLB~b-`Q_F7`0&U$v{kC0%@}AJ zf|)uLvKX~sVzlG?=NE4-?dr_-2j3rFo{e}NZ$BR0yS-0NPfl;A;|Ko@#`T^2@Lwvf z&nzq97!y>h0@c#iBm}K^>5_=k>>Ff`Dz2e0Q(Z-C3$UOGp0ac*9u{qbur(jGErDnS zB39^31&}3Ul93$i2B}4(4I&_~2xO#ST{_^vs_Y@T&<5$sjM&o(_P&aRFN+CSqM13G z7u{OKEsJr5V!Dv%1kzN1bDCUj_Efi)+&KrWI_YKuY9m3XM&QN`r)t3*n_`lwb(@Kw=@%br`G1)UAb!uiB}D-yCFGK!h>~siPL*o@HanhE&t65L0|TTgqVU19;eM zbE0S5T1-+Jf(ZmM*1u2(n>7K3_ZtYCq>ZHq4dKF2eJrFb-&&7hT4br67(@U3{PF4Q Jw=eI${{;yZ)Xe|@ diff --git a/spiders/taobao/scrapy.cfg b/spiders/taobao/scrapy.cfg deleted file mode 100644 index c0e3980d..00000000 --- a/spiders/taobao/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = taobao.settings - -[deploy] -#url = http://localhost:6800/ -project = taobao diff --git a/spiders/taobao/taobao/__init__.py b/spiders/taobao/taobao/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/spiders/taobao/taobao/items.py b/spiders/taobao/taobao/items.py deleted file mode 100644 index 199c1f82..00000000 --- a/spiders/taobao/taobao/items.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class TaobaoItem(scrapy.Item): - # define the fields for your item here like: - name = scrapy.Field() diff --git a/spiders/taobao/taobao/middlewares.py b/spiders/taobao/taobao/middlewares.py deleted file mode 100644 index afc752ba..00000000 --- a/spiders/taobao/taobao/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class TaobaoSpiderMiddleware(object): - # Not all methods need to be defined. 
If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class TaobaoDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/spiders/taobao/taobao/pipelines.py b/spiders/taobao/taobao/pipelines.py deleted file mode 100644 index 7ddf8da5..00000000 --- a/spiders/taobao/taobao/pipelines.py +++ /dev/null @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html - - -class TaobaoPipeline(object): - def process_item(self, item, spider): - print('task_id: %s' % spider.task_id) - return item diff --git a/spiders/taobao/taobao/settings.py b/spiders/taobao/taobao/settings.py deleted file mode 100644 index 0e237049..00000000 --- a/spiders/taobao/taobao/settings.py +++ /dev/null @@ -1,91 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for taobao project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'taobao' - -SPIDER_MODULES = ['taobao.spiders'] -NEWSPIDER_MODULE = 'taobao.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'taobao (+http://www.yourdomain.com)' - -# Obey robots.txt rules -# ROBOTSTXT_OBEY = True -ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'taobao.middlewares.TaobaoSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'taobao.middlewares.TaobaoDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -#ITEM_PIPELINES = { -# 'taobao.pipelines.TaobaoPipeline': 300, -#} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in 
case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/spiders/taobao/taobao/spiders/__init__.py b/spiders/taobao/taobao/spiders/__init__.py deleted file mode 100644 index ebd689ac..00000000 --- a/spiders/taobao/taobao/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/spiders/taobao/taobao/spiders/taobao_spider.py b/spiders/taobao/taobao/spiders/taobao_spider.py deleted file mode 100644 index 2a939a06..00000000 --- a/spiders/taobao/taobao/spiders/taobao_spider.py +++ /dev/null @@ -1,15 +0,0 @@ -# -*- coding: utf-8 -*- -import os - -import scrapy - -from ..items import TaobaoItem - - -class TaobaoSpiderSpider(scrapy.Spider): - name = 'taobao_spider' - allowed_domains = ['taobao.com'] - start_urls = ['http://taobao.com/'] - - def parse(self, response): - yield TaobaoItem() diff --git a/spiders/toutiao/toutiao_spider.js b/spiders/toutiao/toutiao_spider.js deleted file mode 100644 index e69de29b..00000000 diff --git a/spiders/weixin/weixin_crawler.py b/spiders/weixin/weixin_crawler.py deleted file mode 100644 index e69de29b..00000000
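Note on the new scheduler: `crawlab/tasks/scheduler.py` expects a six-field cron expression per spider, in the order `second minute hour day month day_of_week`, and passes the fields straight to APScheduler's cron trigger. (`BlockingScheduler` canonically lives in `apscheduler.schedulers.blocking`; importing it from `.background` as the patch does relies on an incidental re-export.) The sketch below is illustrative only, not code from the patch: it mirrors `Scheduler.update()` and `execute_spider()` with the default in-memory jobstore instead of the MongoDB jobstore, and the spider id and cron string are made up.

```python
# Illustrative sketch (not part of the patch): how a spider's six-field cron
# string is mapped onto an APScheduler cron job, mirroring Scheduler.update().
import requests
from apscheduler.schedulers.blocking import BlockingScheduler

scheduler = BlockingScheduler()  # default in-memory jobstore, for brevity


def execute_spider(spider_id: str):
    # Same trigger the patch uses: ask the local API to start a crawl task.
    requests.get('http://localhost:5000/api/spiders/%s/on_crawl' % spider_id)


def add_spider_job(spider: dict):
    # Field order assumed by the patch: second minute hour day month day_of_week.
    second, minute, hour, day, month, day_of_week = spider['cron'].split(' ')
    scheduler.add_job(func=execute_spider,
                      trigger='cron',
                      args=(str(spider['_id']),),
                      second=second, minute=minute, hour=hour,
                      day=day, month=month, day_of_week=day_of_week)


if __name__ == '__main__':
    # Hypothetical spider document with cron enabled: crawl every 30 seconds.
    add_spider_job({'_id': '5c8300000000000000000000', 'cron': '*/30 * * * * *'})
    scheduler.start()  # blocks, which is why manage.py runs the scheduler in its own Process
```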
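The cron validator added to `SpiderInfoView.vue` only checks the shape of the value — six space-separated fields built from digits, `*`, `/` and `,` — and the backend splits the string without re-validating it. A minimal Python mirror of that format check (an illustrative sketch, not code from this patch) is shown below; APScheduler will still reject malformed or out-of-range field values when the job is registered, so this is only a cheap pre-check.

```python
import re

# Same shape the frontend validator enforces: six space-separated fields,
# each limited to digits, '*', '/' and ','.
CRON_PATTERN = re.compile(r'^' + ' '.join([r'[/*,0-9]+'] * 6) + r'$')


def is_valid_cron(cron: str) -> bool:
    return bool(CRON_PATTERN.match(cron or ''))


assert is_valid_cron('0 */5 * * * *')        # every five minutes
assert not is_valid_cron('*/5 * * * *')      # only five fields -> rejected
assert not is_valid_cron('0 5 * * * mon')    # day names are not accepted by this pattern
```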