Mirror of https://github.com/crawlab-team/crawlab.git (synced 2026-01-21)
added cron tasks for spiders
@@ -14,7 +14,7 @@

 ```bash
 # install the backend libraries
-pip install -r ./crawlab/requirements.txt
+pip install -r requirements.txt
 ```

 ```bash
@@ -15,7 +15,7 @@ Celery-based web crawler admin platform for managing distributed web spiders reg

 ```bash
 # install the requirements for backend
-pip install -r ./crawlab/requirements.txt
+pip install -r requirements.txt
 ```

 ```bash
crawlab/app.py (new file, 76 lines)
@@ -0,0 +1,76 @@
import os
import subprocess
import sys
from multiprocessing import Process

import click
from flask import Flask
from flask_cors import CORS
from flask_restful import Api

from routes.schedules import ScheduleApi
from tasks.scheduler import scheduler

file_dir = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.abspath(os.path.join(file_dir, '.'))
sys.path.append(root_path)

from config import FLASK_HOST, FLASK_PORT, PROJECT_LOGS_FOLDER, BROKER_URL
from constants.manage import ActionType
from routes.deploys import DeployApi
from routes.files import FileApi
from routes.nodes import NodeApi
from routes.spiders import SpiderApi, SpiderImportApi, SpiderManageApi
from routes.stats import StatsApi
from routes.tasks import TaskApi
from tasks.celery import celery_app

# flask app instance
app = Flask(__name__)
app.config.from_object('config')

# init flask api instance
api = Api(app)

# cors support
CORS(app, supports_credentials=True)

# reference api routes
api.add_resource(NodeApi,
                 '/api/nodes',
                 '/api/nodes/<string:id>',
                 '/api/nodes/<string:id>/<string:action>')
api.add_resource(SpiderImportApi,
                 '/api/spiders/import/<string:platform>')
api.add_resource(SpiderManageApi,
                 '/api/spiders/manage/<string:action>')
api.add_resource(SpiderApi,
                 '/api/spiders',
                 '/api/spiders/<string:id>',
                 '/api/spiders/<string:id>/<string:action>')
api.add_resource(DeployApi,
                 '/api/deploys',
                 '/api/deploys/<string:id>',
                 '/api/deploys/<string:id>/<string:action>')
api.add_resource(TaskApi,
                 '/api/tasks',
                 '/api/tasks/<string:id>',
                 '/api/tasks/<string:id>/<string:action>'
                 )
api.add_resource(FileApi,
                 '/api/files',
                 '/api/files/<string:action>')
api.add_resource(StatsApi,
                 '/api/stats',
                 '/api/stats/<string:action>')
api.add_resource(ScheduleApi,
                 '/api/schedules',
                 '/api/schedules/<string:id>')

if __name__ == '__main__':
    # create folder if it does not exist
    if not os.path.exists(PROJECT_LOGS_FOLDER):
        os.makedirs(PROJECT_LOGS_FOLDER)

    # run app instance
    app.run(host=FLASK_HOST, port=FLASK_PORT, threaded=True)
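The ScheduleApi registered above exposes the schedules collection over HTTP. A minimal sketch (not part of this commit) of exercising that route once the app is running, assuming the API is reachable on localhost:5000, the same address the scheduler in this commit uses:

import requests

# Hit the new /api/schedules route registered in app.py above.
resp = requests.get('http://localhost:5000/api/schedules')
print(resp.status_code)
print(resp.text)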
@@ -1,6 +1,4 @@
 # project variables
-from celery.schedules import crontab
-
 PROJECT_SOURCE_FILE_FOLDER = '/Users/yeqing/projects/crawlab/spiders'
 PROJECT_DEPLOY_FILE_FOLDER = '/var/crawlab'
 PROJECT_LOGS_FOLDER = '/var/logs/crawlab'
@@ -2,4 +2,5 @@ class ActionType:
     APP = 'app'
     FLOWER = 'flower'
     WORKER = 'worker'
+    SCHEDULER = 'scheduler'
     RUN_ALL = 'run_all'
@@ -12,6 +12,11 @@ class LangType:
     OTHER = 'other'


+class CronEnabled:
+    ON = 1
+    OFF = 0
+
+
 SUFFIX_IGNORE = [
     'pyc'
 ]
@@ -8,6 +8,9 @@ from flask import Flask
 from flask_cors import CORS
 from flask_restful import Api

+from routes.schedules import ScheduleApi
+from tasks.scheduler import scheduler
+
 file_dir = os.path.dirname(os.path.realpath(__file__))
 root_path = os.path.abspath(os.path.join(file_dir, '.'))
 sys.path.append(root_path)
@@ -60,6 +63,9 @@ api.add_resource(FileApi,
 api.add_resource(StatsApi,
                  '/api/stats',
                  '/api/stats/<string:action>')
+api.add_resource(ScheduleApi,
+                 '/api/schedules',
+                 '/api/schedules/<string:id>')


 def run_app():
@@ -85,10 +91,15 @@ def run_worker():
     celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])


+def run_scheduler():
+    scheduler.run()
+
+
 @click.command()
 @click.argument('action', type=click.Choice([ActionType.APP,
                                              ActionType.FLOWER,
                                              ActionType.WORKER,
+                                             ActionType.SCHEDULER,
                                              ActionType.RUN_ALL]))
 def main(action):
     if action == ActionType.APP:
@@ -97,6 +108,8 @@ def main(action):
         run_flower()
     elif action == ActionType.WORKER:
         run_worker()
+    elif action == ActionType.SCHEDULER:
+        run_scheduler()
     elif action == ActionType.RUN_ALL:
         p_flower = Process(target=run_flower)
         p_flower.start()
@@ -104,6 +117,8 @@ def main(action):
         p_app.start()
         p_worker = Process(target=run_worker)
         p_worker.start()
+        p_scheduler = Process(target=run_scheduler)
+        p_scheduler.start()


 if __name__ == '__main__':
crawlab/routes/schedules.py (new file, 18 lines)
@@ -0,0 +1,18 @@
import json

import requests

from constants.task import TaskStatus
from db.manager import db_manager
from routes.base import BaseApi
from utils import jsonify
from utils.spider import get_spider_col_fields


class ScheduleApi(BaseApi):
    col_name = 'schedules'

    arguments = (
        ('cron', str),
        ('spider_id', str)
    )
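ScheduleApi only declares the `cron` and `spider_id` arguments; request parsing and persistence come from BaseApi, which is not part of this diff. A rough sketch of the document shape this implies for the `schedules` collection, with placeholder values:

# Hypothetical document in the 'schedules' collection; values are illustrative only.
schedule = {
    'spider_id': '<spider ObjectId as a string>',
    'cron': '0 0 3 * * *',  # six fields: second minute hour day month day_of_week
}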
@@ -47,6 +47,12 @@ class SpiderApi(BaseApi):

         # spider results collection
         ('col', str),
+
+        # spider schedule cron
+        ('cron', str),
+
+        # spider schedule cron enabled
+        ('cron_enabled', int),
     )

     def get(self, id=None, action=None):
crawlab/tasks/scheduler.py (new file, 53 lines)
@@ -0,0 +1,53 @@
import requests
from apscheduler.schedulers.background import BlockingScheduler
from apscheduler.jobstores.mongodb import MongoDBJobStore
from pymongo import MongoClient

from config import MONGO_DB, MONGO_HOST, MONGO_PORT
from constants.spider import CronEnabled
from db.manager import db_manager


class Scheduler(object):
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)

    jobstores = {
        'mongo': MongoDBJobStore(database=MONGO_DB,
                                 collection='apscheduler_jobs',
                                 client=mongo)
    }

    scheduler = BlockingScheduler(jobstores=jobstores)

    def execute_spider(self, id: str):
        r = requests.get('http://localhost:5000/api/spiders/%s/on_crawl' % id)

    def restart(self):
        self.scheduler.shutdown()
        self.scheduler.start()

    def update(self):
        self.scheduler.remove_all_jobs()
        spiders = db_manager.list('spiders', {'cron_enabled': CronEnabled.ON})
        for spider in spiders:
            cron = spider.get('cron')
            cron_arr = cron.split(' ')
            second = cron_arr[0]
            minute = cron_arr[1]
            hour = cron_arr[2]
            day = cron_arr[3]
            month = cron_arr[4]
            day_of_week = cron_arr[5]
            self.scheduler.add_job(func=self.execute_spider, trigger='cron', args=(str(spider['_id']),),
                                   day_of_week=day_of_week, month=month, day=day, hour=hour, minute=minute,
                                   second=second)

    def run(self):
        self.update()
        self.scheduler.start()


scheduler = Scheduler()

if __name__ == '__main__':
    scheduler.run()
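Scheduler.update() above splits each spider's six-field cron expression (second, minute, hour, day of month, month, day of week) and passes the fields to APScheduler's cron trigger. A minimal, self-contained sketch of that same mapping, not part of this commit, using an illustrative expression that fires every ten minutes:

from apscheduler.schedulers.blocking import BlockingScheduler


def tick():
    print('spider would be triggered here')


cron = '0 */10 * * * *'  # second minute hour day month day_of_week
second, minute, hour, day, month, day_of_week = cron.split(' ')

sched = BlockingScheduler()
sched.add_job(tick, trigger='cron', second=second, minute=minute, hour=hour,
              day=day, month=month, day_of_week=day_of_week)
# sched.start()  # start() blocks, so it is left commented out in this sketch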
@@ -1,10 +1,7 @@
import os
import sys
from datetime import datetime

import requests
from bson import ObjectId
from celery import current_app
from celery.utils.log import get_logger

from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER
@@ -52,7 +49,8 @@ def execute_spider(self, id: str):
     # execute the command
     env = os.environ.copy()
     env['CRAWLAB_TASK_ID'] = task_id
-    env['CRAWLAB_COLLECTION'] = spider.get('col')
+    if spider.get('col'):
+        env['CRAWLAB_COLLECTION'] = spider.get('col')
     p = subprocess.Popen(command.split(' '),
                          stdout=stdout.fileno(),
                          stderr=stderr.fileno(),
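With this change CRAWLAB_COLLECTION is exported only when the spider has a results collection configured, while CRAWLAB_TASK_ID is always set. A short sketch, not part of this commit, of how a spider process on the receiving end might read these variables; the fallback collection name is an assumption, not something defined here:

import os

task_id = os.environ.get('CRAWLAB_TASK_ID')
# CRAWLAB_COLLECTION may now be absent, so downstream code needs a fallback.
collection = os.environ.get('CRAWLAB_COLLECTION', 'results')  # 'results' is a made-up default
print(task_id, collection)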
@@ -48,4 +48,8 @@ export default {
   margin-top: 10px;
   text-align: right;
 }
+
+.el-form .el-form-item {
+  margin-bottom: 10px;
+}
 </style>
@@ -38,6 +38,23 @@
         <el-option value="go" label="Go"></el-option>
       </el-select>
     </el-form-item>
+    <el-form-item label="Schedule Enabled">
+      <el-switch v-model="spiderForm.cron_enabled" :disabled="isView">
+      </el-switch>
+    </el-form-item>
+    <el-form-item label="Schedule Cron" v-if="spiderForm.cron_enabled" prop="cron" :rules="cronRules">
+      <template slot="label">
+        <el-tooltip content="Cron Format: [second] [minute] [hour] [day of month] [month] [day of week]"
+                    placement="top">
+          <span>
+            Schedule Cron
+            <i class="fa fa-exclamation-circle"></i>
+          </span>
+        </el-tooltip>
+      </template>
+      <el-input v-model="spiderForm.cron" placeholder="Schedule Cron"
+                :disabled="isView"></el-input>
+    </el-form-item>
   </el-form>
 </el-row>
 <el-row class="button-container" v-if="!isView">
@@ -62,9 +79,27 @@ export default {
       }
     },
     data () {
+      const cronValidator = (rule, value, callback) => {
+        let patArr = []
+        for (let i = 0; i < 6; i++) {
+          patArr.push('[/*,0-9]+')
+        }
+        const pat = '^' + patArr.join(' ') + '$'
+        if (this.spiderForm.cron_enabled) {
+          if (!value) {
+            callback(new Error('cron cannot be empty'))
+          } else if (!value.match(pat)) {
+            callback(new Error('cron format is invalid'))
+          }
+        }
+        callback()
+      }
       return {
         cmdRule: [
           { message: 'Execute Command should not be empty', required: true }
         ],
+        cronRules: [
+          { validator: cronValidator, trigger: 'blur' }
+        ]
       }
     },
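The cronValidator above accepts exactly six space-separated fields, each built from digits, '*', '/', and ','. The same pattern expressed in Python, for reference only; the sample expressions are illustrative:

import re

# Mirror of the front-end pattern: six fields of [/*,0-9]+ separated by spaces.
CRON_PATTERN = re.compile('^' + ' '.join(['[/*,0-9]+'] * 6) + '$')

assert CRON_PATTERN.match('0 */10 * * * *')      # every ten minutes
assert not CRON_PATTERN.match('*/10 * * * *')    # only five fields: rejected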
@@ -132,6 +132,27 @@ export const constantRouterMap = [
       }
     ]
   },
+  {
+    name: 'Schedule',
+    path: '/schedules',
+    component: Layout,
+    meta: {
+      title: 'Schedules',
+      icon: 'fa fa-calendar'
+    },
+    hidden: true,
+    children: [
+      {
+        path: '',
+        name: 'ScheduleList',
+        component: () => import('../views/schedule/ScheduleList'),
+        meta: {
+          title: 'Schedules',
+          icon: 'fa fa-calendar'
+        }
+      }
+    ]
+  },
   {
     name: 'Deploy',
     path: '/deploys',
@@ -48,7 +48,10 @@ const actions = {
       src: state.spiderForm.src,
       cmd: state.spiderForm.cmd,
       type: state.spiderForm.type,
-      lang: state.spiderForm.lang
+      lang: state.spiderForm.lang,
+      col: state.spiderForm.col,
+      cron: state.spiderForm.cron,
+      cron_enabled: state.spiderForm.cron_enabled ? 1 : 0
     })
       .then(() => {
         dispatch('getSpiderList')
@@ -61,7 +64,9 @@ const actions = {
       cmd: state.spiderForm.cmd,
       type: state.spiderForm.type,
       lang: state.spiderForm.lang,
-      col: state.spiderForm.col
+      col: state.spiderForm.col,
+      cron: state.spiderForm.cron,
+      cron_enabled: state.spiderForm.cron_enabled ? 1 : 0
     })
       .then(() => {
        dispatch('getSpiderList')
@@ -76,7 +81,9 @@ const actions = {
   getSpiderData ({ state, commit }, id) {
     return request.get(`/spiders/${id}`)
       .then(response => {
-        commit('SET_SPIDER_FORM', response.data)
+        let data = response.data
+        data.cron_enabled = !!data.cron_enabled
+        commit('SET_SPIDER_FORM', data)
       })
   },
   deploySpider ({ state, dispatch }, id) {
frontend/src/views/schedule/ScheduleList.vue (new file, 15 lines)
@@ -0,0 +1,15 @@
<template>
  <div class="app-container">
    Schedule List
  </div>
</template>

<script>
export default {
  name: 'ScheduleList'
}
</script>

<style scoped>

</style>
@@ -1,14 +0,0 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BaiduItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    url = scrapy.Field()
@@ -1,103 +0,0 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class BaiduSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class BaiduDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
@@ -1,11 +0,0 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BaiduPipeline(object):
    def process_item(self, item, spider):
        return item
@@ -1,91 +0,0 @@
# -*- coding: utf-8 -*-

# Scrapy settings for baidu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'baidu'

SPIDER_MODULES = ['baidu.spiders']
NEWSPIDER_MODULE = 'baidu.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'baidu (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'baidu.middlewares.BaiduSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'baidu.middlewares.BaiduDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'baidu.pipelines.BaiduPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -1,13 +0,0 @@
# -*- coding: utf-8 -*-
from time import sleep

import scrapy


class BaiduSpiderSpider(scrapy.Spider):
    name = 'baidu_spider'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/s?wd=百度']

    def parse(self, response):
        sleep(30)
@@ -1,11 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = baidu.settings

[deploy]
#url = http://localhost:6800/
project = baidu
@@ -1 +0,0 @@
# /Users/yeqing/projects/crawlab/spiders
Binary file not shown.
@@ -1,11 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = taobao.settings

[deploy]
#url = http://localhost:6800/
project = taobao
@@ -1,13 +0,0 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TaobaoItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
@@ -1,103 +0,0 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class TaobaoSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TaobaoDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
@@ -1,12 +0,0 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TaobaoPipeline(object):
    def process_item(self, item, spider):
        print('task_id: %s' % spider.task_id)
        return item
@@ -1,91 +0,0 @@
# -*- coding: utf-8 -*-

# Scrapy settings for taobao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'taobao'

SPIDER_MODULES = ['taobao.spiders']
NEWSPIDER_MODULE = 'taobao.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'taobao (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'taobao.middlewares.TaobaoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'taobao.middlewares.TaobaoDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'taobao.pipelines.TaobaoPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -1,15 +0,0 @@
# -*- coding: utf-8 -*-
import os

import scrapy

from ..items import TaobaoItem


class TaobaoSpiderSpider(scrapy.Spider):
    name = 'taobao_spider'
    allowed_domains = ['taobao.com']
    start_urls = ['http://taobao.com/']

    def parse(self, response):
        yield TaobaoItem()