From 647fac1efe3e1acfcb1f18f08f6b28b74fb966a9 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Sat, 9 Mar 2019 14:05:14 +0800
Subject: [PATCH] added cron tasks for spiders

---
 README-zh.md | 2 +-
 README.md | 2 +-
 crawlab/app.py | 76 +++++++++++++
 crawlab/config.py | 2 -
 crawlab/constants/manage.py | 1 +
 crawlab/constants/spider.py | 5 +
 crawlab/manage.py | 15 +++
 crawlab/routes/schedules.py | 18 +++
 crawlab/routes/spiders.py | 6 +
 crawlab/tasks/scheduler.py | 53 +++++++++
 crawlab/tasks/spider.py | 6 +-
 frontend/src/App.vue | 4 +
 .../components/InfoView/SpiderInfoView.vue | 35 ++++++
 frontend/src/router/index.js | 21 ++++
 frontend/src/store/modules/spider.js | 13 ++-
 frontend/src/views/schedule/ScheduleList.vue | 15 +++
 spiders/baidu/baidu/__init__.py | 0
 spiders/baidu/baidu/items.py | 14 ---
 spiders/baidu/baidu/middlewares.py | 103 ------------------
 spiders/baidu/baidu/pipelines.py | 11 --
 spiders/baidu/baidu/settings.py | 91 ----------------
 spiders/baidu/baidu/spiders/__init__.py | 4 -
 spiders/baidu/baidu/spiders/baidu_spider.py | 13 ---
 spiders/baidu/scrapy.cfg | 11 --
 spiders/meitui/app.py | 1 -
 spiders/taobao/dump.rdb | Bin 760 -> 0 bytes
 spiders/taobao/scrapy.cfg | 11 --
 spiders/taobao/taobao/__init__.py | 0
 spiders/taobao/taobao/items.py | 13 ---
 spiders/taobao/taobao/middlewares.py | 103 ------------------
 spiders/taobao/taobao/pipelines.py | 12 --
 spiders/taobao/taobao/settings.py | 91 ----------------
 spiders/taobao/taobao/spiders/__init__.py | 4 -
 .../taobao/taobao/spiders/taobao_spider.py | 15 ---
 spiders/toutiao/toutiao_spider.js | 0
 spiders/weixin/weixin_crawler.py | 0
 36 files changed, 263 insertions(+), 508 deletions(-)
 create mode 100644 crawlab/app.py
 create mode 100644 crawlab/routes/schedules.py
 create mode 100644 crawlab/tasks/scheduler.py
 create mode 100644 frontend/src/views/schedule/ScheduleList.vue
 delete mode 100644 spiders/baidu/baidu/__init__.py
 delete mode 100644 spiders/baidu/baidu/items.py
 delete mode 100644 spiders/baidu/baidu/middlewares.py
 delete mode 100644 spiders/baidu/baidu/pipelines.py
 delete mode 100644 spiders/baidu/baidu/settings.py
 delete mode 100644 spiders/baidu/baidu/spiders/__init__.py
 delete mode 100644 spiders/baidu/baidu/spiders/baidu_spider.py
 delete mode 100644 spiders/baidu/scrapy.cfg
 delete mode 100644 spiders/meitui/app.py
 delete mode 100644 spiders/taobao/dump.rdb
 delete mode 100644 spiders/taobao/scrapy.cfg
 delete mode 100644 spiders/taobao/taobao/__init__.py
 delete mode 100644 spiders/taobao/taobao/items.py
 delete mode 100644 spiders/taobao/taobao/middlewares.py
 delete mode 100644 spiders/taobao/taobao/pipelines.py
 delete mode 100644 spiders/taobao/taobao/settings.py
 delete mode 100644 spiders/taobao/taobao/spiders/__init__.py
 delete mode 100644 spiders/taobao/taobao/spiders/taobao_spider.py
 delete mode 100644 spiders/toutiao/toutiao_spider.js
 delete mode 100644 spiders/weixin/weixin_crawler.py

diff --git a/README-zh.md b/README-zh.md
index 9eff0f97..7223aafd 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -14,7 +14,7 @@
 
 ```bash
 # 安装后台类库
-pip install -r ./crawlab/requirements.txt
+pip install -r requirements.txt
 ```
 
 ```bash
diff --git a/README.md b/README.md
index 820f4dd4..bffad3eb 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Celery-based web crawler admin platform for managing distributed web spiders reg
 
 ```bash
 # install the requirements for backend
-pip install -r ./crawlab/requirements.txt
+pip install -r requirements.txt
 ```
 
 ```bash
diff --git a/crawlab/app.py b/crawlab/app.py
new file mode 100644
index
00000000..d72aa40f --- /dev/null +++ b/crawlab/app.py @@ -0,0 +1,76 @@ +import os +import subprocess +import sys +from multiprocessing import Process + +import click +from flask import Flask +from flask_cors import CORS +from flask_restful import Api + +from routes.schedules import ScheduleApi +from tasks.scheduler import scheduler + +file_dir = os.path.dirname(os.path.realpath(__file__)) +root_path = os.path.abspath(os.path.join(file_dir, '.')) +sys.path.append(root_path) + +from config import FLASK_HOST, FLASK_PORT, PROJECT_LOGS_FOLDER, BROKER_URL +from constants.manage import ActionType +from routes.deploys import DeployApi +from routes.files import FileApi +from routes.nodes import NodeApi +from routes.spiders import SpiderApi, SpiderImportApi, SpiderManageApi +from routes.stats import StatsApi +from routes.tasks import TaskApi +from tasks.celery import celery_app + +# flask app instance +app = Flask(__name__) +app.config.from_object('config') + +# init flask api instance +api = Api(app) + +# cors support +CORS(app, supports_credentials=True) + +# reference api routes +api.add_resource(NodeApi, + '/api/nodes', + '/api/nodes/', + '/api/nodes//') +api.add_resource(SpiderImportApi, + '/api/spiders/import/') +api.add_resource(SpiderManageApi, + '/api/spiders/manage/') +api.add_resource(SpiderApi, + '/api/spiders', + '/api/spiders/', + '/api/spiders//') +api.add_resource(DeployApi, + '/api/deploys', + '/api/deploys/', + '/api/deploys//') +api.add_resource(TaskApi, + '/api/tasks', + '/api/tasks/', + '/api/tasks//' + ) +api.add_resource(FileApi, + '/api/files', + '/api/files/') +api.add_resource(StatsApi, + '/api/stats', + '/api/stats/') +api.add_resource(ScheduleApi, + '/api/schedules', + '/api/schedules/') + +if __name__ == '__main__': + # create folder if it does not exist + if not os.path.exists(PROJECT_LOGS_FOLDER): + os.makedirs(PROJECT_LOGS_FOLDER) + + # run app instance + app.run(host=FLASK_HOST, port=FLASK_PORT, threaded=True) diff --git a/crawlab/config.py b/crawlab/config.py index 6b1575af..5b1dee95 100644 --- a/crawlab/config.py +++ b/crawlab/config.py @@ -1,6 +1,4 @@ # project variables -from celery.schedules import crontab - PROJECT_SOURCE_FILE_FOLDER = '/Users/yeqing/projects/crawlab/spiders' PROJECT_DEPLOY_FILE_FOLDER = '/var/crawlab' PROJECT_LOGS_FOLDER = '/var/logs/crawlab' diff --git a/crawlab/constants/manage.py b/crawlab/constants/manage.py index f5447bf2..1c57837d 100644 --- a/crawlab/constants/manage.py +++ b/crawlab/constants/manage.py @@ -2,4 +2,5 @@ class ActionType: APP = 'app' FLOWER = 'flower' WORKER = 'worker' + SCHEDULER = 'scheduler' RUN_ALL = 'run_all' diff --git a/crawlab/constants/spider.py b/crawlab/constants/spider.py index 7595f79e..685e2b07 100644 --- a/crawlab/constants/spider.py +++ b/crawlab/constants/spider.py @@ -12,6 +12,11 @@ class LangType: OTHER = 'other' +class CronEnabled: + ON = 1 + OFF = 0 + + SUFFIX_IGNORE = [ 'pyc' ] diff --git a/crawlab/manage.py b/crawlab/manage.py index 45f07f17..9ea83d3c 100644 --- a/crawlab/manage.py +++ b/crawlab/manage.py @@ -8,6 +8,9 @@ from flask import Flask from flask_cors import CORS from flask_restful import Api +from routes.schedules import ScheduleApi +from tasks.scheduler import scheduler + file_dir = os.path.dirname(os.path.realpath(__file__)) root_path = os.path.abspath(os.path.join(file_dir, '.')) sys.path.append(root_path) @@ -60,6 +63,9 @@ api.add_resource(FileApi, api.add_resource(StatsApi, '/api/stats', '/api/stats/') +api.add_resource(ScheduleApi, + '/api/schedules', + '/api/schedules/') def 
run_app():
@@ -85,10 +91,15 @@ def run_worker():
     celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
 
 
+def run_scheduler():
+    scheduler.run()
+
+
 @click.command()
 @click.argument('action', type=click.Choice([ActionType.APP,
                                              ActionType.FLOWER,
                                              ActionType.WORKER,
+                                             ActionType.SCHEDULER,
                                              ActionType.RUN_ALL]))
 def main(action):
     if action == ActionType.APP:
@@ -97,6 +108,8 @@ def main(action):
         run_flower()
     elif action == ActionType.WORKER:
         run_worker()
+    elif action == ActionType.SCHEDULER:
+        run_scheduler()
     elif action == ActionType.RUN_ALL:
         p_flower = Process(target=run_flower)
         p_flower.start()
@@ -104,6 +117,8 @@ def main(action):
         p_app.start()
         p_worker = Process(target=run_worker)
         p_worker.start()
+        p_scheduler = Process(target=run_scheduler)
+        p_scheduler.start()
 
 
 if __name__ == '__main__':
diff --git a/crawlab/routes/schedules.py b/crawlab/routes/schedules.py
new file mode 100644
index 00000000..1eceabde
--- /dev/null
+++ b/crawlab/routes/schedules.py
@@ -0,0 +1,18 @@
+import json
+
+import requests
+
+from constants.task import TaskStatus
+from db.manager import db_manager
+from routes.base import BaseApi
+from utils import jsonify
+from utils.spider import get_spider_col_fields
+
+
+class ScheduleApi(BaseApi):
+    col_name = 'schedules'
+
+    arguments = (
+        ('cron', str),
+        ('spider_id', str)
+    )
diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index f8a220e4..aaeca318 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -47,6 +47,12 @@ class SpiderApi(BaseApi):
 
         # spider results collection
         ('col', str),
+
+        # spider schedule cron
+        ('cron', str),
+
+        # spider schedule cron enabled
+        ('cron_enabled', int),
     )
 
     def get(self, id=None, action=None):
diff --git a/crawlab/tasks/scheduler.py b/crawlab/tasks/scheduler.py
new file mode 100644
index 00000000..f227dce8
--- /dev/null
+++ b/crawlab/tasks/scheduler.py
@@ -0,0 +1,53 @@
+import requests
+from apscheduler.schedulers.background import BlockingScheduler
+from apscheduler.jobstores.mongodb import MongoDBJobStore
+from pymongo import MongoClient
+
+from config import MONGO_DB, MONGO_HOST, MONGO_PORT
+from constants.spider import CronEnabled
+from db.manager import db_manager
+
+
+class Scheduler(object):
+    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
+
+    jobstores = {
+        'mongo': MongoDBJobStore(database=MONGO_DB,
+                                 collection='apscheduler_jobs',
+                                 client=mongo)
+    }
+
+    scheduler = BlockingScheduler(jobstores=jobstores)
+
+    def execute_spider(self, id: str):
+        r = requests.get('http://localhost:5000/api/spiders/%s/on_crawl' % id)
+
+    def restart(self):
+        self.scheduler.shutdown()
+        self.scheduler.start()
+
+    def update(self):
+        self.scheduler.remove_all_jobs()
+        spiders = db_manager.list('spiders', {'cron_enabled': CronEnabled.ON})
+        for spider in spiders:
+            cron = spider.get('cron')
+            cron_arr = cron.split(' ')
+            second = cron_arr[0]
+            minute = cron_arr[1]
+            hour = cron_arr[2]
+            day = cron_arr[3]
+            month = cron_arr[4]
+            day_of_week = cron_arr[5]
+            self.scheduler.add_job(func=self.execute_spider, trigger='cron', args=(str(spider['_id']),),
+                                   day_of_week=day_of_week, month=month, day=day, hour=hour, minute=minute,
+                                   second=second)
+
+    def run(self):
+        self.update()
+        self.scheduler.start()
+
+
+scheduler = Scheduler()
+
+if __name__ == '__main__':
+    scheduler.run()
diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py
index 26e0faf5..dd8a1c4e 100644
--- a/crawlab/tasks/spider.py
+++ b/crawlab/tasks/spider.py
@@ -1,10 +1,7 @@
 import os
-import sys
 from datetime import datetime
-import requests from bson import ObjectId -from celery import current_app from celery.utils.log import get_logger from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER @@ -52,7 +49,8 @@ def execute_spider(self, id: str): # execute the command env = os.environ.copy() env['CRAWLAB_TASK_ID'] = task_id - env['CRAWLAB_COLLECTION'] = spider.get('col') + if spider.get('col'): + env['CRAWLAB_COLLECTION'] = spider.get('col') p = subprocess.Popen(command.split(' '), stdout=stdout.fileno(), stderr=stderr.fileno(), diff --git a/frontend/src/App.vue b/frontend/src/App.vue index 7b5c9085..38d6c19d 100644 --- a/frontend/src/App.vue +++ b/frontend/src/App.vue @@ -48,4 +48,8 @@ export default { margin-top: 10px; text-align: right; } + + .el-form .el-form-item { + margin-bottom: 10px; + } diff --git a/frontend/src/components/InfoView/SpiderInfoView.vue b/frontend/src/components/InfoView/SpiderInfoView.vue index 4265e58d..6864acd7 100644 --- a/frontend/src/components/InfoView/SpiderInfoView.vue +++ b/frontend/src/components/InfoView/SpiderInfoView.vue @@ -38,6 +38,23 @@ + + + + + + + + @@ -62,9 +79,27 @@ export default { } }, data () { + const cronValidator = (rule, value, callback) => { + let patArr = [] + for (let i = 0; i < 6; i++) { + patArr.push('[/*,0-9]+') + } + const pat = '^' + patArr.join(' ') + '$' + if (this.spiderForm.cron_enabled) { + if (!value) { + callback(new Error('cron cannot be empty')) + } else if (!value.match(pat)) { + callback(new Error('cron format is invalid')) + } + } + callback() + } return { cmdRule: [ { message: 'Execute Command should not be empty', required: true } + ], + cronRules: [ + { validator: cronValidator, trigger: 'blur' } ] } }, diff --git a/frontend/src/router/index.js b/frontend/src/router/index.js index 7da95e4f..46b1e741 100644 --- a/frontend/src/router/index.js +++ b/frontend/src/router/index.js @@ -132,6 +132,27 @@ export const constantRouterMap = [ } ] }, + { + name: 'Schedule', + path: '/schedules', + component: Layout, + meta: { + title: 'Schedules', + icon: 'fa fa-calendar' + }, + hidden: true, + children: [ + { + path: '', + name: 'ScheduleList', + component: () => import('../views/schedule/ScheduleList'), + meta: { + title: 'Schedules', + icon: 'fa fa-calendar' + } + } + ] + }, { name: 'Deploy', path: '/deploys', diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js index 9514e08f..30fb7a69 100644 --- a/frontend/src/store/modules/spider.js +++ b/frontend/src/store/modules/spider.js @@ -48,7 +48,10 @@ const actions = { src: state.spiderForm.src, cmd: state.spiderForm.cmd, type: state.spiderForm.type, - lang: state.spiderForm.lang + lang: state.spiderForm.lang, + col: state.spiderForm.col, + cron: state.spiderForm.cron, + cron_enabled: state.spiderForm.cron_enabled ? 1 : 0 }) .then(() => { dispatch('getSpiderList') @@ -61,7 +64,9 @@ const actions = { cmd: state.spiderForm.cmd, type: state.spiderForm.type, lang: state.spiderForm.lang, - col: state.spiderForm.col + col: state.spiderForm.col, + cron: state.spiderForm.cron, + cron_enabled: state.spiderForm.cron_enabled ? 
1 : 0 }) .then(() => { dispatch('getSpiderList') @@ -76,7 +81,9 @@ const actions = { getSpiderData ({ state, commit }, id) { return request.get(`/spiders/${id}`) .then(response => { - commit('SET_SPIDER_FORM', response.data) + let data = response.data + data.cron_enabled = !!data.cron_enabled + commit('SET_SPIDER_FORM', data) }) }, deploySpider ({ state, dispatch }, id) { diff --git a/frontend/src/views/schedule/ScheduleList.vue b/frontend/src/views/schedule/ScheduleList.vue new file mode 100644 index 00000000..d1b8a5bb --- /dev/null +++ b/frontend/src/views/schedule/ScheduleList.vue @@ -0,0 +1,15 @@ + + + + + diff --git a/spiders/baidu/baidu/__init__.py b/spiders/baidu/baidu/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/spiders/baidu/baidu/items.py b/spiders/baidu/baidu/items.py deleted file mode 100644 index 26b5888c..00000000 --- a/spiders/baidu/baidu/items.py +++ /dev/null @@ -1,14 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class BaiduItem(scrapy.Item): - # define the fields for your item here like: - title = scrapy.Field() - url = scrapy.Field() diff --git a/spiders/baidu/baidu/middlewares.py b/spiders/baidu/baidu/middlewares.py deleted file mode 100644 index 3911485d..00000000 --- a/spiders/baidu/baidu/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class BaiduSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class BaiduDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. 
- s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/spiders/baidu/baidu/pipelines.py b/spiders/baidu/baidu/pipelines.py deleted file mode 100644 index beae9c24..00000000 --- a/spiders/baidu/baidu/pipelines.py +++ /dev/null @@ -1,11 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html - - -class BaiduPipeline(object): - def process_item(self, item, spider): - return item diff --git a/spiders/baidu/baidu/settings.py b/spiders/baidu/baidu/settings.py deleted file mode 100644 index 667b09ca..00000000 --- a/spiders/baidu/baidu/settings.py +++ /dev/null @@ -1,91 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for baidu project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'baidu' - -SPIDER_MODULES = ['baidu.spiders'] -NEWSPIDER_MODULE = 'baidu.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'baidu (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True -# ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'baidu.middlewares.BaiduSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'baidu.middlewares.BaiduDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -#ITEM_PIPELINES = { -# 'baidu.pipelines.BaiduPipeline': 300, -#} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/spiders/baidu/baidu/spiders/__init__.py b/spiders/baidu/baidu/spiders/__init__.py deleted file mode 100644 index ebd689ac..00000000 --- a/spiders/baidu/baidu/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
diff --git a/spiders/baidu/baidu/spiders/baidu_spider.py b/spiders/baidu/baidu/spiders/baidu_spider.py deleted file mode 100644 index f84ffc8d..00000000 --- a/spiders/baidu/baidu/spiders/baidu_spider.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- -from time import sleep - -import scrapy - - -class BaiduSpiderSpider(scrapy.Spider): - name = 'baidu_spider' - allowed_domains = ['baidu.com'] - start_urls = ['http://baidu.com/s?wd=百度'] - - def parse(self, response): - sleep(30) diff --git a/spiders/baidu/scrapy.cfg b/spiders/baidu/scrapy.cfg deleted file mode 100644 index 492b18d1..00000000 --- a/spiders/baidu/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = baidu.settings - -[deploy] -#url = http://localhost:6800/ -project = baidu diff --git a/spiders/meitui/app.py b/spiders/meitui/app.py deleted file mode 100644 index 493ffc46..00000000 --- a/spiders/meitui/app.py +++ /dev/null @@ -1 +0,0 @@ -# /Users/yeqing/projects/crawlab/spiders diff --git a/spiders/taobao/dump.rdb b/spiders/taobao/dump.rdb deleted file mode 100644 index 48df713409ebce5b6617a84280df170eb7137ad1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 760 zcmZvZv2K+y3`TD|bOMQm@&K^9iWA3)cNo|ZJV0f~4qTO9B%l@SycGkF!N3d;fe?_Y z1JW;+#KU)VJ}1wgJbwBjrNqB?zdk>EmG?)x-R|zyLB~b-`Q_F7`0&U$v{kC0%@}AJ zf|)uLvKX~sVzlG?=NE4-?dr_-2j3rFo{e}NZ$BR0yS-0NPfl;A;|Ko@#`T^2@Lwvf z&nzq97!y>h0@c#iBm}K^>5_=k>>Ff`Dz2e0Q(Z-C3$UOGp0ac*9u{qbur(jGErDnS zB39^31&}3Ul93$i2B}4(4I&_~2xO#ST{_^vs_Y@T&<5$sjM&o(_P&aRFN+CSqM13G z7u{OKEsJr5V!Dv%1kzN1bDCUj_Efi)+&KrWI_YKuY9m3XM&QN`r)t3*n_`lwb(@Kw=@%br`G1)UAb!uiB}D-yCFGK!h>~siPL*o@HanhE&t65L0|TTgqVU19;eM zbE0S5T1-+Jf(ZmM*1u2(n>7K3_ZtYCq>ZHq4dKF2eJrFb-&&7hT4br67(@U3{PF4Q Jw=eI${{;yZ)Xe|@ diff --git a/spiders/taobao/scrapy.cfg b/spiders/taobao/scrapy.cfg deleted file mode 100644 index c0e3980d..00000000 --- a/spiders/taobao/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = taobao.settings - -[deploy] -#url = http://localhost:6800/ -project = taobao diff --git a/spiders/taobao/taobao/__init__.py b/spiders/taobao/taobao/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/spiders/taobao/taobao/items.py b/spiders/taobao/taobao/items.py deleted file mode 100644 index 199c1f82..00000000 --- a/spiders/taobao/taobao/items.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class TaobaoItem(scrapy.Item): - # define the fields for your item here like: - name = scrapy.Field() diff --git a/spiders/taobao/taobao/middlewares.py b/spiders/taobao/taobao/middlewares.py deleted file mode 100644 index afc752ba..00000000 --- a/spiders/taobao/taobao/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class TaobaoSpiderMiddleware(object): - # Not all methods need to be defined. 
If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class TaobaoDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/spiders/taobao/taobao/pipelines.py b/spiders/taobao/taobao/pipelines.py deleted file mode 100644 index 7ddf8da5..00000000 --- a/spiders/taobao/taobao/pipelines.py +++ /dev/null @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html - - -class TaobaoPipeline(object): - def process_item(self, item, spider): - print('task_id: %s' % spider.task_id) - return item diff --git a/spiders/taobao/taobao/settings.py b/spiders/taobao/taobao/settings.py deleted file mode 100644 index 0e237049..00000000 --- a/spiders/taobao/taobao/settings.py +++ /dev/null @@ -1,91 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for taobao project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'taobao' - -SPIDER_MODULES = ['taobao.spiders'] -NEWSPIDER_MODULE = 'taobao.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'taobao (+http://www.yourdomain.com)' - -# Obey robots.txt rules -# ROBOTSTXT_OBEY = True -ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'taobao.middlewares.TaobaoSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'taobao.middlewares.TaobaoDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -#ITEM_PIPELINES = { -# 'taobao.pipelines.TaobaoPipeline': 300, -#} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in 
case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/spiders/taobao/taobao/spiders/__init__.py b/spiders/taobao/taobao/spiders/__init__.py deleted file mode 100644 index ebd689ac..00000000 --- a/spiders/taobao/taobao/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/spiders/taobao/taobao/spiders/taobao_spider.py b/spiders/taobao/taobao/spiders/taobao_spider.py deleted file mode 100644 index 2a939a06..00000000 --- a/spiders/taobao/taobao/spiders/taobao_spider.py +++ /dev/null @@ -1,15 +0,0 @@ -# -*- coding: utf-8 -*- -import os - -import scrapy - -from ..items import TaobaoItem - - -class TaobaoSpiderSpider(scrapy.Spider): - name = 'taobao_spider' - allowed_domains = ['taobao.com'] - start_urls = ['http://taobao.com/'] - - def parse(self, response): - yield TaobaoItem() diff --git a/spiders/toutiao/toutiao_spider.js b/spiders/toutiao/toutiao_spider.js deleted file mode 100644 index e69de29b..00000000 diff --git a/spiders/weixin/weixin_crawler.py b/spiders/weixin/weixin_crawler.py deleted file mode 100644 index e69de29b..00000000
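Note on the new scheduler: `crawlab/tasks/scheduler.py` expects a six-field cron expression per spider, in the order `second minute hour day month day_of_week`, and passes the fields straight to APScheduler's cron trigger. (`BlockingScheduler` canonically lives in `apscheduler.schedulers.blocking`; importing it from `.background` as the patch does relies on an incidental re-export.) The sketch below is illustrative only, not code from the patch: it mirrors `Scheduler.update()` and `execute_spider()` with the default in-memory jobstore instead of the MongoDB jobstore, and the spider id and cron string are made up.

```python
# Illustrative sketch (not part of the patch): how a spider's six-field cron
# string is mapped onto an APScheduler cron job, mirroring Scheduler.update().
import requests
from apscheduler.schedulers.blocking import BlockingScheduler

scheduler = BlockingScheduler()  # default in-memory jobstore, for brevity


def execute_spider(spider_id: str):
    # Same trigger the patch uses: ask the local API to start a crawl task.
    requests.get('http://localhost:5000/api/spiders/%s/on_crawl' % spider_id)


def add_spider_job(spider: dict):
    # Field order assumed by the patch: second minute hour day month day_of_week.
    second, minute, hour, day, month, day_of_week = spider['cron'].split(' ')
    scheduler.add_job(func=execute_spider,
                      trigger='cron',
                      args=(str(spider['_id']),),
                      second=second, minute=minute, hour=hour,
                      day=day, month=month, day_of_week=day_of_week)


if __name__ == '__main__':
    # Hypothetical spider document with cron enabled: crawl every 30 seconds.
    add_spider_job({'_id': '5c8300000000000000000000', 'cron': '*/30 * * * * *'})
    scheduler.start()  # blocks, which is why manage.py runs the scheduler in its own Process
```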
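The cron validator added to `SpiderInfoView.vue` only checks the shape of the value — six space-separated fields built from digits, `*`, `/` and `,` — and the backend splits the string without re-validating it. A minimal Python mirror of that format check (an illustrative sketch, not code from this patch) is shown below; APScheduler will still reject malformed or out-of-range field values when the job is registered, so this is only a cheap pre-check.

```python
import re

# Same shape the frontend validator enforces: six space-separated fields,
# each limited to digits, '*', '/' and ','.
CRON_PATTERN = re.compile(r'^' + ' '.join([r'[/*,0-9]+'] * 6) + r'$')


def is_valid_cron(cron: str) -> bool:
    return bool(CRON_PATTERN.match(cron or ''))


assert is_valid_cron('0 */5 * * * *')        # every five minutes
assert not is_valid_cron('*/5 * * * *')      # only five fields -> rejected
assert not is_valid_cron('0 5 * * * mon')    # day names are not accepted by this pattern
```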