added cron tasks for spiders

This commit is contained in:
Marvin Zhang
2019-03-09 14:05:14 +08:00
parent 2f0107fff4
commit 647fac1efe
36 changed files with 263 additions and 508 deletions

View File

@@ -14,7 +14,7 @@
```bash
# install the backend libraries
pip install -r ./crawlab/requirements.txt
pip install -r requirements.txt
```
```bash

View File

@@ -15,7 +15,7 @@ Celery-based web crawler admin platform for managing distributed web spiders reg
```bash
# install the requirements for backend
pip install -r ./crawlab/requirements.txt
pip install -r requirements.txt
```
```bash

crawlab/app.py Normal file (76 lines)
View File

@@ -0,0 +1,76 @@
import os
import subprocess
import sys
from multiprocessing import Process
import click
from flask import Flask
from flask_cors import CORS
from flask_restful import Api
from routes.schedules import ScheduleApi
from tasks.scheduler import scheduler
file_dir = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.abspath(os.path.join(file_dir, '.'))
sys.path.append(root_path)
from config import FLASK_HOST, FLASK_PORT, PROJECT_LOGS_FOLDER, BROKER_URL
from constants.manage import ActionType
from routes.deploys import DeployApi
from routes.files import FileApi
from routes.nodes import NodeApi
from routes.spiders import SpiderApi, SpiderImportApi, SpiderManageApi
from routes.stats import StatsApi
from routes.tasks import TaskApi
from tasks.celery import celery_app
# flask app instance
app = Flask(__name__)
app.config.from_object('config')
# init flask api instance
api = Api(app)
# cors support
CORS(app, supports_credentials=True)
# reference api routes
api.add_resource(NodeApi,
'/api/nodes',
'/api/nodes/<string:id>',
'/api/nodes/<string:id>/<string:action>')
api.add_resource(SpiderImportApi,
'/api/spiders/import/<string:platform>')
api.add_resource(SpiderManageApi,
'/api/spiders/manage/<string:action>')
api.add_resource(SpiderApi,
'/api/spiders',
'/api/spiders/<string:id>',
'/api/spiders/<string:id>/<string:action>')
api.add_resource(DeployApi,
'/api/deploys',
'/api/deploys/<string:id>',
'/api/deploys/<string:id>/<string:action>')
api.add_resource(TaskApi,
'/api/tasks',
'/api/tasks/<string:id>',
'/api/tasks/<string:id>/<string:action>'
)
api.add_resource(FileApi,
'/api/files',
'/api/files/<string:action>')
api.add_resource(StatsApi,
'/api/stats',
'/api/stats/<string:action>')
api.add_resource(ScheduleApi,
'/api/schedules',
'/api/schedules/<string:id>')
if __name__ == '__main__':
# create folder if it does not exist
if not os.path.exists(PROJECT_LOGS_FOLDER):
os.makedirs(PROJECT_LOGS_FOLDER)
# run app instance
app.run(host=FLASK_HOST, port=FLASK_PORT, threaded=True)
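
With the new `ScheduleApi` routes registered, the schedule endpoints can be exercised over HTTP like the other resources. A minimal sketch, assuming the backend is reachable on localhost:5000 (the same base URL the scheduler's callback uses) and that `BaseApi` serves GET for list and detail as it does for spiders; the schedule id shown is hypothetical:

```python
# Sketch: hitting the new /api/schedules endpoints.
# Assumes the backend runs locally on port 5000; adjust to FLASK_HOST/FLASK_PORT.
import requests

BASE_URL = 'http://localhost:5000/api'

# list all schedules
resp = requests.get('%s/schedules' % BASE_URL)
print(resp.status_code, resp.json())

# fetch one schedule by id (hypothetical id)
schedule_id = '5c8370000000000000000000'
print(requests.get('%s/schedules/%s' % (BASE_URL, schedule_id)).status_code)
```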

View File

@@ -1,6 +1,4 @@
# project variables
from celery.schedules import crontab
PROJECT_SOURCE_FILE_FOLDER = '/Users/yeqing/projects/crawlab/spiders'
PROJECT_DEPLOY_FILE_FOLDER = '/var/crawlab'
PROJECT_LOGS_FOLDER = '/var/logs/crawlab'

View File

@@ -2,4 +2,5 @@ class ActionType:
APP = 'app'
FLOWER = 'flower'
WORKER = 'worker'
SCHEDULER = 'scheduler'
RUN_ALL = 'run_all'

View File

@@ -12,6 +12,11 @@ class LangType:
OTHER = 'other'
class CronEnabled:
ON = 1
OFF = 0
SUFFIX_IGNORE = [
'pyc'
]

View File

@@ -8,6 +8,9 @@ from flask import Flask
from flask_cors import CORS
from flask_restful import Api
from routes.schedules import ScheduleApi
from tasks.scheduler import scheduler
file_dir = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.abspath(os.path.join(file_dir, '.'))
sys.path.append(root_path)
@@ -60,6 +63,9 @@ api.add_resource(FileApi,
api.add_resource(StatsApi,
'/api/stats',
'/api/stats/<string:action>')
api.add_resource(ScheduleApi,
'/api/schedules',
'/api/schedules/<string:id>')
def run_app():
@@ -85,10 +91,15 @@ def run_worker():
celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
def run_scheduler():
scheduler.run()
@click.command()
@click.argument('action', type=click.Choice([ActionType.APP,
ActionType.FLOWER,
ActionType.WORKER,
ActionType.SCHEDULER,
ActionType.RUN_ALL]))
def main(action):
if action == ActionType.APP:
@@ -97,6 +108,8 @@ def main(action):
run_flower()
elif action == ActionType.WORKER:
run_worker()
elif action == ActionType.SCHEDULER:
run_scheduler()
elif action == ActionType.RUN_ALL:
p_flower = Process(target=run_flower)
p_flower.start()
@@ -104,6 +117,8 @@ def main(action):
p_app.start()
p_worker = Process(target=run_worker)
p_worker.start()
p_scheduler = Process(target=run_scheduler)
p_scheduler.start()
if __name__ == '__main__':

View File

@@ -0,0 +1,18 @@
import json
import requests
from constants.task import TaskStatus
from db.manager import db_manager
from routes.base import BaseApi
from utils import jsonify
from utils.spider import get_spider_col_fields
class ScheduleApi(BaseApi):
col_name = 'schedules'
arguments = (
('cron', str),
('spider_id', str)
)
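
`ScheduleApi` persists to the `schedules` collection and declares `cron` and `spider_id` as its accepted arguments. A rough illustration of the stored document shape, written directly with pymongo; the connection settings and the string form of `spider_id` are assumptions for the sketch, not part of this commit:

```python
# Illustration only: a schedules document with the two declared fields.
# Mongo host/port/database are placeholders; crawlab reads them from config.py.
from pymongo import MongoClient

db = MongoClient(host='localhost', port=27017)['crawlab_test']
db['schedules'].insert_one({
    'spider_id': '5c8370000000000000000000',  # hypothetical spider id
    'cron': '0 0 3 * * *',                    # 6-field cron: 03:00 every day
})
print(db['schedules'].find_one())
```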

View File

@@ -47,6 +47,12 @@ class SpiderApi(BaseApi):
# spider results collection
('col', str),
# spider schedule cron
('cron', str),
# spider schedule cron enabled
('cron_enabled', int),
)
def get(self, id=None, action=None):

View File

@@ -0,0 +1,53 @@
import requests
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.jobstores.mongodb import MongoDBJobStore
from pymongo import MongoClient
from config import MONGO_DB, MONGO_HOST, MONGO_PORT
from constants.spider import CronEnabled
from db.manager import db_manager
class Scheduler(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
jobstores = {
'mongo': MongoDBJobStore(database=MONGO_DB,
collection='apscheduler_jobs',
client=mongo)
}
scheduler = BlockingScheduler(jobstores=jobstores)
def execute_spider(self, id: str):
r = requests.get('http://localhost:5000/api/spiders/%s/on_crawl' % id)
def restart(self):
self.scheduler.shutdown()
self.scheduler.start()
def update(self):
self.scheduler.remove_all_jobs()
spiders = db_manager.list('spiders', {'cron_enabled': CronEnabled.ON})
for spider in spiders:
cron = spider.get('cron')
cron_arr = cron.split(' ')
second = cron_arr[0]
minute = cron_arr[1]
hour = cron_arr[2]
day = cron_arr[3]
month = cron_arr[4]
day_of_week = cron_arr[5]
self.scheduler.add_job(func=self.execute_spider, trigger='cron', args=(str(spider['_id']),),
day_of_week=day_of_week, month=month, day=day, hour=hour, minute=minute,
second=second)
def run(self):
self.update()
self.scheduler.start()
scheduler = Scheduler()
if __name__ == '__main__':
scheduler.run()
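
`Scheduler.update()` splits each spider's cron expression into six fields, in the order second, minute, hour, day, month, day_of_week, and passes them to APScheduler's cron trigger. A small self-contained sketch of that mapping; the `tick` callback stands in for the HTTP call to `/api/spiders/<id>/on_crawl`:

```python
# Sketch of the 6-field cron mapping used above.
from apscheduler.schedulers.background import BackgroundScheduler

def cron_to_kwargs(cron: str) -> dict:
    # field order mirrors Scheduler.update(): second minute hour day month day_of_week
    second, minute, hour, day, month, day_of_week = cron.split(' ')
    return {'second': second, 'minute': minute, 'hour': hour,
            'day': day, 'month': month, 'day_of_week': day_of_week}

def tick():
    print('would call /api/spiders/<id>/on_crawl here')

if __name__ == '__main__':
    # '0 */10 * * * *' fires at second 0 of every 10th minute
    kwargs = cron_to_kwargs('0 */10 * * * *')
    scheduler = BackgroundScheduler()
    scheduler.add_job(func=tick, trigger='cron', **kwargs)
    scheduler.start()
    for job in scheduler.get_jobs():
        print(job)  # shows the cron trigger and next run time
    scheduler.shutdown(wait=False)
```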

View File

@@ -1,10 +1,7 @@
import os
import sys
from datetime import datetime
import requests
from bson import ObjectId
from celery import current_app
from celery.utils.log import get_logger
from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER
@@ -52,7 +49,8 @@ def execute_spider(self, id: str):
# execute the command
env = os.environ.copy()
env['CRAWLAB_TASK_ID'] = task_id
env['CRAWLAB_COLLECTION'] = spider.get('col')
if spider.get('col'):
env['CRAWLAB_COLLECTION'] = spider.get('col')
p = subprocess.Popen(command.split(' '),
stdout=stdout.fileno(),
stderr=stderr.fileno(),
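
`execute_spider` now exports `CRAWLAB_COLLECTION` only when the spider has a results collection configured, alongside the always-present `CRAWLAB_TASK_ID`. A hedged sketch of how a spider-side item pipeline might consume these variables; the pipeline class, the fallback collection name, and the Mongo connection details are assumptions, not part of this commit:

```python
# Sketch: a Scrapy item pipeline reading the env vars set by execute_spider.
import os
from pymongo import MongoClient


class CrawlabPipeline(object):
    def __init__(self):
        self.task_id = os.environ.get('CRAWLAB_TASK_ID')
        # CRAWLAB_COLLECTION may now be absent, so fall back to a default name
        self.col_name = os.environ.get('CRAWLAB_COLLECTION', 'results')
        self.col = MongoClient(host='localhost', port=27017)['crawlab_test'][self.col_name]

    def process_item(self, item, spider):
        data = dict(item)
        data['task_id'] = self.task_id  # tie each result to its crawl task
        self.col.insert_one(data)
        return item
```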

View File

@@ -48,4 +48,8 @@ export default {
margin-top: 10px;
text-align: right;
}
.el-form .el-form-item {
margin-bottom: 10px;
}
</style>

View File

@@ -38,6 +38,23 @@
<el-option value="go" label="Go"></el-option>
</el-select>
</el-form-item>
<el-form-item label="Schedule Enabled">
<el-switch v-model="spiderForm.cron_enabled" :disabled="isView">
</el-switch>
</el-form-item>
<el-form-item label="Schedule Cron" v-if="spiderForm.cron_enabled" prop="cron" :rules="cronRules">
<template slot="label">
<el-tooltip content="Cron Format: [second] [minute] [hour] [day of month] [month] [day of week]"
placement="top">
<span>
Schedule Cron
<i class="fa fa-exclamation-circle"></i>
</span>
</el-tooltip>
</template>
<el-input v-model="spiderForm.cron" placeholder="Schedule Cron"
:disabled="isView"></el-input>
</el-form-item>
</el-form>
</el-row>
<el-row class="button-container" v-if="!isView">
@@ -62,9 +79,27 @@ export default {
}
},
data () {
const cronValidator = (rule, value, callback) => {
let patArr = []
for (let i = 0; i < 6; i++) {
patArr.push('[/*,0-9]+')
}
const pat = '^' + patArr.join(' ') + '$'
if (this.spiderForm.cron_enabled) {
if (!value) {
return callback(new Error('cron cannot be empty'))
} else if (!value.match(pat)) {
return callback(new Error('cron format is invalid'))
}
}
callback()
}
return {
cmdRule: [
{ message: 'Execute Command should not be empty', required: true }
],
cronRules: [
{ validator: cronValidator, trigger: 'blur' }
]
}
},

View File

@@ -132,6 +132,27 @@ export const constantRouterMap = [
}
]
},
{
name: 'Schedule',
path: '/schedules',
component: Layout,
meta: {
title: 'Schedules',
icon: 'fa fa-calendar'
},
hidden: true,
children: [
{
path: '',
name: 'ScheduleList',
component: () => import('../views/schedule/ScheduleList'),
meta: {
title: 'Schedules',
icon: 'fa fa-calendar'
}
}
]
},
{
name: 'Deploy',
path: '/deploys',

View File

@@ -48,7 +48,10 @@ const actions = {
src: state.spiderForm.src,
cmd: state.spiderForm.cmd,
type: state.spiderForm.type,
lang: state.spiderForm.lang
lang: state.spiderForm.lang,
col: state.spiderForm.col,
cron: state.spiderForm.cron,
cron_enabled: state.spiderForm.cron_enabled ? 1 : 0
})
.then(() => {
dispatch('getSpiderList')
@@ -61,7 +64,9 @@ const actions = {
cmd: state.spiderForm.cmd,
type: state.spiderForm.type,
lang: state.spiderForm.lang,
col: state.spiderForm.col
col: state.spiderForm.col,
cron: state.spiderForm.cron,
cron_enabled: state.spiderForm.cron_enabled ? 1 : 0
})
.then(() => {
dispatch('getSpiderList')
@@ -76,7 +81,9 @@ const actions = {
getSpiderData ({ state, commit }, id) {
return request.get(`/spiders/${id}`)
.then(response => {
commit('SET_SPIDER_FORM', response.data)
let data = response.data
data.cron_enabled = !!data.cron_enabled
commit('SET_SPIDER_FORM', data)
})
},
deploySpider ({ state, dispatch }, id) {

View File

@@ -0,0 +1,15 @@
<template>
<div class="app-container">
Schedule List
</div>
</template>
<script>
export default {
name: 'ScheduleList'
}
</script>
<style scoped>
</style>

View File

@@ -1,14 +0,0 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class BaiduItem(scrapy.Item):
# define the fields for your item here like:
title = scrapy.Field()
url = scrapy.Field()

View File

@@ -1,103 +0,0 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class BaiduSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class BaiduDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)

View File

@@ -1,11 +0,0 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class BaiduPipeline(object):
def process_item(self, item, spider):
return item

View File

@@ -1,91 +0,0 @@
# -*- coding: utf-8 -*-
# Scrapy settings for baidu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'baidu'
SPIDER_MODULES = ['baidu.spiders']
NEWSPIDER_MODULE = 'baidu.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'baidu (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'baidu.middlewares.BaiduSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'baidu.middlewares.BaiduDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'baidu.pipelines.BaiduPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@@ -1,13 +0,0 @@
# -*- coding: utf-8 -*-
from time import sleep
import scrapy
class BaiduSpiderSpider(scrapy.Spider):
name = 'baidu_spider'
allowed_domains = ['baidu.com']
start_urls = ['http://baidu.com/s?wd=百度']
def parse(self, response):
sleep(30)

View File

@@ -1,11 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = baidu.settings
[deploy]
#url = http://localhost:6800/
project = baidu

View File

@@ -1 +0,0 @@
# /Users/yeqing/projects/crawlab/spiders

Binary file not shown.

View File

@@ -1,11 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = taobao.settings
[deploy]
#url = http://localhost:6800/
project = taobao

View File

@@ -1,13 +0,0 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class TaobaoItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field()

View File

@@ -1,103 +0,0 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TaobaoSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class TaobaoDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)

View File

@@ -1,12 +0,0 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class TaobaoPipeline(object):
def process_item(self, item, spider):
print('task_id: %s' % spider.task_id)
return item

View File

@@ -1,91 +0,0 @@
# -*- coding: utf-8 -*-
# Scrapy settings for taobao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'taobao'
SPIDER_MODULES = ['taobao.spiders']
NEWSPIDER_MODULE = 'taobao.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'taobao (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'taobao.middlewares.TaobaoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'taobao.middlewares.TaobaoDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'taobao.pipelines.TaobaoPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@@ -1,15 +0,0 @@
# -*- coding: utf-8 -*-
import os
import scrapy
from ..items import TaobaoItem
class TaobaoSpiderSpider(scrapy.Spider):
name = 'taobao_spider'
allowed_domains = ['taobao.com']
start_urls = ['http://taobao.com/']
def parse(self, response):
yield TaobaoItem()