mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-30 18:00:56 +01:00
添加demo爬虫
This commit is contained in:
4
spiders/chinaz/Spiderfile
Normal file
4
spiders/chinaz/Spiderfile
Normal file
@@ -0,0 +1,4 @@
name: "chinaz"
display_name: "站长之家 (Scrapy)"
col: "results_chinaz"
cmd: "scrapy crawl chinaz_spider"
@@ -5,24 +5,3 @@
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
|
||||
import os
|
||||
|
||||
from pymongo import MongoClient
|
||||
|
||||
def _env(name, default):
    # Read an environment variable, treating unset *and* empty values
    # as missing (hence `or`, not a dict.get default).
    return os.environ.get(name) or default


# MongoDB connection settings with local-development fallbacks.
MONGO_HOST = _env('MONGO_HOST', 'localhost')
MONGO_PORT = int(_env('MONGO_PORT', '27017'))
MONGO_DB = _env('MONGO_DB', 'crawlab_test')
|
||||
|
||||
|
||||
class MongoPipeline(object):
    """Scrapy item pipeline that stores crawled items in MongoDB.

    Connection parameters come from the module-level MONGO_* constants
    (environment-driven); the target collection name comes from the
    CRAWLAB_COLLECTION environment variable, defaulting to 'sites'.
    """
    # Client and collection are created once, at class-definition time,
    # and shared by every pipeline instance.
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    db = mongo[MONGO_DB]
    col_name = os.environ.get('CRAWLAB_COLLECTION') or 'sites'
    col = db[col_name]

    def process_item(self, item, spider):
        """Tag the item with the Crawlab task id and persist it.

        The item's 'domain' field is used as the document _id, so each
        domain is stored at most once (first write wins); items whose
        domain is already present are passed through without writing.
        Returns the (mutated) item for downstream pipelines.
        """
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        item['_id'] = item['domain']
        if self.col.find_one({'_id': item['_id']}) is None:
            # Collection.save() was deprecated and removed in PyMongo 4;
            # insert_one() is equivalent here because we only write when
            # no document with this _id exists yet.
            self.col.insert_one(dict(item))
        return item
|
||||
|
||||
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
|
||||
# Configure item pipelines
|
||||
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
ITEM_PIPELINES = {
|
||||
'chinaz.pipelines.MongoPipeline': 300,
|
||||
'crawlab.pipelines.CrawlabMongoPipeline': 300,
|
||||
}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
|
||||
@@ -4,22 +4,3 @@
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
import os
|
||||
|
||||
from pymongo import MongoClient
|
||||
|
||||
def _env(name, default):
    # Read an environment variable, treating unset *and* empty values
    # as missing (hence `or`, not a dict.get default).
    return os.environ.get(name) or default


# MongoDB connection settings with local-development fallbacks.
MONGO_HOST = _env('MONGO_HOST', 'localhost')
MONGO_PORT = int(_env('MONGO_PORT', '27017'))
MONGO_DB = _env('MONGO_DB', 'crawlab_test')
|
||||
|
||||
|
||||
class MongoPipeline(object):
    """Scrapy item pipeline that stores every crawled item in MongoDB.

    Connection parameters come from the module-level MONGO_* constants
    (environment-driven); the target collection name comes from the
    CRAWLAB_COLLECTION environment variable.
    """
    # Client and collection are created once, at class-definition time,
    # and shared by every pipeline instance.
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    db = mongo[MONGO_DB]
    # Fall back to 'sites' (matching the chinaz pipeline) when
    # CRAWLAB_COLLECTION is unset — without a fallback, db[None]
    # raises TypeError at import time.
    col_name = os.environ.get('CRAWLAB_COLLECTION') or 'sites'
    col = db[col_name]

    def process_item(self, item, spider):
        """Tag the item with the Crawlab task id and insert it.

        Every item is inserted unconditionally (no de-duplication).
        Returns the (mutated) item for downstream pipelines.
        """
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        # Collection.save() was deprecated and removed in PyMongo 4;
        # with no _id set it always inserted, so insert_one() is a
        # drop-in replacement.
        self.col.insert_one(dict(item))
        return item
|
||||
|
||||
@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
|
||||
# Configure item pipelines
|
||||
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
ITEM_PIPELINES = {
|
||||
'realestate.pipelines.MongoPipeline': 300,
|
||||
'crawlab.pipelines.CrawlabMongoPipeline': 300,
|
||||
}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
|
||||
Reference in New Issue
Block a user