Mirror of https://github.com/crawlab-team/crawlab.git (synced 2026-01-22 17:31:03 +01:00)

Commit: added demo spiders
@@ -6,9 +6,9 @@ type ConfigSpiderData struct {
 	DisplayName string `yaml:"display_name" json:"display_name"`
 	Col         string `yaml:"col" json:"col"`
 	Remark      string `yaml:"remark" json:"remark"`
 	Type        string `yaml:"type" bson:"type"`
+
+	// configurable spider
+	Version     string `yaml:"version" json:"version"`
+	Engine      string `yaml:"engine" json:"engine"`
+	StartUrl    string `yaml:"start_url" json:"start_url"`
+	StartStage  string `yaml:"start_stage" json:"start_stage"`

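The yaml tags above double as the Spiderfile schema used by the demo spiders added below. As a quick cross-check (illustrative only, not part of this commit; assumes PyYAML is installed and the path exists):

import yaml

# Load a demo Spiderfile and read the keys that map onto ConfigSpiderData's yaml tags.
with open("spiders/amazon_config/Spiderfile") as f:
    data = yaml.safe_load(f)

for key in ("name", "display_name", "col", "remark", "type",
            "engine", "start_url", "start_stage"):
    print(f"{key}: {data.get(key)}")
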
@@ -6,6 +6,7 @@ import (
 	"crawlab/entity"
 	"crawlab/model"
 	"crawlab/model/config_spider"
+	"crawlab/services/spider_handler"
 	"crawlab/utils"
 	"errors"
 	"fmt"

@@ -227,6 +228,17 @@ func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.Con
 	spider.FileId = fid
 	_ = spider.Save()
+
+	// get the spider sync instance
+	spiderSync := spider_handler.SpiderSync{
+		Spider: spider,
+	}
+
+	// get the GridFS file
+	gfFile2 := model.GetGridFs(spider.FileId)
+
+	// generate the MD5
+	spiderSync.CreateMd5File(gfFile2.Md5)
 
 	return nil
 }

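The MD5 recorded here is what lets nodes decide whether their local copy of a spider is stale before re-downloading it from GridFS. Conceptually, in Python rather than the project's Go (the md5.txt marker name and layout are assumptions):

import hashlib
import os

def file_md5(path):
    # hash the file in chunks so large archives do not load fully into memory
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

def is_stale(spider_dir, gridfs_md5):
    marker = os.path.join(spider_dir, "md5.txt")  # assumed marker file
    if not os.path.exists(marker):
        return True
    with open(marker) as f:
        return f.read().strip() != gridfs_md5
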
@@ -313,30 +313,58 @@ func InitSpiderService() error {
 			continue
 		}
 
-		// add the spider to the database
-		spider = model.Spider{
-			Id:          bson.NewObjectId(),
-			Name:        configData.Name,
-			DisplayName: configData.DisplayName,
-			Type:        constants.Customized,
-			Col:         configData.Col,
-			Cmd:         configData.Cmd,
-			Src:         spiderPath,
-			Remark:      configData.Remark,
-			ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
-			FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
-		}
-		if err := spider.Add(); err != nil {
-			log.Errorf("add spider error: " + err.Error())
-			debug.PrintStack()
-			continue
-		}
-
-		// upload the spider to GridFS
-		if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
-			log.Errorf("upload spider error: " + err.Error())
-			debug.PrintStack()
-			continue
-		}
+		if configData.Type == constants.Customized {
+			// add the spider to the database
+			spider = model.Spider{
+				Id:          bson.NewObjectId(),
+				Name:        configData.Name,
+				DisplayName: configData.DisplayName,
+				Type:        constants.Customized,
+				Col:         configData.Col,
+				Src:         spiderPath,
+				Remark:      configData.Remark,
+				ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
+				FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
+				Cmd:         configData.Cmd,
+			}
+			if err := spider.Add(); err != nil {
+				log.Errorf("add spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+
+			// upload the spider to GridFS
+			if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
+				log.Errorf("upload spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+		} else if configData.Type == constants.Configurable || configData.Type == "config" {
+			// add the spider to the database
+			spider = model.Spider{
+				Id:          bson.NewObjectId(),
+				Name:        configData.Name,
+				DisplayName: configData.DisplayName,
+				Type:        constants.Configurable,
+				Col:         configData.Col,
+				Src:         spiderPath,
+				Remark:      configData.Remark,
+				ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
+				FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
+				Config:      configData,
+			}
+			if err := spider.Add(); err != nil {
+				log.Errorf("add spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+
+			// process the spider files from the parsed config data
+			if err := ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+				log.Errorf("add spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+		}
 	}
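Net effect: InitSpiderService now registers each demo spider according to its Spiderfile type. Customized projects are uploaded to GridFS as-is, while configurable ones additionally run through ProcessSpiderFilesFromConfigData to generate their code. A rough Python sketch of that dispatch (the "configurable"/"config"/"customized" strings come from the Spiderfiles below; the rest is illustrative):

import os
import yaml

for name in sorted(os.listdir("spiders")):
    spiderfile = os.path.join("spiders", name, "Spiderfile")
    if not os.path.isfile(spiderfile):
        continue  # directory without a Spiderfile: skip it
    with open(spiderfile) as f:
        config = yaml.safe_load(f)
    if config.get("type") in ("configurable", "config"):
        print(name, "-> generate spider code from its stages/fields, then upload")
    else:  # "customized", or no type at all (see realestate/Spiderfile)
        print(name, "-> upload the existing Scrapy project as-is")
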
spiders/amazon_config/Spiderfile (new file, 51 lines)
@@ -0,0 +1,51 @@
name: "amazon_config"
display_name: "亚马逊中国(可配置)"
remark: "亚马逊中国搜索手机,列表+分页"
type: "configurable"
col: "results_amazon_config"
engine: scrapy
start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2
start_stage: list
stages:
- name: list
  is_list: true
  list_css: .s-result-item
  list_xpath: ""
  page_css: .a-last > a
  page_xpath: ""
  page_attr: href
  fields:
  - name: title
    css: span.a-text-normal
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: url
    css: .a-link-normal
    xpath: ""
    attr: href
    next_stage: ""
    remark: ""
  - name: price
    css: ""
    xpath: .//*[@class="a-price-whole"]
    attr: ""
    next_stage: ""
    remark: ""
  - name: price_fraction
    css: ""
    xpath: .//*[@class="a-price-fraction"]
    attr: ""
    next_stage: ""
    remark: ""
  - name: img
    css: .s-image-square-aspect > img
    xpath: ""
    attr: src
    next_stage: ""
    remark: ""
settings:
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

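Each field in a stage carries either a CSS or an XPath selector, plus an optional attr to extract instead of the node's text. Here is how such a field spec can be evaluated, shown with parsel (the selector library underlying Scrapy); a sketch under those assumptions, not Crawlab's actual generated code:

from parsel import Selector

html = '<div class="s-result-item"><span class="a-text-normal">Demo phone</span></div>'
field = {"name": "title", "css": "span.a-text-normal", "xpath": "", "attr": ""}

sel = Selector(text=html)
# prefer the CSS selector when present, otherwise fall back to XPath
node = sel.css(field["css"]) if field["css"] else sel.xpath(field["xpath"])
# with attr set, read that attribute; otherwise take the node's text content
value = node.attrib.get(field["attr"]) if field["attr"] else node.xpath("string(.)").get()
print(field["name"], "=", value)  # title = Demo phone
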
spiders/autohome_config/Spiderfile (new file, 57 lines)
@@ -0,0 +1,57 @@
name: "autohome_config"
display_name: "汽车之家(可配置)"
remark: "汽车之家文章,列表+详情+分页"
type: "configurable"
col: "results_autohome_config"
engine: scrapy
start_url: https://www.autohome.com.cn/all/
start_stage: list
stages:
- name: list
  is_list: true
  list_css: ul.article > li
  list_xpath: ""
  page_css: a.page-item-next
  page_xpath: ""
  page_attr: href
  fields:
  - name: title
    css: li > a > h3
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: url
    css: li > a
    xpath: ""
    attr: href
    next_stage: ""
    remark: ""
  - name: abstract
    css: li > a > p
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: time
    css: li > a .fn-left
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: views
    css: li > a .fn-right > em:first-child
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: comments
    css: li > a .fn-right > em:last-child
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
settings:
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

spiders/baidu_config/Spiderfile (new file, 39 lines)
@@ -0,0 +1,39 @@
name: "baidu_config"
display_name: "百度搜索(可配置)"
remark: "百度搜索Crawlab,列表+分页"
type: "configurable"
col: "results_baidu_config"
engine: scrapy
start_url: http://www.baidu.com/s?wd=crawlab
start_stage: list
stages:
- name: list
  is_list: true
  list_css: ""
  list_xpath: //body
  page_css: ""
  page_xpath: //body
  page_attr: href
  fields:
  - name: title
    css: ""
    xpath: .//h3/a
    attr: href
    next_stage: ""
    remark: ""
  - name: url
    css: ""
    xpath: .//h3/a
    attr: href
    next_stage: ""
    remark: ""
  - name: abstract
    css: ""
    xpath: .//*[@class="c-abstract"]
    attr: href
    next_stage: ""
    remark: ""
settings:
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

@@ -1,4 +1,5 @@
 name: "chinaz"
 display_name: "站长之家 (Scrapy)"
 col: "results_chinaz"
+type: "customized"
 cmd: "scrapy crawl chinaz_spider"

spiders/csdn_config/Spiderfile (new file, 60 lines)
@@ -0,0 +1,60 @@
name: "csdn_config"
display_name: "CSDN(可配置)"
remark: "CSDN Crawlab 文章,列表+详情+分页"
type: "configurable"
col: "results_csdn_config"
engine: scrapy
start_url: https://so.csdn.net/so/search/s.do?q=crawlab
start_stage: list
stages:
- name: list
  is_list: true
  list_css: .search-list-con > .search-list
  list_xpath: ""
  page_css: a.btn-next
  page_xpath: ""
  page_attr: href
  fields:
  - name: url
    css: ""
    xpath: .//*[@class="limit_width"]/a
    attr: href
    next_stage: detail
    remark: ""
- name: detail
  is_list: false
  list_css: ""
  list_xpath: ""
  page_css: ""
  page_xpath: ""
  page_attr: ""
  fields:
  - name: content
    css: ""
    xpath: .//div[@id="content_views"]
    attr: ""
    next_stage: ""
    remark: ""
  - name: views
    css: .read-count
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: title
    css: .title-article
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: author
    css: .follow-nickName
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
settings:
  AUTOTHROTTLE_ENABLED: "false"
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36

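The url field's next_stage: detail is what chains the two stages: every URL extracted in the list stage is followed and parsed with the detail stage's fields, while page_css drives pagination. Expressed directly in Scrapy, the generated spider would do roughly this (a sketch, not Crawlab's actual generated code):

import scrapy


class CsdnConfigSketch(scrapy.Spider):
    name = "csdn_config_sketch"  # hypothetical name
    start_urls = ["https://so.csdn.net/so/search/s.do?q=crawlab"]

    def parse(self, response):
        # "list" stage: one row per .search-list entry
        for row in response.css(".search-list-con > .search-list"):
            url = row.xpath('.//*[@class="limit_width"]/a/@href').get()
            if url:
                # next_stage: detail -> follow the link into the detail callback
                yield response.follow(url, callback=self.parse_detail)
        # pagination: page_css a.btn-next with page_attr href
        next_page = response.css("a.btn-next::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_detail(self, response):
        # "detail" stage fields
        yield {
            "title": response.css(".title-article::text").get(),
            "author": response.css(".follow-nickName::text").get(),
            "views": response.css(".read-count::text").get(),
        }
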
spiders/douban_config/Spiderfile (new file, 57 lines)
@@ -0,0 +1,57 @@
name: "douban_config"
display_name: "豆瓣读书(可配置)"
remark: "豆瓣读书新书推荐,列表"
type: "configurable"
col: "results_douban_config"
engine: scrapy
start_url: https://book.douban.com/latest
start_stage: list
stages:
- name: list
  is_list: true
  list_css: ul.cover-col-4 > li
  list_xpath: ""
  page_css: ""
  page_xpath: ""
  page_attr: ""
  fields:
  - name: title
    css: h2 > a
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: url
    css: h2 > a
    xpath: ""
    attr: href
    next_stage: ""
    remark: ""
  - name: img
    css: a.cover img
    xpath: ""
    attr: src
    next_stage: ""
    remark: ""
  - name: rating
    css: p.rating > .color-lightgray
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: abstract
    css: p:last-child
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: info
    css: .color-gray
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
settings:
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

spiders/jd/Spiderfile (new file, 5 lines)
@@ -0,0 +1,5 @@
name: "jd"
display_name: "京东 (Scrapy)"
col: "results_jd"
type: "customized"
cmd: "scrapy crawl jd_spider"

@@ -12,3 +12,4 @@ class JdItem(scrapy.Item):
     # define the fields for your item here like:
     name = scrapy.Field()
     price = scrapy.Field()
+    url = scrapy.Field()

@@ -4,14 +4,3 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-from pymongo import MongoClient
-
-
-class JdPipeline(object):
-    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
-    db = mongo[MONGO_DB]
-    col_name = os.environ.get('CRAWLAB_COLLECTION') or 'jd_products'
-    col = db[col_name]
-
-    def process_item(self, item, spider):
-        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'jd.spiders'
 #USER_AGENT = 'jd (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32

@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'jd.pipelines.JdPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)

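Replacing the hand-written per-project pipeline with crawlab.pipelines.CrawlabMongoPipeline (from the Crawlab SDK) centralizes result storage: the pipeline reads the Mongo connection and target collection from environment variables that Crawlab injects, the same variables the deleted JdPipeline was reaching for. A minimal sketch of such an env-driven pipeline (the real SDK class may differ):

import os

from pymongo import MongoClient


class EnvMongoPipeline(object):  # hypothetical stand-in, not the actual SDK class
    def open_spider(self, spider):
        self.client = MongoClient(
            host=os.environ.get('MONGO_HOST') or 'localhost',
            port=int(os.environ.get('MONGO_PORT') or 27017),
        )
        db = self.client[os.environ.get('MONGO_DB') or 'crawlab_test']
        self.col = db[os.environ.get('CRAWLAB_COLLECTION') or 'results']

    def process_item(self, item, spider):
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        self.col.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
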
@@ -1,11 +1,21 @@
 # -*- coding: utf-8 -*-
 import scrapy
 
+from jd.items import JdItem
+
 
 class JdSpiderSpider(scrapy.Spider):
     name = 'jd_spider'
     allowed_domains = ['jd.com']
     start_urls = ['http://jd.com/']
 
+    def start_requests(self):
+        for i in range(1, 50):
+            yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}')
+
     def parse(self, response):
-        pass
+        for el in response.css('.gl-item'):
+            yield JdItem(
+                url=el.css('.p-name > a::attr("href")').extract_first(),
+                name=el.css('.p-name > a::attr("title")').extract_first(),
+                price=float(el.css('.p-price i::text').extract_first()),
+            )

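One caveat in the new parse body: extract_first() returns None when a selector matches nothing, so float(...) will raise on result tiles without a price (ads, placeholders). A drop-in defensive variant of the parse method above, offered only as a suggestion:

    def parse(self, response):
        for el in response.css('.gl-item'):
            price_text = el.css('.p-price i::text').extract_first()
            if price_text is None:
                continue  # tile without a price: skip instead of crashing
            yield JdItem(
                url=el.css('.p-name > a::attr("href")').extract_first(),
                name=el.css('.p-name > a::attr("title")').extract_first(),
                price=float(price_text),
            )
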
spiders/realestate/Spiderfile (new file, 4 lines)
@@ -0,0 +1,4 @@
name: "realestate"
display_name: "链家网 (Scrapy)"
col: "results_realestate"
cmd: "scrapy crawl lianjia"

spiders/sinastock/Spiderfile (new file, 5 lines)
@@ -0,0 +1,5 @@
name: "sinastock"
display_name: "新浪股票 (Scrapy)"
type: "customized"
col: "results_sinastock"
cmd: "scrapy crawl sinastock_spider"

@@ -4,25 +4,3 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-
-class SinastockPipeline(object):
-    mongo = MongoClient(
-        host=os.environ.get('MONGO_HOST') or 'localhost',
-        port=int(os.environ.get('MONGO_PORT') or 27017)
-    )
-    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
-    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')
-
-    # create indexes
-    col.create_index('stocks')
-    col.create_index('url')
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        if self.col.find_one({'url': item['url']}) is None:
-            self.col.save(item)
-        return item

@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'sinastock.pipelines.SinastockPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)

spiders/v2ex_config/Spiderfile (new file, 54 lines)
@@ -0,0 +1,54 @@
name: "v2ex_config"
display_name: "V2ex(可配置)"
remark: "V2ex,列表+详情"
type: "configurable"
col: "results_v2ex_config"
engine: scrapy
start_url: https://v2ex.com/
start_stage: list
stages:
- name: list
  is_list: true
  list_css: .cell.item
  list_xpath: ""
  page_css: ""
  page_xpath: ""
  page_attr: href
  fields:
  - name: title
    css: a.topic-link
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: url
    css: a.topic-link
    xpath: ""
    attr: href
    next_stage: detail
    remark: ""
  - name: replies
    css: .count_livid
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
- name: detail
  is_list: false
  list_css: ""
  list_xpath: ""
  page_css: ""
  page_xpath: ""
  page_attr: ""
  fields:
  - name: content
    css: ""
    xpath: .//*[@class="markdown_body"]
    attr: ""
    next_stage: ""
    remark: ""
settings:
  AUTOTHROTTLE_ENABLED: "true"
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36

spiders/xueqiu/Spiderfile (new file, 5 lines)
@@ -0,0 +1,5 @@
name: "xueqiu"
display_name: "雪球网 (Scrapy)"
type: "customized"
col: "results_xueqiu"
cmd: "scrapy crawl xueqiu_spider"

@@ -4,26 +4,3 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-
-class XueqiuPipeline(object):
-    mongo = MongoClient(
-        host=os.environ.get('MONGO_HOST') or 'localhost',
-        port=int(os.environ.get('MONGO_PORT') or 27017)
-    )
-    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
-    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')
-
-    # create indexes
-    col.create_index('stocks')
-    col.create_index('id')
-    col.create_index('url')
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        if self.col.find_one({'id': item['id']}) is None:
-            self.col.save(item)
-        return item

@@ -18,7 +18,7 @@ NEWSPIDER_MODULE = 'xueqiu.spiders'
 USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 # CONCURRENT_REQUESTS = 32

@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'xueqiu.pipelines.XueqiuPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)

spiders/xueqiu_config/Spiderfile (new file, 39 lines)
@@ -0,0 +1,39 @@
name: "xueqiu_config"
display_name: "雪球网(可配置)"
remark: "雪球网新闻,列表"
type: "configurable"
col: "results_xueqiu_config"
engine: scrapy
start_url: https://xueqiu.com/
start_stage: list
stages:
- name: list
  is_list: true
  list_css: ""
  list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")]
  page_css: ""
  page_xpath: ""
  page_attr: ""
  fields:
  - name: title
    css: h3 > a
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: url
    css: h3 > a
    xpath: ""
    attr: href
    next_stage: ""
    remark: ""
  - name: abstract
    css: p
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
settings:
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

spiders/zongheng_config/Spiderfile (new file, 45 lines)
@@ -0,0 +1,45 @@
name: "zongheng_config"
display_name: "纵横(可配置)"
remark: "纵横小说网,列表"
type: "configurable"
col: "results_zongheng_config"
engine: scrapy
start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1
start_stage: list
stages:
- name: list
  is_list: true
  list_css: .rank_d_list
  list_xpath: ""
  page_css: ""
  page_xpath: ""
  page_attr: href
  fields:
  - name: title
    css: .rank_d_b_name > a
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: url
    css: .rank_d_b_name > a
    xpath: ""
    attr: href
    next_stage: ""
    remark: ""
  - name: abstract
    css: body
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: votes
    css: .rank_d_b_ticket
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
settings:
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36