added demo spiders

marvzhang
2020-02-03 09:21:41 +08:00
parent 5740774ccc
commit cb6a8f79d8
24 changed files with 504 additions and 87 deletions

View File

@@ -6,9 +6,9 @@ type ConfigSpiderData struct {
    DisplayName string `yaml:"display_name" json:"display_name"`
    Col         string `yaml:"col" json:"col"`
    Remark      string `yaml:"remark" json:"remark"`
    Type        string `yaml:"type" bson:"type"`
    // configurable spider fields
    Version     string `yaml:"version" json:"version"`
    Engine      string `yaml:"engine" json:"engine"`
    StartUrl    string `yaml:"start_url" json:"start_url"`
    StartStage  string `yaml:"start_stage" json:"start_stage"`
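The yaml tags above mirror the top-level keys of the demo Spiderfiles added below. A quick sanity check of a Spiderfile against those keys, sketched in Python — which keys are truly mandatory is an assumption (the realestate demo, for one, omits `type`):

```python
import yaml  # PyYAML

# assumed-required keys; Crawlab itself may be more lenient
REQUIRED = {'name', 'type'}
CONFIGURABLE = {'engine', 'start_url', 'start_stage', 'stages'}

with open('Spiderfile') as f:
    data = yaml.safe_load(f)

missing = REQUIRED - data.keys()
if data.get('type') == 'configurable':
    missing |= CONFIGURABLE - data.keys()
print('missing keys:', ', '.join(sorted(missing)) or 'none')
```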

View File

@@ -6,6 +6,7 @@ import (
    "crawlab/entity"
    "crawlab/model"
    "crawlab/model/config_spider"
+   "crawlab/services/spider_handler"
    "crawlab/utils"
    "errors"
    "fmt"
@@ -227,6 +228,17 @@ func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.Con
    spider.FileId = fid
    _ = spider.Save()
+   // get a spider sync instance
+   spiderSync := spider_handler.SpiderSync{
+       Spider: spider,
+   }
+   // get the GridFS file
+   gfFile2 := model.GetGridFs(spider.FileId)
+   // generate the MD5 file
+   spiderSync.CreateMd5File(gfFile2.Md5)
    return nil
}
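For context, the MD5 file written here is what lets worker nodes detect a stale local copy of a spider. A rough sketch of that check, in Python for brevity (the real logic is Go in `spider_handler`; the `md5.txt` filename is an assumption):

```python
import os

# Hypothetical staleness check mirroring what CreateMd5File enables:
# the master records the GridFS MD5, workers compare it against their copy.
def needs_sync(spider_dir: str, gridfs_md5: str) -> bool:
    md5_path = os.path.join(spider_dir, 'md5.txt')  # assumed filename
    if not os.path.exists(md5_path):
        return True  # never synced
    with open(md5_path) as f:
        return f.read().strip() != gridfs_md5  # re-sync on mismatch
```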

View File

@@ -313,30 +313,58 @@ func InitSpiderService() error {
            continue
        }
-       // add the spider to the database
-       spider = model.Spider{
-           Id:          bson.NewObjectId(),
-           Name:        configData.Name,
-           DisplayName: configData.DisplayName,
-           Type:        constants.Customized,
-           Col:         configData.Col,
-           Cmd:         configData.Cmd,
-           Src:         spiderPath,
-           Remark:      configData.Remark,
-           ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
-           FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
-       }
-       if err := spider.Add(); err != nil {
-           log.Errorf("add spider error: " + err.Error())
-           debug.PrintStack()
-           continue
-       }
-       // upload the spider to GridFS
-       if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
-           log.Errorf("upload spider error: " + err.Error())
-           debug.PrintStack()
-           continue
-       }
+       if configData.Type == constants.Customized {
+           // add the spider to the database
+           spider = model.Spider{
+               Id:          bson.NewObjectId(),
+               Name:        configData.Name,
+               DisplayName: configData.DisplayName,
+               Type:        constants.Customized,
+               Col:         configData.Col,
+               Src:         spiderPath,
+               Remark:      configData.Remark,
+               ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
+               FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
+               Cmd:         configData.Cmd,
+           }
+           if err := spider.Add(); err != nil {
+               log.Errorf("add spider error: " + err.Error())
+               debug.PrintStack()
+               continue
+           }
+           // upload the spider to GridFS
+           if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
+               log.Errorf("upload spider error: " + err.Error())
+               debug.PrintStack()
+               continue
+           }
+       } else if configData.Type == constants.Configurable || configData.Type == "config" {
+           // add the spider to the database
+           spider = model.Spider{
+               Id:          bson.NewObjectId(),
+               Name:        configData.Name,
+               DisplayName: configData.DisplayName,
+               Type:        constants.Configurable,
+               Col:         configData.Col,
+               Src:         spiderPath,
+               Remark:      configData.Remark,
+               ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
+               FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
+               Config:      configData,
+           }
+           if err := spider.Add(); err != nil {
+               log.Errorf("add spider error: " + err.Error())
+               debug.PrintStack()
+               continue
+           }
+           // process the spider files from the parsed config data
+           if err := ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+               log.Errorf("add spider error: " + err.Error())
+               debug.PrintStack()
+               continue
+           }
+       }
    }

View File

@@ -0,0 +1,51 @@
name: "amazon_config"
display_name: "亚马逊中国(可配置)"
remark: "亚马逊中国搜索手机,列表+分页"
type: "configurable"
col: "results_amazon_config"
engine: scrapy
start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2
start_stage: list
stages:
- name: list
is_list: true
list_css: .s-result-item
list_xpath: ""
page_css: .a-last > a
page_xpath: ""
page_attr: href
fields:
- name: title
css: span.a-text-normal
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: .a-link-normal
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: price
css: ""
xpath: .//*[@class="a-price-whole"]
attr: ""
next_stage: ""
remark: ""
- name: price_fraction
css: ""
xpath: .//*[@class="a-price-fraction"]
attr: ""
next_stage: ""
remark: ""
- name: img
css: .s-image-square-aspect > img
xpath: ""
attr: src
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36
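A Spiderfile like the one above drives a generated Scrapy spider: each stage selects list elements, each field is extracted via `css` or `xpath` (plus an optional `attr`), and `page_css`/`page_attr` define the pagination link. A minimal sketch of that interpretation — `ConfigSpider` and the hard-coded stage dict are illustrative, not Crawlab's actual generator:

```python
import scrapy

# Stage config lifted from the amazon_config Spiderfile above (trimmed).
STAGE = {
    'list_css': '.s-result-item',
    'page_css': '.a-last > a',
    'page_attr': 'href',
    'fields': [
        {'name': 'title', 'css': 'span.a-text-normal', 'attr': ''},
        {'name': 'url', 'css': '.a-link-normal', 'attr': 'href'},
        {'name': 'price', 'xpath': './/*[@class="a-price-whole"]', 'attr': ''},
    ],
}

class ConfigSpider(scrapy.Spider):
    name = 'amazon_config_sketch'
    start_urls = ['https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA']

    def parse(self, response):
        for el in response.css(STAGE['list_css']):
            item = {}
            for f in STAGE['fields']:
                if f.get('css'):
                    # attr set -> extract the attribute, else the text content
                    q = f['css'] + ('::attr(%s)' % f['attr'] if f['attr'] else ' ::text')
                    item[f['name']] = el.css(q).get()
                else:
                    q = f['xpath'] + ('/@' + f['attr'] if f['attr'] else '//text()')
                    item[f['name']] = el.xpath(q).get()
            yield item
        # follow the pagination link declared by page_css/page_attr
        next_url = response.css('%s::attr(%s)' % (STAGE['page_css'], STAGE['page_attr'])).get()
        if next_url:
            yield response.follow(next_url, callback=self.parse)
```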

View File

@@ -0,0 +1,57 @@
name: "autohome_config"
display_name: "汽车之家(可配置)"
remark: "汽车之家文章,列表+详情+分页"
type: "configurable"
col: "results_autohome_config"
engine: scrapy
start_url: https://www.autohome.com.cn/all/
start_stage: list
stages:
- name: list
is_list: true
list_css: ul.article > li
list_xpath: ""
page_css: a.page-item-next
page_xpath: ""
page_attr: href
fields:
- name: title
css: li > a > h3
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: li > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: abstract
css: li > a > p
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: time
css: li > a .fn-left
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: views
css: li > a .fn-right > em:first-child
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: comments
css: li > a .fn-right > em:last-child
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -0,0 +1,39 @@
name: "baidu_config"
display_name: "百度搜索(可配置)"
remark: "百度搜索Crawlab列表+分页"
type: "configurable"
col: "results_baidu_config"
engine: scrapy
start_url: http://www.baidu.com/s?wd=crawlab
start_stage: list
stages:
- name: list
is_list: true
list_css: ""
list_xpath: //body
page_css: ""
page_xpath: //body
page_attr: href
fields:
- name: title
css: ""
xpath: .//h3/a
attr: href
next_stage: ""
remark: ""
- name: url
css: ""
xpath: .//h3/a
attr: href
next_stage: ""
remark: ""
- name: abstract
css: ""
xpath: .//*[@class="c-abstract"]
attr: href
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -1,4 +1,5 @@
name: "chinaz"
display_name: "站长之家 (Scrapy)"
col: "results_chinaz"
type: "customized"
cmd: "scrapy crawl chinaz_spider"

View File

@@ -0,0 +1,60 @@
name: "csdn_config"
display_name: "CSDN可配置"
remark: "CSDN Crawlab 文章,列表+详情+分页"
type: "configurable"
col: "results_csdn_config"
engine: scrapy
start_url: https://so.csdn.net/so/search/s.do?q=crawlab
start_stage: list
stages:
- name: list
is_list: true
list_css: .search-list-con > .search-list
list_xpath: ""
page_css: a.btn-next
page_xpath: ""
page_attr: href
fields:
- name: url
css: ""
xpath: .//*[@class="limit_width"]/a
attr: href
next_stage: detail
remark: ""
- name: detail
is_list: false
list_css: ""
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: content
css: ""
xpath: .//div[@id="content_views"]
attr: ""
next_stage: ""
remark: ""
- name: views
css: .read-count
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: title
css: .title-article
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: author
css: .follow-nickName
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
AUTOTHROTTLE_ENABLED: "false"
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/79.0.3945.117 Safari/537.36
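Unlike the single-stage configs, this one chains stages: the `next_stage: detail` on the `url` field means each extracted link is followed and parsed by the `detail` stage. Roughly, in the same hypothetical generator terms as the sketch above:

```python
import scrapy

class TwoStageSketch(scrapy.Spider):
    name = 'csdn_config_sketch'
    start_urls = ['https://so.csdn.net/so/search/s.do?q=crawlab']

    def parse(self, response):
        # "list" stage: extract the url field, hand each link to the detail stage
        for row in response.css('.search-list-con > .search-list'):
            url = row.xpath('.//*[@class="limit_width"]/a/@href').get()
            if url:
                yield response.follow(url, callback=self.parse_detail)
        # pagination via page_css/page_attr
        next_page = response.css('a.btn-next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_detail(self, response):
        # "detail" stage: non-list, fields extracted from the article page
        yield {
            'title': response.css('.title-article::text').get(),
            'author': response.css('.follow-nickName::text').get(),
            'views': response.css('.read-count::text').get(),
            'content': response.xpath('.//div[@id="content_views"]').xpath('string(.)').get(),
        }
```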

View File

@@ -0,0 +1,57 @@
name: "douban_config"
display_name: "豆瓣读书(可配置)"
remark: "豆瓣读书新书推荐,列表"
type: "configurable"
col: "results_douban_config"
engine: scrapy
start_url: https://book.douban.com/latest
start_stage: list
stages:
- name: list
is_list: true
list_css: ul.cover-col-4 > li
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: title
css: h2 > a
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: h2 > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: img
css: a.cover img
xpath: ""
attr: src
next_stage: ""
remark: ""
- name: rating
css: p.rating > .color-lightgray
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: abstract
css: p:last-child
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: info
css: .color-gray
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

spiders/jd/Spiderfile Normal file
View File

@@ -0,0 +1,5 @@
name: "jd"
display_name: "京东 (Scrapy)"
col: "results_jd"
type: "customized"
cmd: "scrapy crawl jd_spider"

View File

@@ -12,3 +12,4 @@ class JdItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    price = scrapy.Field()
+   url = scrapy.Field()

View File

@@ -4,14 +4,3 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-from pymongo import MongoClient
-
-
-class JdPipeline(object):
-    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
-    db = mongo[MONGO_DB]
-    col_name = os.environ.get('CRAWLAB_COLLECTION') or 'jd_products'
-    col = db[col_name]
-
-    def process_item(self, item, spider):
-        return item

View File

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'jd.spiders'
#USER_AGENT = 'jd (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
-    'jd.pipelines.JdPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
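With the per-project pipelines gone, all demo projects point at `crawlab.pipelines.CrawlabMongoPipeline` from the Crawlab SDK. A sketch of what such a pipeline plausibly does, modeled on the pipelines removed in this commit (the SDK's actual internals may differ):

```python
import os
from pymongo import MongoClient

class MongoPipelineSketch(object):
    # connection parameters come from the environment Crawlab injects
    mongo = MongoClient(
        host=os.environ.get('MONGO_HOST') or 'localhost',
        port=int(os.environ.get('MONGO_PORT') or 27017),
    )
    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
    col = db[os.environ.get('CRAWLAB_COLLECTION') or 'results']

    def process_item(self, item, spider):
        # tag each item with the task id so results map back to a Crawlab task
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        self.col.insert_one(dict(item))  # insert_one instead of deprecated save()
        return item
```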

View File

@@ -1,11 +1,21 @@
# -*- coding: utf-8 -*-
import scrapy

+from jd.items import JdItem
+

class JdSpiderSpider(scrapy.Spider):
    name = 'jd_spider'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

+    def start_requests(self):
+        for i in range(1, 50):
+            yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}')
+
    def parse(self, response):
-        pass
+        for el in response.css('.gl-item'):
+            yield JdItem(
+                url=el.css('.p-name > a::attr("href")').extract_first(),
+                name=el.css('.p-name > a::attr("title")').extract_first(),
+                price=float(el.css('.p-price i::text').extract_first()),
+            )
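One caveat in the parse body above: `float(...)` raises `TypeError` whenever `.p-price i::text` matches nothing. A slightly safer variant of the method, reusing the imports above:

```python
    def parse(self, response):
        for el in response.css('.gl-item'):
            price_text = el.css('.p-price i::text').extract_first()
            yield JdItem(
                url=el.css('.p-name > a::attr("href")').extract_first(),
                name=el.css('.p-name > a::attr("title")').extract_first(),
                price=float(price_text) if price_text else None,  # skip cast when price is missing
            )
```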

View File

@@ -0,0 +1,4 @@
name: "realestate"
display_name: "链家网 (Scrapy)"
col: "results_realestate"
cmd: "scrapy crawl lianjia"

View File

@@ -0,0 +1,5 @@
name: "sinastock"
display_name: "新浪股票 (Scrapy)"
type: "customized"
col: "results_sinastock"
cmd: "scrapy crawl sinastock_spider"

View File

@@ -4,25 +4,3 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-
-class SinastockPipeline(object):
-    mongo = MongoClient(
-        host=os.environ.get('MONGO_HOST') or 'localhost',
-        port=int(os.environ.get('MONGO_PORT') or 27017)
-    )
-    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
-    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')
-
-    # create indexes
-    col.create_index('stocks')
-    col.create_index('url')
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        if self.col.find_one({'url': item['url']}) is None:
-            self.col.save(item)
-        return item

View File

@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
-    'sinastock.pipelines.SinastockPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -0,0 +1,54 @@
name: "v2ex_config"
display_name: "V2ex可配置"
remark: "V2ex列表+详情"
type: "configurable"
col: "results_v2ex_config"
engine: scrapy
start_url: https://v2ex.com/
start_stage: list
stages:
- name: list
is_list: true
list_css: .cell.item
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: href
fields:
- name: title
css: a.topic-link
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: a.topic-link
xpath: ""
attr: href
next_stage: detail
remark: ""
- name: replies
css: .count_livid
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: detail
is_list: false
list_css: ""
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: content
css: ""
xpath: .//*[@class="markdown_body"]
attr: ""
next_stage: ""
remark: ""
settings:
AUTOTHROTTLE_ENABLED: "true"
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/79.0.3945.117 Safari/537.36

View File

@@ -0,0 +1,5 @@
name: "xueqiu"
display_name: "雪球网 (Scrapy)"
type: "customized"
col: "results_xueqiu"
cmd: "scrapy crawl xueqiu_spider"

View File

@@ -4,26 +4,3 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-
-class XueqiuPipeline(object):
-    mongo = MongoClient(
-        host=os.environ.get('MONGO_HOST') or 'localhost',
-        port=int(os.environ.get('MONGO_PORT') or 27017)
-    )
-    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
-    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')
-
-    # create indexes
-    col.create_index('stocks')
-    col.create_index('id')
-    col.create_index('url')
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        if self.col.find_one({'id': item['id']}) is None:
-            self.col.save(item)
-        return item

View File

@@ -18,7 +18,7 @@ NEWSPIDER_MODULE = 'xueqiu.spiders'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
-    'xueqiu.pipelines.XueqiuPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -0,0 +1,39 @@
name: "xueqiu_config"
display_name: "雪球网(可配置)"
remark: "雪球网新闻,列表"
type: "configurable"
col: "results_xueqiu_config"
engine: scrapy
start_url: https://xueqiu.com/
start_stage: list
stages:
- name: list
is_list: true
list_css: ""
list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")]
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: title
css: h3 > a
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: h3 > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: abstract
css: p
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -0,0 +1,45 @@
name: "zongheng_config"
display_name: "纵横(可配置)"
remark: "纵横小说网,列表"
type: "configurable"
col: "results_zongheng_config"
engine: scrapy
start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1
start_stage: list
stages:
- name: list
is_list: true
list_css: .rank_d_list
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: href
fields:
- name: title
css: .rank_d_b_name > a
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: .rank_d_b_name > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: abstract
css: body
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: votes
css: .rank_d_b_ticket
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36