Mirror of https://github.com/crawlab-team/crawlab.git, synced 2026-01-22 17:31:03 +01:00

Commit: support send log to ES
1  .gitignore  vendored
@@ -1,4 +1,5 @@
.idea/
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
51  backend/app/spiders/amazon_config/Spiderfile  Executable file
@@ -0,0 +1,51 @@
name: "amazon_config"
display_name: "亚马逊中国(可配置)"
remark: "亚马逊中国搜索手机,列表+分页"
type: "configurable"
col: "results_amazon_config"
engine: scrapy
start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2
start_stage: list
stages:
- name: list
  is_list: true
  list_css: .s-result-item
  list_xpath: ""
  page_css: .a-last > a
  page_xpath: ""
  page_attr: href
  fields:
  - name: title
    css: span.a-text-normal
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: url
    css: .a-link-normal
    xpath: ""
    attr: href
    next_stage: ""
    remark: ""
  - name: price
    css: ""
    xpath: .//*[@class="a-price-whole"]
    attr: ""
    next_stage: ""
    remark: ""
  - name: price_fraction
    css: ""
    xpath: .//*[@class="a-price-fraction"]
    attr: ""
    next_stage: ""
    remark: ""
  - name: img
    css: .s-image-square-aspect > img
    xpath: ""
    attr: src
    next_stage: ""
    remark: ""
settings:
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
    like Gecko) Chrome/78.0.3904.108 Safari/537.36
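The Spiderfile above is purely declarative: Crawlab's configurable-spider engine turns each stage and field definition into the generated config_spider Scrapy project listed below. A minimal sketch of how one field rule could be applied to a selector, assuming PyYAML and Scrapy's selector API (load_spiderfile and extract_field are illustrative helpers, not part of this commit):

# Hedged sketch: reading the Spiderfile and applying one field rule.
# load_spiderfile/extract_field are hypothetical helpers for illustration.
import yaml

def load_spiderfile(path='Spiderfile'):
    # Parse the YAML configuration shown above into a plain dict.
    with open(path, encoding='utf-8') as f:
        return yaml.safe_load(f)

def extract_field(elem, field):
    # Prefer the CSS rule, fall back to XPath; 'attr' switches between
    # attribute extraction and text extraction, mirroring the generated spider.
    if field.get('css'):
        suffix = '::attr(%s)' % field['attr'] if field.get('attr') else '::text'
        return elem.css(field['css'] + suffix).extract_first()
    if field.get('xpath'):
        if field.get('attr'):
            return elem.xpath(field['xpath'] + '/@' + field['attr']).extract_first()
        return elem.xpath('string(%s)' % field['xpath']).extract_first()
    return None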
0  backend/app/spiders/amazon_config/config_spider/__init__.py  Executable file
20  backend/app/spiders/amazon_config/config_spider/items.py  Executable file
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Item(scrapy.Item):
    _id = scrapy.Field()
    task_id = scrapy.Field()
    ts = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    price = scrapy.Field()
    price_fraction = scrapy.Field()
    img = scrapy.Field()
103  backend/app/spiders/amazon_config/config_spider/middlewares.py  Executable file
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ConfigSpiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ConfigSpiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
27  backend/app/spiders/amazon_config/config_spider/pipelines.py  Executable file
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import os
from pymongo import MongoClient

mongo = MongoClient(
    host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
    port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017),
    username=os.environ.get('CRAWLAB_MONGO_USERNAME'),
    password=os.environ.get('CRAWLAB_MONGO_PASSWORD'),
    authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin'
)
db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
task_id = os.environ.get('CRAWLAB_TASK_ID')

class ConfigSpiderPipeline(object):
    def process_item(self, item, spider):
        item['task_id'] = task_id
        if col is not None:
            col.save(item)
        return item
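The pipeline above persists each item with Collection.save(), which PyMongo deprecated in 3.0 and removed in 4.0. A hedged equivalent under the same col/task_id globals, using replace_one/insert_one (sketch only, not part of the commit):

# Hedged alternative sketch: upsert by _id when present instead of the
# deprecated Collection.save(); assumes the same col/task_id globals as above.
class ConfigSpiderPipeline(object):
    def process_item(self, item, spider):
        item['task_id'] = task_id
        doc = dict(item)
        if col is not None:
            if doc.get('_id') is not None:
                # Replace the existing document, or insert it if missing.
                col.replace_one({'_id': doc['_id']}, doc, upsert=True)
            else:
                col.insert_one(doc)
        return item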
111  backend/app/spiders/amazon_config/config_spider/settings.py  Executable file
@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
import os
import re
import json

# Scrapy settings for config_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Crawlab Configurable Spider'

SPIDER_MODULES = ['config_spider.spiders']
NEWSPIDER_MODULE = 'config_spider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Crawlab Spider'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'config_spider.pipelines.ConfigSpiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]:
    setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '')
    setting_value = os.environ.get(setting_env_name)
    if setting_value.lower() == 'true':
        setting_value = True
    elif setting_value.lower() == 'false':
        setting_value = False
    elif re.search(r'^\d+$', setting_value) is not None:
        setting_value = int(setting_value)
    elif re.search(r'^\{.*\}$', setting_value.strip()) is not None:
        setting_value = json.loads(setting_value)
    elif re.search(r'^\[.*\]$', setting_value.strip()) is not None:
        setting_value = json.loads(setting_value)
    else:
        pass
    locals()[setting_name] = setting_value
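The loop at the end of settings.py maps every CRAWLAB_SETTING_* environment variable onto a module-level Scrapy setting, coercing booleans, integers, and JSON objects or arrays. The same coercion rule, pulled out as a standalone illustration (hypothetical helper, not in the committed file):

# Illustrative helper showing the coercion rule used by the loop above.
import json
import re

def coerce_setting(value):
    # 'true'/'false' become booleans, all-digit strings become ints,
    # JSON-looking strings are parsed, everything else stays a string.
    if value.lower() == 'true':
        return True
    if value.lower() == 'false':
        return False
    if re.search(r'^\d+$', value):
        return int(value)
    if re.search(r'^\{.*\}$', value.strip()) or re.search(r'^\[.*\]$', value.strip()):
        return json.loads(value)
    return value

# e.g. CRAWLAB_SETTING_ROBOTSTXT_OBEY=false -> ROBOTSTXT_OBEY = False,
# CRAWLAB_SETTING_DOWNLOAD_DELAY=3 -> DOWNLOAD_DELAY = 3 (int),
# a JSON string for DEFAULT_REQUEST_HEADERS -> a dict.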
4  backend/app/spiders/amazon_config/config_spider/spiders/__init__.py  Executable file
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
37  backend/app/spiders/amazon_config/config_spider/spiders/spider.py  Executable file
@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item
from urllib.parse import urljoin, urlparse

def get_real_url(response, url):
    if re.search(r'^https?', url):
        return url
    elif re.search(r'^\/\/', url):
        u = urlparse(response.url)
        return u.scheme + url
    return urljoin(response.url, url)

class ConfigSpider(scrapy.Spider):
    name = 'config_spider'

    def start_requests(self):
        yield scrapy.Request(url='https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2', callback=self.parse_list)

    def parse_list(self, response):
        prev_item = response.meta.get('item')
        for elem in response.css('.s-result-item'):
            item = Item()
            item['title'] = elem.css('span.a-text-normal::text').extract_first()
            item['url'] = elem.css('.a-link-normal::attr("href")').extract_first()
            item['price'] = elem.xpath('string(.//*[@class="a-price-whole"])').extract_first()
            item['price_fraction'] = elem.xpath('string(.//*[@class="a-price-fraction"])').extract_first()
            item['img'] = elem.css('.s-image-square-aspect > img::attr("src")').extract_first()
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            yield item
        next_url = response.css('.a-last > a::attr("href")').extract_first()
        yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item})
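One caveat in the generated parse_list: on the last results page '.a-last > a' matches nothing, extract_first() returns None, and get_real_url() then calls re.search on None and raises a TypeError. A hedged sketch of the same method tail with a guard (not part of the commit):

# Hedged sketch: end pagination cleanly when the "next page" link is absent,
# instead of passing None into get_real_url(). Field extraction unchanged.
def parse_list(self, response):
    prev_item = response.meta.get('item')
    # ... same field extraction and item yielding as in the file above ...
    next_url = response.css('.a-last > a::attr("href")').extract_first()
    if next_url:  # on the last page this selector yields None
        yield scrapy.Request(url=get_real_url(response, next_url),
                             callback=self.parse_list,
                             meta={'item': prev_item})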
1  backend/app/spiders/amazon_config/md5.txt  Executable file
@@ -0,0 +1 @@
4b716dd3c15b993ccb7a9f0be1cc0de9
11  backend/app/spiders/amazon_config/scrapy.cfg  Executable file
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = config_spider.settings

[deploy]
#url = http://localhost:6800/
project = config_spider
57  backend/app/spiders/autohome_config/Spiderfile  Executable file
@@ -0,0 +1,57 @@
name: "autohome_config"
display_name: "汽车之家(可配置)"
remark: "汽车之家文章,列表+详情+分页"
type: "configurable"
col: "results_autohome_config"
engine: scrapy
start_url: https://www.autohome.com.cn/all/
start_stage: list
stages:
- name: list
  is_list: true
  list_css: ul.article > li
  list_xpath: ""
  page_css: a.page-item-next
  page_xpath: ""
  page_attr: href
  fields:
  - name: title
    css: li > a > h3
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: url
    css: li > a
    xpath: ""
    attr: href
    next_stage: ""
    remark: ""
  - name: abstract
    css: li > a > p
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: time
    css: li > a .fn-left
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: views
    css: li > a .fn-right > em:first-child
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: comments
    css: li > a .fn-right > em:last-child
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
settings:
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
    like Gecko) Chrome/78.0.3904.108 Safari/537.36
0  backend/app/spiders/autohome_config/config_spider/__init__.py  Executable file
21  backend/app/spiders/autohome_config/config_spider/items.py  Executable file
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Item(scrapy.Item):
    _id = scrapy.Field()
    task_id = scrapy.Field()
    ts = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    abstract = scrapy.Field()
    time = scrapy.Field()
    views = scrapy.Field()
    comments = scrapy.Field()
103  backend/app/spiders/autohome_config/config_spider/middlewares.py  Executable file
@@ -0,0 +1,103 @@
(Identical to backend/app/spiders/amazon_config/config_spider/middlewares.py above: the stock Scrapy template defining ConfigSpiderSpiderMiddleware and ConfigSpiderDownloaderMiddleware.)
27  backend/app/spiders/autohome_config/config_spider/pipelines.py  Executable file
@@ -0,0 +1,27 @@
(Identical to backend/app/spiders/amazon_config/config_spider/pipelines.py above: ConfigSpiderPipeline writing items to MongoDB via the CRAWLAB_MONGO_* environment variables.)
111  backend/app/spiders/autohome_config/config_spider/settings.py  Executable file
@@ -0,0 +1,111 @@
(Identical to backend/app/spiders/amazon_config/config_spider/settings.py above, including the CRAWLAB_SETTING_* environment-variable loop.)
4  backend/app/spiders/autohome_config/config_spider/spiders/__init__.py  Executable file
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
38  backend/app/spiders/autohome_config/config_spider/spiders/spider.py  Executable file
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item
from urllib.parse import urljoin, urlparse

def get_real_url(response, url):
    if re.search(r'^https?', url):
        return url
    elif re.search(r'^\/\/', url):
        u = urlparse(response.url)
        return u.scheme + url
    return urljoin(response.url, url)

class ConfigSpider(scrapy.Spider):
    name = 'config_spider'

    def start_requests(self):
        yield scrapy.Request(url='https://www.autohome.com.cn/all/', callback=self.parse_list)

    def parse_list(self, response):
        prev_item = response.meta.get('item')
        for elem in response.css('ul.article > li'):
            item = Item()
            item['title'] = elem.css('li > a > h3::text').extract_first()
            item['url'] = elem.css('li > a::attr("href")').extract_first()
            item['abstract'] = elem.css('li > a > p::text').extract_first()
            item['time'] = elem.css('li > a .fn-left::text').extract_first()
            item['views'] = elem.css('li > a .fn-right > em:first-child::text').extract_first()
            item['comments'] = elem.css('li > a .fn-right > em:last-child::text').extract_first()
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            yield item
        next_url = response.css('a.page-item-next::attr("href")').extract_first()
        yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item})
1  backend/app/spiders/autohome_config/md5.txt  Executable file
@@ -0,0 +1 @@
d784a11085e298eaf344eadc3a3e9411
11  backend/app/spiders/autohome_config/scrapy.cfg  Executable file
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = config_spider.settings

[deploy]
#url = http://localhost:6800/
project = config_spider
39  backend/app/spiders/baidu_config/Spiderfile  Executable file
@@ -0,0 +1,39 @@
name: "baidu_config"
display_name: "百度搜索(可配置)"
remark: "百度搜索Crawlab,列表+分页"
type: "configurable"
col: "results_baidu_config"
engine: scrapy
start_url: http://www.baidu.com/s?wd=crawlab
start_stage: list
stages:
- name: list
  is_list: true
  list_css: ".result.c-container"
  list_xpath: ""
  page_css: "a.n"
  page_xpath: ""
  page_attr: href
  fields:
  - name: title
    css: ""
    xpath: .//h3/a
    attr: ""
    next_stage: ""
    remark: ""
  - name: url
    css: ""
    xpath: .//h3/a
    attr: href
    next_stage: ""
    remark: ""
  - name: abstract
    css: ""
    xpath: .//*[@class="c-abstract"]
    attr: ""
    next_stage: ""
    remark: ""
settings:
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
    like Gecko) Chrome/78.0.3904.108 Safari/537.36
0  backend/app/spiders/baidu_config/config_spider/__init__.py  Executable file
18  backend/app/spiders/baidu_config/config_spider/items.py  Executable file
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Item(scrapy.Item):
    _id = scrapy.Field()
    task_id = scrapy.Field()
    ts = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    abstract = scrapy.Field()
103  backend/app/spiders/baidu_config/config_spider/middlewares.py  Executable file
@@ -0,0 +1,103 @@
(Identical to backend/app/spiders/amazon_config/config_spider/middlewares.py above: the stock Scrapy template defining ConfigSpiderSpiderMiddleware and ConfigSpiderDownloaderMiddleware.)
27  backend/app/spiders/baidu_config/config_spider/pipelines.py  Executable file
@@ -0,0 +1,27 @@
(Identical to backend/app/spiders/amazon_config/config_spider/pipelines.py above: ConfigSpiderPipeline writing items to MongoDB via the CRAWLAB_MONGO_* environment variables.)
111  backend/app/spiders/baidu_config/config_spider/settings.py  Executable file
@@ -0,0 +1,111 @@
(Identical to backend/app/spiders/amazon_config/config_spider/settings.py above, including the CRAWLAB_SETTING_* environment-variable loop.)
4  backend/app/spiders/baidu_config/config_spider/spiders/__init__.py  Executable file
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
35  backend/app/spiders/baidu_config/config_spider/spiders/spider.py  Executable file
@@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item
from urllib.parse import urljoin, urlparse

def get_real_url(response, url):
    if re.search(r'^https?', url):
        return url
    elif re.search(r'^\/\/', url):
        u = urlparse(response.url)
        return u.scheme + url
    return urljoin(response.url, url)

class ConfigSpider(scrapy.Spider):
    name = 'config_spider'

    def start_requests(self):
        yield scrapy.Request(url='http://www.baidu.com/s?wd=crawlab', callback=self.parse_list)

    def parse_list(self, response):
        prev_item = response.meta.get('item')
        for elem in response.css('.result.c-container'):
            item = Item()
            item['title'] = elem.xpath('string(.//h3/a)').extract_first()
            item['url'] = elem.xpath('.//h3/a/@href').extract_first()
            item['abstract'] = elem.xpath('string(.//*[@class="c-abstract"])').extract_first()
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            yield item
        next_url = response.css('a.n::attr("href")').extract_first()
        yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item})
1  backend/app/spiders/baidu_config/md5.txt  Executable file
@@ -0,0 +1 @@
ba25f6f3567b256473d3f0ec6af783fd
11  backend/app/spiders/baidu_config/scrapy.cfg  Executable file
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = config_spider.settings

[deploy]
#url = http://localhost:6800/
project = config_spider
6  backend/app/spiders/bing_general/Spiderfile  Executable file
@@ -0,0 +1,6 @@
name: "bing_general"
display_name: "必应搜索 (通用)"
remark: "必应搜索 Crawlab,列表+分页"
col: "results_bing_general"
type: "customized"
cmd: "python bing_spider.py"
41  backend/app/spiders/bing_general/bing_spider.py  Executable file
@@ -0,0 +1,41 @@
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse
import re
from crawlab import save_item

s = requests.Session()

def get_real_url(response, url):
    if re.search(r'^https?', url):
        return url
    elif re.search(r'^\/\/', url):
        u = urlparse(response.url)
        return u.scheme + url
    return urljoin(response.url, url)

def start_requests():
    for i in range(0, 9):
        fr = 'PERE' if not i else 'MORE'
        url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}'
        request_page(url)

def request_page(url):
    print(f'requesting {url}')
    r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'})
    parse_list(r)

def parse_list(response):
    soup = bs(response.content.decode('utf-8'))
    for el in list(soup.select('#b_results > li')):
        try:
            save_item({
                'title': el.select_one('h2').text,
                'url': el.select_one('h2 a').attrs.get('href'),
                'abstract': el.select_one('.b_caption p').text,
            })
        except:
            pass

if __name__ == '__main__':
    start_requests()
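parse_list above silently drops any result whose h2, link, or abstract is missing because of the bare except, and bs() is called without an explicit parser, which makes the output depend on whichever parser BeautifulSoup happens to pick. A hedged variant using the same crawlab.save_item helper (sketch only, not part of the commit):

# Hedged variant sketch: explicit parser and narrower error handling,
# assuming the same bs alias and crawlab.save_item helper imported above.
def parse_list(response):
    soup = bs(response.content.decode('utf-8'), 'html.parser')
    for el in soup.select('#b_results > li'):
        title = el.select_one('h2')
        link = el.select_one('h2 a')
        abstract = el.select_one('.b_caption p')
        if not (title and link):
            continue  # skip ads/widgets that are not ordinary results
        save_item({
            'title': title.text,
            'url': link.attrs.get('href'),
            'abstract': abstract.text if abstract else '',
        })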
1  backend/app/spiders/bing_general/md5.txt  Executable file
@@ -0,0 +1 @@
cf295b694a20c99c4857f838aa0402a7
5  backend/app/spiders/chinaz/Spiderfile  Executable file
@@ -0,0 +1,5 @@
name: "chinaz"
display_name: "站长之家 (Scrapy)"
col: "results_chinaz"
type: "customized"
cmd: "scrapy crawl chinaz_spider"
0  backend/app/spiders/chinaz/chinaz/__init__.py  Executable file
21  backend/app/spiders/chinaz/chinaz/items.py  Executable file
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ChinazItem(scrapy.Item):
    # define the fields for your item here like:
    _id = scrapy.Field()
    task_id = scrapy.Field()
    name = scrapy.Field()
    domain = scrapy.Field()
    description = scrapy.Field()
    rank = scrapy.Field()
    main_category = scrapy.Field()
    category = scrapy.Field()
    location = scrapy.Field()
103  backend/app/spiders/chinaz/chinaz/middlewares.py  Executable file
@@ -0,0 +1,103 @@
(The stock Scrapy middleware template, matching backend/app/spiders/amazon_config/config_spider/middlewares.py above except that the classes are named ChinazSpiderMiddleware and ChinazDownloaderMiddleware and the comments link to doc.scrapy.org.)
7  backend/app/spiders/chinaz/chinaz/pipelines.py  Executable file
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
90  backend/app/spiders/chinaz/chinaz/settings.py  Executable file
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for chinaz project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'chinaz'

SPIDER_MODULES = ['chinaz.spiders']
NEWSPIDER_MODULE = 'chinaz.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'chinaz (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'chinaz.middlewares.ChinazSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'chinaz.middlewares.ChinazDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'crawlab.pipelines.CrawlabMongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
4  backend/app/spiders/chinaz/chinaz/spiders/__init__.py  Executable file
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
63
backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py
Executable file
63
backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py
Executable file
@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
import scrapy
from chinaz.items import ChinazItem


class ChinazSpiderSpider(scrapy.Spider):
    name = 'chinaz_spider'
    allowed_domains = ['chinaz.com']
    start_urls = ['http://top.chinaz.com/hangye/']

    def parse(self, response):
        for item in response.css('.listCentent > li'):
            name = item.css('h3.rightTxtHead > a::text').extract_first()
            href = item.css('h3.rightTxtHead > a::attr("href")').extract_first()
            domain = item.css('h3.rightTxtHead > span::text').extract_first()
            description = item.css('p.RtCInfo::text').extract_first()
            rank = item.css('.RtCRateCent > strong::text').extract_first()
            rank = int(rank)
            item = ChinazItem(
                _id=domain,
                name=name,
                domain=domain,
                description=description,
                rank=rank,
            )
            yield scrapy.Request(
                url='http://top.chinaz.com' + href,
                callback=self.parse_item,
                meta={
                    'item': item
                }
            )

        # pagination
        a_list = response.css('.ListPageWrap > a::attr("href")').extract()
        url = 'http://top.chinaz.com/hangye/' + a_list[-1]
        yield scrapy.Request(url=url, callback=self.parse)

    def parse_item(self, response):
        item = response.meta['item']

        # category info extraction
        arr = response.css('.TopMainTag-show .SimSun')
        res1 = arr[0].css('a::text').extract()
        main_category = res1[0]
        if len(res1) == 1:
            category = '其他'
        else:
            category = res1[1]

        # location info extraction
        res2 = arr[1].css('a::text').extract()
        if len(res2) > 0:
            location = res2[0]
        else:
            location = '其他'

        # assign values to item
        item['main_category'] = main_category
        item['category'] = category
        item['location'] = location

        yield item
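The parse callback above pairs list extraction with a single pagination request: it takes the last href in .ListPageWrap (the "next page" link) and re-enters parse, relying on Scrapy's default duplicate filter to stop the loop once the pager points back at an already-seen URL. A minimal sketch of that pattern, reusing the same selectors but with everything else illustrative:

import scrapy


class PaginationSketchSpider(scrapy.Spider):
    # Sketch of the "follow the last pager link" pattern; the CSS class and
    # start URL come from the spider above, the empty-pager guard is an assumption.
    name = 'pagination_sketch'
    start_urls = ['http://top.chinaz.com/hangye/']

    def parse(self, response):
        # ... item extraction would go here ...
        hrefs = response.css('.ListPageWrap > a::attr("href")').extract()
        if hrefs:
            # response.follow resolves relative hrefs; the dupefilter drops the
            # request once the final page's pager links to a URL already crawled.
            yield response.follow(hrefs[-1], callback=self.parse)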
1
backend/app/spiders/chinaz/md5.txt
Executable file
1
backend/app/spiders/chinaz/md5.txt
Executable file
@@ -0,0 +1 @@
|
||||
1976593e49bf0238602ce35d051bd137
|
||||
11
backend/app/spiders/chinaz/scrapy.cfg
Executable file
11
backend/app/spiders/chinaz/scrapy.cfg
Executable file
@@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = chinaz.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = chinaz
|
||||
60
backend/app/spiders/csdn_config/Spiderfile
Executable file
60
backend/app/spiders/csdn_config/Spiderfile
Executable file
@@ -0,0 +1,60 @@
name: "csdn_config"
display_name: "CSDN(可配置)"
remark: "CSDN Crawlab 文章,列表+详情+分页"
type: "configurable"
col: "results_csdn_config"
engine: scrapy
start_url: https://so.csdn.net/so/search/s.do?q=crawlab
start_stage: list
stages:
- name: list
  is_list: true
  list_css: .search-list-con > .search-list
  list_xpath: ""
  page_css: a.btn-next
  page_xpath: ""
  page_attr: href
  fields:
  - name: url
    css: ""
    xpath: .//*[@class="limit_width"]/a
    attr: href
    next_stage: detail
    remark: ""
- name: detail
  is_list: false
  list_css: ""
  list_xpath: ""
  page_css: ""
  page_xpath: ""
  page_attr: ""
  fields:
  - name: content
    css: ""
    xpath: .//div[@id="content_views"]
    attr: ""
    next_stage: ""
    remark: ""
  - name: views
    css: .read-count
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: title
    css: .title-article
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: author
    css: .follow-nickName
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
settings:
  AUTOTHROTTLE_ENABLED: "false"
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
    like Gecko) Chrome/79.0.3945.117 Safari/537.36
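This Spiderfile drives Crawlab's configurable-spider engine: the list stage selects each .search-list node, the url field's next_stage: detail chains every result into the detail stage, and page_css: a.btn-next supplies pagination; the generated Scrapy code appears in config_spider/spiders/spider.py below. A quick local sanity check of such a file can be done with PyYAML (a hypothetical helper, not part of Crawlab):

import yaml  # assumes PyYAML is installed

with open('Spiderfile') as f:
    cfg = yaml.safe_load(f)

stage_names = {s['name'] for s in cfg['stages']}
for stage in cfg['stages']:
    for field in stage.get('fields', []):
        nxt = field.get('next_stage')
        # every non-empty next_stage should reference a defined stage
        assert not nxt or nxt in stage_names, 'unknown next_stage: %s' % nxt
print(cfg['start_stage'], '->', sorted(stage_names))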
0
backend/app/spiders/csdn_config/config_spider/__init__.py
Executable file
0
backend/app/spiders/csdn_config/config_spider/__init__.py
Executable file
20
backend/app/spiders/csdn_config/config_spider/items.py
Executable file
20
backend/app/spiders/csdn_config/config_spider/items.py
Executable file
@@ -0,0 +1,20 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
import scrapy
|
||||
|
||||
|
||||
class Item(scrapy.Item):
|
||||
_id = scrapy.Field()
|
||||
task_id = scrapy.Field()
|
||||
ts = scrapy.Field()
|
||||
url = scrapy.Field()
|
||||
content = scrapy.Field()
|
||||
views = scrapy.Field()
|
||||
title = scrapy.Field()
|
||||
author = scrapy.Field()
|
||||
|
||||
103
backend/app/spiders/csdn_config/config_spider/middlewares.py
Executable file
103
backend/app/spiders/csdn_config/config_spider/middlewares.py
Executable file
@@ -0,0 +1,103 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define here the models for your spider middleware
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class ConfigSpiderSpiderMiddleware(object):
|
||||
# Not all methods need to be defined. If a method is not defined,
|
||||
# scrapy acts as if the spider middleware does not modify the
|
||||
# passed objects.
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
# This method is used by Scrapy to create your spiders.
|
||||
s = cls()
|
||||
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||
return s
|
||||
|
||||
def process_spider_input(self, response, spider):
|
||||
# Called for each response that goes through the spider
|
||||
# middleware and into the spider.
|
||||
|
||||
# Should return None or raise an exception.
|
||||
return None
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
# Called with the results returned from the Spider, after
|
||||
# it has processed the response.
|
||||
|
||||
# Must return an iterable of Request, dict or Item objects.
|
||||
for i in result:
|
||||
yield i
|
||||
|
||||
def process_spider_exception(self, response, exception, spider):
|
||||
# Called when a spider or process_spider_input() method
|
||||
# (from other spider middleware) raises an exception.
|
||||
|
||||
# Should return either None or an iterable of Request, dict
|
||||
# or Item objects.
|
||||
pass
|
||||
|
||||
def process_start_requests(self, start_requests, spider):
|
||||
# Called with the start requests of the spider, and works
|
||||
# similarly to the process_spider_output() method, except
|
||||
# that it doesn’t have a response associated.
|
||||
|
||||
# Must return only requests (not items).
|
||||
for r in start_requests:
|
||||
yield r
|
||||
|
||||
def spider_opened(self, spider):
|
||||
spider.logger.info('Spider opened: %s' % spider.name)
|
||||
|
||||
|
||||
class ConfigSpiderDownloaderMiddleware(object):
|
||||
# Not all methods need to be defined. If a method is not defined,
|
||||
# scrapy acts as if the downloader middleware does not modify the
|
||||
# passed objects.
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
# This method is used by Scrapy to create your spiders.
|
||||
s = cls()
|
||||
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||
return s
|
||||
|
||||
def process_request(self, request, spider):
|
||||
# Called for each request that goes through the downloader
|
||||
# middleware.
|
||||
|
||||
# Must either:
|
||||
# - return None: continue processing this request
|
||||
# - or return a Response object
|
||||
# - or return a Request object
|
||||
# - or raise IgnoreRequest: process_exception() methods of
|
||||
# installed downloader middleware will be called
|
||||
return None
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
# Called with the response returned from the downloader.
|
||||
|
||||
# Must either;
|
||||
# - return a Response object
|
||||
# - return a Request object
|
||||
# - or raise IgnoreRequest
|
||||
return response
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
# Called when a download handler or a process_request()
|
||||
# (from other downloader middleware) raises an exception.
|
||||
|
||||
# Must either:
|
||||
# - return None: continue processing this exception
|
||||
# - return a Response object: stops process_exception() chain
|
||||
# - return a Request object: stops process_exception() chain
|
||||
pass
|
||||
|
||||
def spider_opened(self, spider):
|
||||
spider.logger.info('Spider opened: %s' % spider.name)
|
||||
27
backend/app/spiders/csdn_config/config_spider/pipelines.py
Executable file
27
backend/app/spiders/csdn_config/config_spider/pipelines.py
Executable file
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import os
from pymongo import MongoClient

mongo = MongoClient(
    host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
    port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017),
    username=os.environ.get('CRAWLAB_MONGO_USERNAME'),
    password=os.environ.get('CRAWLAB_MONGO_PASSWORD'),
    authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin'
)
db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
task_id = os.environ.get('CRAWLAB_TASK_ID')


class ConfigSpiderPipeline(object):
    def process_item(self, item, spider):
        item['task_id'] = task_id
        if col is not None:
            col.save(item)
        return item
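Note that Collection.save(), used in process_item above, was deprecated in PyMongo 3.x and removed in 4.0. A sketch of an equivalent insert-or-replace for newer drivers, reusing the module-level col and task_id defined above (an editorial variant, not the committed code):

from bson import ObjectId  # ships with pymongo


class ConfigSpiderPipelineV4(object):
    # Same behaviour as the pipeline above, expressed without the removed save().
    def process_item(self, item, spider):
        item['task_id'] = task_id
        data = dict(item)
        data.setdefault('_id', ObjectId())  # save() used to generate an _id implicitly
        col.replace_one({'_id': data['_id']}, data, upsert=True)
        return item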
111
backend/app/spiders/csdn_config/config_spider/settings.py
Executable file
111
backend/app/spiders/csdn_config/config_spider/settings.py
Executable file
@@ -0,0 +1,111 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
|
||||
# Scrapy settings for config_spider project
|
||||
#
|
||||
# For simplicity, this file contains only settings considered important or
|
||||
# commonly used. You can find more settings consulting the documentation:
|
||||
#
|
||||
# https://docs.scrapy.org/en/latest/topics/settings.html
|
||||
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
BOT_NAME = 'Crawlab Configurable Spider'
|
||||
|
||||
SPIDER_MODULES = ['config_spider.spiders']
|
||||
NEWSPIDER_MODULE = 'config_spider.spiders'
|
||||
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
USER_AGENT = 'Crawlab Spider'
|
||||
|
||||
# Obey robots.txt rules
|
||||
ROBOTSTXT_OBEY = True
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
#CONCURRENT_REQUESTS = 32
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
#DOWNLOAD_DELAY = 3
|
||||
# The download delay setting will honor only one of:
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
# Disable cookies (enabled by default)
|
||||
#COOKIES_ENABLED = False
|
||||
|
||||
# Disable Telnet Console (enabled by default)
|
||||
#TELNETCONSOLE_ENABLED = False
|
||||
|
||||
# Override the default request headers:
|
||||
#DEFAULT_REQUEST_HEADERS = {
|
||||
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
# 'Accept-Language': 'en',
|
||||
#}
|
||||
|
||||
# Enable or disable spider middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
#SPIDER_MIDDLEWARES = {
|
||||
# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543,
|
||||
#}
|
||||
|
||||
# Enable or disable downloader middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
#DOWNLOADER_MIDDLEWARES = {
|
||||
# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543,
|
||||
#}
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||
#EXTENSIONS = {
|
||||
# 'scrapy.extensions.telnet.TelnetConsole': None,
|
||||
#}
|
||||
|
||||
# Configure item pipelines
|
||||
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
ITEM_PIPELINES = {
|
||||
'config_spider.pipelines.ConfigSpiderPipeline': 300,
|
||||
}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
||||
#AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
#AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
#AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
#AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
#HTTPCACHE_ENABLED = True
|
||||
#HTTPCACHE_EXPIRATION_SECS = 0
|
||||
#HTTPCACHE_DIR = 'httpcache'
|
||||
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]:
    setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '')
    setting_value = os.environ.get(setting_env_name)
    if setting_value.lower() == 'true':
        setting_value = True
    elif setting_value.lower() == 'false':
        setting_value = False
    elif re.search(r'^\d+$', setting_value) is not None:
        setting_value = int(setting_value)
    elif re.search(r'^\{.*\}$', setting_value.strip()) is not None:
        setting_value = json.loads(setting_value)
    elif re.search(r'^\[.*\]$', setting_value.strip()) is not None:
        setting_value = json.loads(setting_value)
    else:
        pass
    locals()[setting_name] = setting_value
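The loop above is how Crawlab injects the Spiderfile's settings block into Scrapy: every CRAWLAB_SETTING_* environment variable is coerced (bool, int, or JSON) and assigned into the module namespace; locals() at module level is the module's global namespace, so the assignment works here, though it would not inside a function. For illustration only (example values, assuming they are set before the process starts):

import os

os.environ['CRAWLAB_SETTING_ROBOTSTXT_OBEY'] = 'false'        # -> ROBOTSTXT_OBEY = False
os.environ['CRAWLAB_SETTING_DOWNLOAD_DELAY'] = '3'            # -> DOWNLOAD_DELAY = 3
os.environ['CRAWLAB_SETTING_DEFAULT_REQUEST_HEADERS'] = '{"Accept-Language": "en"}'
# -> DEFAULT_REQUEST_HEADERS = {'Accept-Language': 'en'} (parsed as JSON by the loop)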
4
backend/app/spiders/csdn_config/config_spider/spiders/__init__.py
Executable file
4
backend/app/spiders/csdn_config/config_spider/spiders/__init__.py
Executable file
@@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
41
backend/app/spiders/csdn_config/config_spider/spiders/spider.py
Executable file
41
backend/app/spiders/csdn_config/config_spider/spiders/spider.py
Executable file
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item
from urllib.parse import urljoin, urlparse


def get_real_url(response, url):
    if re.search(r'^https?', url):
        return url
    elif re.search(r'^\/\/', url):
        u = urlparse(response.url)
        return u.scheme + url
    return urljoin(response.url, url)


class ConfigSpider(scrapy.Spider):
    name = 'config_spider'

    def start_requests(self):
        yield scrapy.Request(url='https://so.csdn.net/so/search/s.do?q=crawlab', callback=self.parse_list)

    def parse_list(self, response):
        prev_item = response.meta.get('item')
        for elem in response.css('.search-list-con > .search-list'):
            item = Item()
            item['url'] = elem.xpath('.//*[@class="limit_width"]/a/@href').extract_first()
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item})
        next_url = response.css('a.btn-next::attr("href")').extract_first()
        yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item})

    def parse_detail(self, response):
        item = Item() if response.meta.get('item') is None else response.meta.get('item')
        item['content'] = response.xpath('string(.//div[@id="content_views"])').extract_first()
        item['views'] = response.css('.read-count::text').extract_first()
        item['title'] = response.css('.title-article::text').extract_first()
        item['author'] = response.css('.follow-nickName::text').extract_first()
        yield item
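get_real_url normalises the three link forms the generated spider meets: absolute, protocol-relative (//host/path), and path-relative. The protocol-relative branch returns u.scheme + url, which yields https//host/path without the colon; a sketch of the presumably intended behaviour follows (an editorial suggestion, not the project's code) — in practice urljoin alone already covers all three cases.

import re
from urllib.parse import urljoin, urlparse


def get_real_url_fixed(base_url, url):
    # base_url stands in for response.url so the sketch stays self-contained
    if re.search(r'^https?', url):
        return url                                       # already absolute
    if re.search(r'^\/\/', url):
        return urlparse(base_url).scheme + ':' + url     # keep the ':' after the scheme
    return urljoin(base_url, url)                        # resolve relative paths


assert get_real_url_fixed('https://so.csdn.net/so/search/s.do', '//blog.csdn.net/x') == 'https://blog.csdn.net/x'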
1
backend/app/spiders/csdn_config/md5.txt
Executable file
1
backend/app/spiders/csdn_config/md5.txt
Executable file
@@ -0,0 +1 @@
|
||||
b6889c74e006a5e619b525d84db62ffd
|
||||
11
backend/app/spiders/csdn_config/scrapy.cfg
Executable file
11
backend/app/spiders/csdn_config/scrapy.cfg
Executable file
@@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = config_spider.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = config_spider
|
||||
57
backend/app/spiders/douban_config/Spiderfile
Executable file
57
backend/app/spiders/douban_config/Spiderfile
Executable file
@@ -0,0 +1,57 @@
|
||||
name: "douban_config"
|
||||
display_name: "豆瓣读书(可配置)"
|
||||
remark: "豆瓣读书新书推荐,列表"
|
||||
type: "configurable"
|
||||
col: "results_douban_config"
|
||||
engine: scrapy
|
||||
start_url: https://book.douban.com/latest
|
||||
start_stage: list
|
||||
stages:
|
||||
- name: list
|
||||
is_list: true
|
||||
list_css: ul.cover-col-4 > li
|
||||
list_xpath: ""
|
||||
page_css: ""
|
||||
page_xpath: ""
|
||||
page_attr: ""
|
||||
fields:
|
||||
- name: title
|
||||
css: h2 > a
|
||||
xpath: ""
|
||||
attr: ""
|
||||
next_stage: ""
|
||||
remark: ""
|
||||
- name: url
|
||||
css: h2 > a
|
||||
xpath: ""
|
||||
attr: href
|
||||
next_stage: ""
|
||||
remark: ""
|
||||
- name: img
|
||||
css: a.cover img
|
||||
xpath: ""
|
||||
attr: src
|
||||
next_stage: ""
|
||||
remark: ""
|
||||
- name: rating
|
||||
css: p.rating > .color-lightgray
|
||||
xpath: ""
|
||||
attr: ""
|
||||
next_stage: ""
|
||||
remark: ""
|
||||
- name: abstract
|
||||
css: p:last-child
|
||||
xpath: ""
|
||||
attr: ""
|
||||
next_stage: ""
|
||||
remark: ""
|
||||
- name: info
|
||||
css: .color-gray
|
||||
xpath: ""
|
||||
attr: ""
|
||||
next_stage: ""
|
||||
remark: ""
|
||||
settings:
|
||||
ROBOTSTXT_OBEY: "false"
|
||||
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
|
||||
like Gecko) Chrome/78.0.3904.108 Safari/537.36
|
||||
0
backend/app/spiders/douban_config/config_spider/__init__.py
Executable file
0
backend/app/spiders/douban_config/config_spider/__init__.py
Executable file
21
backend/app/spiders/douban_config/config_spider/items.py
Executable file
21
backend/app/spiders/douban_config/config_spider/items.py
Executable file
@@ -0,0 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
import scrapy
|
||||
|
||||
|
||||
class Item(scrapy.Item):
|
||||
_id = scrapy.Field()
|
||||
task_id = scrapy.Field()
|
||||
ts = scrapy.Field()
|
||||
title = scrapy.Field()
|
||||
url = scrapy.Field()
|
||||
img = scrapy.Field()
|
||||
rating = scrapy.Field()
|
||||
abstract = scrapy.Field()
|
||||
info = scrapy.Field()
|
||||
|
||||
103
backend/app/spiders/douban_config/config_spider/middlewares.py
Executable file
103
backend/app/spiders/douban_config/config_spider/middlewares.py
Executable file
@@ -0,0 +1,103 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define here the models for your spider middleware
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class ConfigSpiderSpiderMiddleware(object):
|
||||
# Not all methods need to be defined. If a method is not defined,
|
||||
# scrapy acts as if the spider middleware does not modify the
|
||||
# passed objects.
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
# This method is used by Scrapy to create your spiders.
|
||||
s = cls()
|
||||
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||
return s
|
||||
|
||||
def process_spider_input(self, response, spider):
|
||||
# Called for each response that goes through the spider
|
||||
# middleware and into the spider.
|
||||
|
||||
# Should return None or raise an exception.
|
||||
return None
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
# Called with the results returned from the Spider, after
|
||||
# it has processed the response.
|
||||
|
||||
# Must return an iterable of Request, dict or Item objects.
|
||||
for i in result:
|
||||
yield i
|
||||
|
||||
def process_spider_exception(self, response, exception, spider):
|
||||
# Called when a spider or process_spider_input() method
|
||||
# (from other spider middleware) raises an exception.
|
||||
|
||||
# Should return either None or an iterable of Request, dict
|
||||
# or Item objects.
|
||||
pass
|
||||
|
||||
def process_start_requests(self, start_requests, spider):
|
||||
# Called with the start requests of the spider, and works
|
||||
# similarly to the process_spider_output() method, except
|
||||
# that it doesn’t have a response associated.
|
||||
|
||||
# Must return only requests (not items).
|
||||
for r in start_requests:
|
||||
yield r
|
||||
|
||||
def spider_opened(self, spider):
|
||||
spider.logger.info('Spider opened: %s' % spider.name)
|
||||
|
||||
|
||||
class ConfigSpiderDownloaderMiddleware(object):
|
||||
# Not all methods need to be defined. If a method is not defined,
|
||||
# scrapy acts as if the downloader middleware does not modify the
|
||||
# passed objects.
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
# This method is used by Scrapy to create your spiders.
|
||||
s = cls()
|
||||
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||
return s
|
||||
|
||||
def process_request(self, request, spider):
|
||||
# Called for each request that goes through the downloader
|
||||
# middleware.
|
||||
|
||||
# Must either:
|
||||
# - return None: continue processing this request
|
||||
# - or return a Response object
|
||||
# - or return a Request object
|
||||
# - or raise IgnoreRequest: process_exception() methods of
|
||||
# installed downloader middleware will be called
|
||||
return None
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
# Called with the response returned from the downloader.
|
||||
|
||||
# Must either;
|
||||
# - return a Response object
|
||||
# - return a Request object
|
||||
# - or raise IgnoreRequest
|
||||
return response
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
# Called when a download handler or a process_request()
|
||||
# (from other downloader middleware) raises an exception.
|
||||
|
||||
# Must either:
|
||||
# - return None: continue processing this exception
|
||||
# - return a Response object: stops process_exception() chain
|
||||
# - return a Request object: stops process_exception() chain
|
||||
pass
|
||||
|
||||
def spider_opened(self, spider):
|
||||
spider.logger.info('Spider opened: %s' % spider.name)
|
||||
27
backend/app/spiders/douban_config/config_spider/pipelines.py
Executable file
27
backend/app/spiders/douban_config/config_spider/pipelines.py
Executable file
@@ -0,0 +1,27 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
|
||||
import os
|
||||
from pymongo import MongoClient
|
||||
|
||||
mongo = MongoClient(
|
||||
host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
|
||||
port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017),
|
||||
username=os.environ.get('CRAWLAB_MONGO_USERNAME'),
|
||||
password=os.environ.get('CRAWLAB_MONGO_PASSWORD'),
|
||||
authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin'
|
||||
)
|
||||
db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
|
||||
col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
|
||||
task_id = os.environ.get('CRAWLAB_TASK_ID')
|
||||
|
||||
class ConfigSpiderPipeline(object):
|
||||
def process_item(self, item, spider):
|
||||
item['task_id'] = task_id
|
||||
if col is not None:
|
||||
col.save(item)
|
||||
return item
|
||||
111
backend/app/spiders/douban_config/config_spider/settings.py
Executable file
111
backend/app/spiders/douban_config/config_spider/settings.py
Executable file
@@ -0,0 +1,111 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
|
||||
# Scrapy settings for config_spider project
|
||||
#
|
||||
# For simplicity, this file contains only settings considered important or
|
||||
# commonly used. You can find more settings consulting the documentation:
|
||||
#
|
||||
# https://docs.scrapy.org/en/latest/topics/settings.html
|
||||
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
BOT_NAME = 'Crawlab Configurable Spider'
|
||||
|
||||
SPIDER_MODULES = ['config_spider.spiders']
|
||||
NEWSPIDER_MODULE = 'config_spider.spiders'
|
||||
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
USER_AGENT = 'Crawlab Spider'
|
||||
|
||||
# Obey robots.txt rules
|
||||
ROBOTSTXT_OBEY = True
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
#CONCURRENT_REQUESTS = 32
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
#DOWNLOAD_DELAY = 3
|
||||
# The download delay setting will honor only one of:
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
# Disable cookies (enabled by default)
|
||||
#COOKIES_ENABLED = False
|
||||
|
||||
# Disable Telnet Console (enabled by default)
|
||||
#TELNETCONSOLE_ENABLED = False
|
||||
|
||||
# Override the default request headers:
|
||||
#DEFAULT_REQUEST_HEADERS = {
|
||||
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
# 'Accept-Language': 'en',
|
||||
#}
|
||||
|
||||
# Enable or disable spider middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
#SPIDER_MIDDLEWARES = {
|
||||
# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543,
|
||||
#}
|
||||
|
||||
# Enable or disable downloader middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
#DOWNLOADER_MIDDLEWARES = {
|
||||
# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543,
|
||||
#}
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||
#EXTENSIONS = {
|
||||
# 'scrapy.extensions.telnet.TelnetConsole': None,
|
||||
#}
|
||||
|
||||
# Configure item pipelines
|
||||
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
ITEM_PIPELINES = {
|
||||
'config_spider.pipelines.ConfigSpiderPipeline': 300,
|
||||
}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
||||
#AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
#AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
#AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
#AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
#HTTPCACHE_ENABLED = True
|
||||
#HTTPCACHE_EXPIRATION_SECS = 0
|
||||
#HTTPCACHE_DIR = 'httpcache'
|
||||
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
||||
|
||||
for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]:
|
||||
setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '')
|
||||
setting_value = os.environ.get(setting_env_name)
|
||||
if setting_value.lower() == 'true':
|
||||
setting_value = True
|
||||
elif setting_value.lower() == 'false':
|
||||
setting_value = False
|
||||
elif re.search(r'^\d+$', setting_value) is not None:
|
||||
setting_value = int(setting_value)
|
||||
elif re.search(r'^\{.*\}$', setting_value.strip()) is not None:
|
||||
setting_value = json.loads(setting_value)
|
||||
elif re.search(r'^\[.*\]$', setting_value.strip()) is not None:
|
||||
setting_value = json.loads(setting_value)
|
||||
else:
|
||||
pass
|
||||
locals()[setting_name] = setting_value
|
||||
|
||||
4
backend/app/spiders/douban_config/config_spider/spiders/__init__.py
Executable file
4
backend/app/spiders/douban_config/config_spider/spiders/__init__.py
Executable file
@@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
36
backend/app/spiders/douban_config/config_spider/spiders/spider.py
Executable file
36
backend/app/spiders/douban_config/config_spider/spiders/spider.py
Executable file
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item
from urllib.parse import urljoin, urlparse


def get_real_url(response, url):
    if re.search(r'^https?', url):
        return url
    elif re.search(r'^\/\/', url):
        u = urlparse(response.url)
        return u.scheme + url
    return urljoin(response.url, url)


class ConfigSpider(scrapy.Spider):
    name = 'config_spider'

    def start_requests(self):
        yield scrapy.Request(url='https://book.douban.com/latest', callback=self.parse_list)

    def parse_list(self, response):
        prev_item = response.meta.get('item')
        for elem in response.css('ul.cover-col-4 > li'):
            item = Item()
            item['title'] = elem.css('h2 > a::text').extract_first()
            item['url'] = elem.css('h2 > a::attr("href")').extract_first()
            item['img'] = elem.css('a.cover img::attr("src")').extract_first()
            item['rating'] = elem.css('p.rating > .color-lightgray::text').extract_first()
            item['abstract'] = elem.css('p:last-child::text').extract_first()
            item['info'] = elem.css('.color-gray::text').extract_first()
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            yield item
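Both generated spiders copy prev_item into each newly built item so that fields collected in an earlier stage survive into later ones; in this single-stage douban spider prev_item is always None, but the same merge is what carries the list-stage url into the detail stage of the CSDN spider above. The merge semantics, shown with plain dicts (hypothetical values, for illustration only):

list_stage_item = {'url': 'https://example.com/post/1'}
detail_stage_item = {'title': 'A post', 'views': '123'}

# Earlier-stage values are copied onto the later item, overwriting same-named keys.
for key, value in list_stage_item.items():
    detail_stage_item[key] = value

assert detail_stage_item['url'] == 'https://example.com/post/1'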
1
backend/app/spiders/douban_config/md5.txt
Executable file
1
backend/app/spiders/douban_config/md5.txt
Executable file
@@ -0,0 +1 @@
|
||||
4d59a6c83b0e125d5321beae86bb93ce
|
||||
11
backend/app/spiders/douban_config/scrapy.cfg
Executable file
11
backend/app/spiders/douban_config/scrapy.cfg
Executable file
@@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = config_spider.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = config_spider
|
||||
5
backend/app/spiders/jd/Spiderfile
Executable file
5
backend/app/spiders/jd/Spiderfile
Executable file
@@ -0,0 +1,5 @@
name: "jd"
display_name: "京东 (Scrapy)"
col: "results_jd"
type: "customized"
cmd: "scrapy crawl jd_spider"
0
backend/app/spiders/jd/jd/__init__.py
Executable file
0
backend/app/spiders/jd/jd/__init__.py
Executable file
15
backend/app/spiders/jd/jd/items.py
Executable file
15
backend/app/spiders/jd/jd/items.py
Executable file
@@ -0,0 +1,15 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# https://doc.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
import scrapy
|
||||
|
||||
|
||||
class JdItem(scrapy.Item):
|
||||
# define the fields for your item here like:
|
||||
name = scrapy.Field()
|
||||
price = scrapy.Field()
|
||||
url = scrapy.Field()
|
||||
103
backend/app/spiders/jd/jd/middlewares.py
Executable file
103
backend/app/spiders/jd/jd/middlewares.py
Executable file
@@ -0,0 +1,103 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define here the models for your spider middleware
|
||||
#
|
||||
# See documentation in:
|
||||
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class JdSpiderMiddleware(object):
|
||||
# Not all methods need to be defined. If a method is not defined,
|
||||
# scrapy acts as if the spider middleware does not modify the
|
||||
# passed objects.
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
# This method is used by Scrapy to create your spiders.
|
||||
s = cls()
|
||||
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||
return s
|
||||
|
||||
def process_spider_input(self, response, spider):
|
||||
# Called for each response that goes through the spider
|
||||
# middleware and into the spider.
|
||||
|
||||
# Should return None or raise an exception.
|
||||
return None
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
# Called with the results returned from the Spider, after
|
||||
# it has processed the response.
|
||||
|
||||
# Must return an iterable of Request, dict or Item objects.
|
||||
for i in result:
|
||||
yield i
|
||||
|
||||
def process_spider_exception(self, response, exception, spider):
|
||||
# Called when a spider or process_spider_input() method
|
||||
# (from other spider middleware) raises an exception.
|
||||
|
||||
# Should return either None or an iterable of Response, dict
|
||||
# or Item objects.
|
||||
pass
|
||||
|
||||
def process_start_requests(self, start_requests, spider):
|
||||
# Called with the start requests of the spider, and works
|
||||
# similarly to the process_spider_output() method, except
|
||||
# that it doesn’t have a response associated.
|
||||
|
||||
# Must return only requests (not items).
|
||||
for r in start_requests:
|
||||
yield r
|
||||
|
||||
def spider_opened(self, spider):
|
||||
spider.logger.info('Spider opened: %s' % spider.name)
|
||||
|
||||
|
||||
class JdDownloaderMiddleware(object):
|
||||
# Not all methods need to be defined. If a method is not defined,
|
||||
# scrapy acts as if the downloader middleware does not modify the
|
||||
# passed objects.
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
# This method is used by Scrapy to create your spiders.
|
||||
s = cls()
|
||||
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||
return s
|
||||
|
||||
def process_request(self, request, spider):
|
||||
# Called for each request that goes through the downloader
|
||||
# middleware.
|
||||
|
||||
# Must either:
|
||||
# - return None: continue processing this request
|
||||
# - or return a Response object
|
||||
# - or return a Request object
|
||||
# - or raise IgnoreRequest: process_exception() methods of
|
||||
# installed downloader middleware will be called
|
||||
return None
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
# Called with the response returned from the downloader.
|
||||
|
||||
# Must either;
|
||||
# - return a Response object
|
||||
# - return a Request object
|
||||
# - or raise IgnoreRequest
|
||||
return response
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
# Called when a download handler or a process_request()
|
||||
# (from other downloader middleware) raises an exception.
|
||||
|
||||
# Must either:
|
||||
# - return None: continue processing this exception
|
||||
# - return a Response object: stops process_exception() chain
|
||||
# - return a Request object: stops process_exception() chain
|
||||
pass
|
||||
|
||||
def spider_opened(self, spider):
|
||||
spider.logger.info('Spider opened: %s' % spider.name)
|
||||
6
backend/app/spiders/jd/jd/pipelines.py
Executable file
6
backend/app/spiders/jd/jd/pipelines.py
Executable file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
90
backend/app/spiders/jd/jd/settings.py
Executable file
90
backend/app/spiders/jd/jd/settings.py
Executable file
@@ -0,0 +1,90 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Scrapy settings for jd project
|
||||
#
|
||||
# For simplicity, this file contains only settings considered important or
|
||||
# commonly used. You can find more settings consulting the documentation:
|
||||
#
|
||||
# https://doc.scrapy.org/en/latest/topics/settings.html
|
||||
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
BOT_NAME = 'jd'
|
||||
|
||||
SPIDER_MODULES = ['jd.spiders']
|
||||
NEWSPIDER_MODULE = 'jd.spiders'
|
||||
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
#USER_AGENT = 'jd (+http://www.yourdomain.com)'
|
||||
|
||||
# Obey robots.txt rules
|
||||
ROBOTSTXT_OBEY = False
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
#CONCURRENT_REQUESTS = 32
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
#DOWNLOAD_DELAY = 3
|
||||
# The download delay setting will honor only one of:
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
# Disable cookies (enabled by default)
|
||||
#COOKIES_ENABLED = False
|
||||
|
||||
# Disable Telnet Console (enabled by default)
|
||||
#TELNETCONSOLE_ENABLED = False
|
||||
|
||||
# Override the default request headers:
|
||||
#DEFAULT_REQUEST_HEADERS = {
|
||||
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
# 'Accept-Language': 'en',
|
||||
#}
|
||||
|
||||
# Enable or disable spider middlewares
|
||||
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
#SPIDER_MIDDLEWARES = {
|
||||
# 'jd.middlewares.JdSpiderMiddleware': 543,
|
||||
#}
|
||||
|
||||
# Enable or disable downloader middlewares
|
||||
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
#DOWNLOADER_MIDDLEWARES = {
|
||||
# 'jd.middlewares.JdDownloaderMiddleware': 543,
|
||||
#}
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://doc.scrapy.org/en/latest/topics/extensions.html
|
||||
#EXTENSIONS = {
|
||||
# 'scrapy.extensions.telnet.TelnetConsole': None,
|
||||
#}
|
||||
|
||||
# Configure item pipelines
|
||||
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
ITEM_PIPELINES = {
|
||||
'crawlab.pipelines.CrawlabMongoPipeline': 300,
|
||||
}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
|
||||
#AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
#AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
#AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
#AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
#HTTPCACHE_ENABLED = True
|
||||
#HTTPCACHE_EXPIRATION_SECS = 0
|
||||
#HTTPCACHE_DIR = 'httpcache'
|
||||
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
||||
4
backend/app/spiders/jd/jd/spiders/__init__.py
Executable file
4
backend/app/spiders/jd/jd/spiders/__init__.py
Executable file
@@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
21
backend/app/spiders/jd/jd/spiders/jd_spider.py
Executable file
21
backend/app/spiders/jd/jd/spiders/jd_spider.py
Executable file
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
import scrapy

from jd.items import JdItem


class JdSpiderSpider(scrapy.Spider):
    name = 'jd_spider'
    allowed_domains = ['jd.com']

    def start_requests(self):
        for i in range(1, 50):
            yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}')

    def parse(self, response):
        for el in response.css('.gl-item'):
            yield JdItem(
                url=el.css('.p-name > a::attr("href")').extract_first(),
                name=el.css('.p-name > a::attr("title")').extract_first(),
                price=float(el.css('.p-price i::text').extract_first()),
            )
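float(el.css('.p-price i::text').extract_first()) raises TypeError for any .gl-item without a price node, because extract_first() returns None, and that aborts the whole parse callback. A defensive variant (editorial sketch; falling back to None is an assumption, not the committed behaviour):

def parse_price(el):
    # el is a Scrapy selector for one .gl-item; the selector matches the spider above.
    raw = el.css('.p-price i::text').extract_first()
    if raw is None:
        return None
    try:
        return float(raw)
    except ValueError:  # e.g. placeholder text instead of a number
        return None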
1
backend/app/spiders/jd/md5.txt
Executable file
1
backend/app/spiders/jd/md5.txt
Executable file
@@ -0,0 +1 @@
|
||||
621486d31459514eb27a082d159d9b8c
|
||||
11
backend/app/spiders/jd/scrapy.cfg
Executable file
11
backend/app/spiders/jd/scrapy.cfg
Executable file
@@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = jd.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = jd
|
||||
5
backend/app/spiders/sinastock/Spiderfile
Executable file
5
backend/app/spiders/sinastock/Spiderfile
Executable file
@@ -0,0 +1,5 @@
name: "sinastock"
display_name: "新浪股票 (Scrapy)"
type: "customized"
col: "results_sinastock"
cmd: "scrapy crawl sinastock_spider"
1
backend/app/spiders/sinastock/md5.txt
Executable file
1
backend/app/spiders/sinastock/md5.txt
Executable file
@@ -0,0 +1 @@
|
||||
80bc091fa45ef4a85c9f1a66c81a4ed7
|
||||
11
backend/app/spiders/sinastock/scrapy.cfg
Executable file
11
backend/app/spiders/sinastock/scrapy.cfg
Executable file
@@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = sinastock.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = sinastock
|
||||
0
backend/app/spiders/sinastock/sinastock/__init__.py
Executable file
0
backend/app/spiders/sinastock/sinastock/__init__.py
Executable file
21
backend/app/spiders/sinastock/sinastock/items.py
Executable file
21
backend/app/spiders/sinastock/sinastock/items.py
Executable file
@@ -0,0 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# https://doc.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
import scrapy
|
||||
|
||||
|
||||
class NewsItem(scrapy.Item):
|
||||
# define the fields for your item here like:
|
||||
_id = scrapy.Field()
|
||||
title = scrapy.Field()
|
||||
ts_str = scrapy.Field()
|
||||
ts = scrapy.Field()
|
||||
url = scrapy.Field()
|
||||
text = scrapy.Field()
|
||||
task_id = scrapy.Field()
|
||||
source = scrapy.Field()
|
||||
stocks = scrapy.Field()
|
||||
103
backend/app/spiders/sinastock/sinastock/middlewares.py
Executable file
103
backend/app/spiders/sinastock/sinastock/middlewares.py
Executable file
@@ -0,0 +1,103 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define here the models for your spider middleware
|
||||
#
|
||||
# See documentation in:
|
||||
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class SinastockSpiderMiddleware(object):
|
||||
# Not all methods need to be defined. If a method is not defined,
|
||||
# scrapy acts as if the spider middleware does not modify the
|
||||
# passed objects.
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
# This method is used by Scrapy to create your spiders.
|
||||
s = cls()
|
||||
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||
return s
|
||||
|
||||
def process_spider_input(self, response, spider):
|
||||
# Called for each response that goes through the spider
|
||||
# middleware and into the spider.
|
||||
|
||||
# Should return None or raise an exception.
|
||||
return None
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
# Called with the results returned from the Spider, after
|
||||
# it has processed the response.
|
||||
|
||||
# Must return an iterable of Request, dict or Item objects.
|
||||
for i in result:
|
||||
yield i
|
||||
|
||||
def process_spider_exception(self, response, exception, spider):
|
||||
# Called when a spider or process_spider_input() method
|
||||
# (from other spider middleware) raises an exception.
|
||||
|
||||
# Should return either None or an iterable of Response, dict
|
||||
# or Item objects.
|
||||
pass
|
||||
|
||||
def process_start_requests(self, start_requests, spider):
|
||||
# Called with the start requests of the spider, and works
|
||||
# similarly to the process_spider_output() method, except
|
||||
# that it doesn’t have a response associated.
|
||||
|
||||
# Must return only requests (not items).
|
||||
for r in start_requests:
|
||||
yield r
|
||||
|
||||
def spider_opened(self, spider):
|
||||
spider.logger.info('Spider opened: %s' % spider.name)
|
||||
|
||||
|
||||
class SinastockDownloaderMiddleware(object):
|
||||
# Not all methods need to be defined. If a method is not defined,
|
||||
# scrapy acts as if the downloader middleware does not modify the
|
||||
# passed objects.
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
# This method is used by Scrapy to create your spiders.
|
||||
s = cls()
|
||||
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||
return s
|
||||
|
||||
def process_request(self, request, spider):
|
||||
# Called for each request that goes through the downloader
|
||||
# middleware.
|
||||
|
||||
# Must either:
|
||||
# - return None: continue processing this request
|
||||
# - or return a Response object
|
||||
# - or return a Request object
|
||||
# - or raise IgnoreRequest: process_exception() methods of
|
||||
# installed downloader middleware will be called
|
||||
return None
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
# Called with the response returned from the downloader.
|
||||
|
||||
# Must either;
|
||||
# - return a Response object
|
||||
# - return a Request object
|
||||
# - or raise IgnoreRequest
|
||||
return response
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
# Called when a download handler or a process_request()
|
||||
# (from other downloader middleware) raises an exception.
|
||||
|
||||
# Must either:
|
||||
# - return None: continue processing this exception
|
||||
# - return a Response object: stops process_exception() chain
|
||||
# - return a Request object: stops process_exception() chain
|
||||
pass
|
||||
|
||||
def spider_opened(self, spider):
|
||||
spider.logger.info('Spider opened: %s' % spider.name)
|
||||
6
backend/app/spiders/sinastock/sinastock/pipelines.py
Executable file
6
backend/app/spiders/sinastock/sinastock/pipelines.py
Executable file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
89
backend/app/spiders/sinastock/sinastock/settings.py
Executable file
89
backend/app/spiders/sinastock/sinastock/settings.py
Executable file
@@ -0,0 +1,89 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Scrapy settings for sinastock project
|
||||
#
|
||||
# For simplicity, this file contains only settings considered important or
|
||||
# commonly used. You can find more settings consulting the documentation:
|
||||
#
|
||||
# https://doc.scrapy.org/en/latest/topics/settings.html
|
||||
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
BOT_NAME = 'sinastock'
|
||||
|
||||
SPIDER_MODULES = ['sinastock.spiders']
|
||||
NEWSPIDER_MODULE = 'sinastock.spiders'
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
# USER_AGENT = 'sinastock (+http://www.yourdomain.com)'
|
||||
|
||||
# Obey robots.txt rules
|
||||
ROBOTSTXT_OBEY = True
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
# CONCURRENT_REQUESTS = 32
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
# DOWNLOAD_DELAY = 3
|
||||
# The download delay setting will honor only one of:
|
||||
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
# CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
# Disable cookies (enabled by default)
|
||||
# COOKIES_ENABLED = False
|
||||
|
||||
# Disable Telnet Console (enabled by default)
|
||||
# TELNETCONSOLE_ENABLED = False
|
||||
|
||||
# Override the default request headers:
|
||||
# DEFAULT_REQUEST_HEADERS = {
|
||||
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
# 'Accept-Language': 'en',
|
||||
# }
|
||||
|
||||
# Enable or disable spider middlewares
|
||||
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
# SPIDER_MIDDLEWARES = {
|
||||
# 'sinastock.middlewares.SinastockSpiderMiddleware': 543,
|
||||
# }
|
||||
|
||||
# Enable or disable downloader middlewares
|
||||
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# DOWNLOADER_MIDDLEWARES = {
|
||||
# 'sinastock.middlewares.SinastockDownloaderMiddleware': 543,
|
||||
# }
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://doc.scrapy.org/en/latest/topics/extensions.html
|
||||
# EXTENSIONS = {
|
||||
# 'scrapy.extensions.telnet.TelnetConsole': None,
|
||||
# }
|
||||
|
||||
# Configure item pipelines
|
||||
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
ITEM_PIPELINES = {
|
||||
'crawlab.pipelines.CrawlabMongoPipeline': 300,
|
||||
}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
|
||||
# AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
# AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
# AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
# AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
# HTTPCACHE_ENABLED = True
|
||||
# HTTPCACHE_EXPIRATION_SECS = 0
|
||||
# HTTPCACHE_DIR = 'httpcache'
|
||||
# HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
||||
4
backend/app/spiders/sinastock/sinastock/spiders/__init__.py
Executable file
4
backend/app/spiders/sinastock/sinastock/spiders/__init__.py
Executable file
@@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
59
backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py
Executable file
@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
import os
import re
from datetime import datetime

import scrapy
from pymongo import MongoClient

from sinastock.items import NewsItem

class SinastockSpiderSpider(scrapy.Spider):
    name = 'sinastock_spider'
    allowed_domains = ['finance.sina.com.cn']
    mongo = MongoClient(
        host=os.environ.get('MONGO_HOST') or 'localhost',
        port=int(os.environ.get('MONGO_PORT') or 27017)
    )
    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')

    def start_requests(self):
        col = self.db['stocks']
        for s in col.find({}):
            code, ex = s['ts_code'].split('.')
            for i in range(10):
                url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={ex.lower()}{code}&Page={i + 1}'
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    meta={'ts_code': s['ts_code']}
                )

    def parse(self, response):
        for a in response.css('.datelist > ul > a'):
            url = a.css('a::attr("href")').extract_first()
            item = NewsItem(
                title=a.css('a::text').extract_first(),
                url=url,
                source='sina',
                stocks=[response.meta['ts_code']]
            )
            yield scrapy.Request(
                url=url,
                callback=self.parse_detail,
                meta={'item': item}
            )

    def parse_detail(self, response):
        item = response.meta['item']
        text = response.css('#artibody').extract_first()
        pre = re.compile('>(.*?)<')
        text = ''.join(pre.findall(text))
        item['text'] = text.replace('\u3000', '')
        item['ts_str'] = response.css('.date::text').extract_first()
        if item['text'] is None or item['ts_str'] is None:
            pass
        else:
            item['ts'] = datetime.strptime(item['ts_str'], '%Y年%m月%d日 %H:%M')
            yield item
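The timestamp handling in parse_detail above is easiest to see in isolation; the following is a minimal sketch, and the sample date string is invented for illustration only:

from datetime import datetime

# Hypothetical '.date' text as scraped from a Sina news page.
ts_str = '2020年01月10日 09:30'

# Same format string the spider uses: Chinese year/month/day markers plus HH:MM.
print(datetime.strptime(ts_str, '%Y年%m月%d日 %H:%M'))  # -> 2020-01-10 09:30:00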
54
backend/app/spiders/v2ex_config/Spiderfile
Executable file
@@ -0,0 +1,54 @@
name: "v2ex_config"
display_name: "V2ex(可配置)"
remark: "V2ex,列表+详情"
type: "configurable"
col: "results_v2ex_config"
engine: scrapy
start_url: https://v2ex.com/
start_stage: list
stages:
- name: list
  is_list: true
  list_css: .cell.item
  list_xpath: ""
  page_css: ""
  page_xpath: ""
  page_attr: href
  fields:
  - name: title
    css: a.topic-link
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
  - name: url
    css: a.topic-link
    xpath: ""
    attr: href
    next_stage: detail
    remark: ""
  - name: replies
    css: .count_livid
    xpath: ""
    attr: ""
    next_stage: ""
    remark: ""
- name: detail
  is_list: false
  list_css: ""
  list_xpath: ""
  page_css: ""
  page_xpath: ""
  page_attr: ""
  fields:
  - name: content
    css: ""
    xpath: .//*[@class="markdown_body"]
    attr: ""
    next_stage: ""
    remark: ""
settings:
  AUTOTHROTTLE_ENABLED: "true"
  ROBOTSTXT_OBEY: "false"
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
    like Gecko) Chrome/79.0.3945.117 Safari/537.36
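For reference, a Spiderfile like the one above is plain YAML. The snippet below is a minimal sketch of reading it outside Crawlab, not Crawlab's own loader; it assumes PyYAML is installed and the file sits in the working directory:

import yaml  # assumption: PyYAML available

with open('Spiderfile', encoding='utf-8') as f:
    conf = yaml.safe_load(f)

print(conf['start_url'])  # https://v2ex.com/
for stage in conf['stages']:
    # Each stage lists the fields it extracts; 'next_stage' chains list -> detail.
    print(stage['name'], [fld['name'] for fld in stage['fields']])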
0
backend/app/spiders/v2ex_config/config_spider/__init__.py
Executable file
19
backend/app/spiders/v2ex_config/config_spider/items.py
Executable file
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Item(scrapy.Item):
    _id = scrapy.Field()
    task_id = scrapy.Field()
    ts = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    replies = scrapy.Field()
    content = scrapy.Field()
103
backend/app/spiders/v2ex_config/config_spider/middlewares.py
Executable file
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ConfigSpiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ConfigSpiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
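Both classes above are the stock Scrapy middleware template and stay dormant unless they are registered in the project settings. A hedged sketch of how they could be switched on (543 is the template's example priority, not something this commit enables):

# config_spider/settings.py -- only needed if the middlewares should actually run.
SPIDER_MIDDLEWARES = {
    'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543,
}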
27
backend/app/spiders/v2ex_config/config_spider/pipelines.py
Executable file
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import os
from pymongo import MongoClient

mongo = MongoClient(
    host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
    port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017),
    username=os.environ.get('CRAWLAB_MONGO_USERNAME'),
    password=os.environ.get('CRAWLAB_MONGO_PASSWORD'),
    authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin'
)
db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
task_id = os.environ.get('CRAWLAB_TASK_ID')

class ConfigSpiderPipeline(object):
    def process_item(self, item, spider):
        item['task_id'] = task_id
        if col is not None:
            col.save(item)
        return item
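Note that Collection.save(), used in the pipeline above, was deprecated in pymongo 3.x and removed in 4.x. If this pipeline ever has to run against a newer driver, an upsert keyed on _id is a rough equivalent; the sketch below is ours (save_item is not part of this commit):

from bson import ObjectId

def save_item(col, item):
    # Mirror of col.save(): insert when _id is new, replace the document otherwise.
    doc = dict(item)
    doc.setdefault('_id', ObjectId())
    col.replace_one({'_id': doc['_id']}, doc, upsert=True)
    return doc['_id']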
111
backend/app/spiders/v2ex_config/config_spider/settings.py
Executable file
@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
import os
import re
import json

# Scrapy settings for config_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Crawlab Configurable Spider'

SPIDER_MODULES = ['config_spider.spiders']
NEWSPIDER_MODULE = 'config_spider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Crawlab Spider'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'config_spider.pipelines.ConfigSpiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]:
    setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '')
    setting_value = os.environ.get(setting_env_name)
    if setting_value.lower() == 'true':
        setting_value = True
    elif setting_value.lower() == 'false':
        setting_value = False
    elif re.search(r'^\d+$', setting_value) is not None:
        setting_value = int(setting_value)
    elif re.search(r'^\{.*\}$', setting_value.strip()) is not None:
        setting_value = json.loads(setting_value)
    elif re.search(r'^\[.*\]$', setting_value.strip()) is not None:
        setting_value = json.loads(setting_value)
    else:
        pass
    locals()[setting_name] = setting_value
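The CRAWLAB_SETTING_ loop above appears to be how the Spiderfile's settings block reaches Scrapy: each entry arrives as an environment variable and the loop coerces the string value. A standalone restatement of its conversion rules, for illustration only (the helper name coerce is ours):

import json
import re

def coerce(value):
    # Same precedence as the loop above: bool, then int, then JSON object/array, else raw string.
    if value.lower() == 'true':
        return True
    if value.lower() == 'false':
        return False
    if re.search(r'^\d+$', value):
        return int(value)
    if re.search(r'^\{.*\}$', value.strip()) or re.search(r'^\[.*\]$', value.strip()):
        return json.loads(value)
    return value

print(coerce('true'))                       # True, e.g. for CRAWLAB_SETTING_AUTOTHROTTLE_ENABLED
print(coerce('3'))                          # 3
print(coerce('{"Accept-Language": "en"}'))  # {'Accept-Language': 'en'}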
4
backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py
Executable file
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
38
backend/app/spiders/v2ex_config/config_spider/spiders/spider.py
Executable file
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item
from urllib.parse import urljoin, urlparse

def get_real_url(response, url):
    if re.search(r'^https?', url):
        return url
    elif re.search(r'^\/\/', url):
        u = urlparse(response.url)
        return u.scheme + url
    return urljoin(response.url, url)

class ConfigSpider(scrapy.Spider):
    name = 'config_spider'

    def start_requests(self):
        yield scrapy.Request(url='https://v2ex.com/', callback=self.parse_list)

    def parse_list(self, response):
        prev_item = response.meta.get('item')
        for elem in response.css('.cell.item'):
            item = Item()
            item['title'] = elem.css('a.topic-link::text').extract_first()
            item['url'] = elem.css('a.topic-link::attr("href")').extract_first()
            item['replies'] = elem.css('.count_livid::text').extract_first()
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        item = Item() if response.meta.get('item') is None else response.meta.get('item')
        item['content'] = response.xpath('string(.//*[@class="markdown_body"])').extract_first()
        yield item
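A quick sanity check of get_real_url's three branches, using a stand-in object with a .url attribute instead of a real scrapy Response (illustration only; run it in the same module so the function is in scope):

from types import SimpleNamespace

resp = SimpleNamespace(url='https://v2ex.com/?tab=all')

print(get_real_url(resp, 'https://example.com/x'))    # absolute URL returned unchanged
print(get_real_url(resp, '//cdn.example.com/a.png'))  # scheme prepended -> https://cdn.example.com/a.png
print(get_real_url(resp, '/t/1234'))                  # joined against response.url -> https://v2ex.com/t/1234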
1
backend/app/spiders/v2ex_config/md5.txt
Executable file
@@ -0,0 +1 @@
402c0a07873ef74b9b574bc0f6b28423
11
backend/app/spiders/v2ex_config/scrapy.cfg
Executable file
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = config_spider.settings

[deploy]
#url = http://localhost:6800/
project = config_spider
5
backend/app/spiders/xueqiu/Spiderfile
Executable file
@@ -0,0 +1,5 @@
name: "xueqiu"
display_name: "雪球网 (Scrapy)"
type: "customized"
col: "results_xueqiu"
cmd: "scrapy crawl xueqiu_spider"
1
backend/app/spiders/xueqiu/md5.txt
Executable file
@@ -0,0 +1 @@
df177994199caa691d87fc0c5031326d
11
backend/app/spiders/xueqiu/scrapy.cfg
Executable file
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = xueqiu.settings

[deploy]
#url = http://localhost:6800/
project = xueqiu
0
backend/app/spiders/xueqiu/xueqiu/__init__.py
Executable file
23
backend/app/spiders/xueqiu/xueqiu/items.py
Executable file
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class XueqiuItem(scrapy.Item):
    # define the fields for your item here like:
    _id = scrapy.Field()
    task_id = scrapy.Field()
    id = scrapy.Field()
    text = scrapy.Field()
    url = scrapy.Field()
    target = scrapy.Field()
    view_count = scrapy.Field()
    mark = scrapy.Field()
    created_at = scrapy.Field()
    ts = scrapy.Field()
    source = scrapy.Field()
103
backend/app/spiders/xueqiu/xueqiu/middlewares.py
Executable file
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class XueqiuSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class XueqiuDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)