diff --git a/.gitignore b/.gitignore index 0b6328c9..6eb0d9f1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .idea/ +.vscode/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/backend/app/spiders/amazon_config/Spiderfile b/backend/app/spiders/amazon_config/Spiderfile new file mode 100755 index 00000000..eea8a538 --- /dev/null +++ b/backend/app/spiders/amazon_config/Spiderfile @@ -0,0 +1,51 @@ +name: "amazon_config" +display_name: "亚马逊中国(可配置)" +remark: "亚马逊中国搜索手机,列表+分页" +type: "configurable" +col: "results_amazon_config" +engine: scrapy +start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2 +start_stage: list +stages: +- name: list + is_list: true + list_css: .s-result-item + list_xpath: "" + page_css: .a-last > a + page_xpath: "" + page_attr: href + fields: + - name: title + css: span.a-text-normal + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: .a-link-normal + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: price + css: "" + xpath: .//*[@class="a-price-whole"] + attr: "" + next_stage: "" + remark: "" + - name: price_fraction + css: "" + xpath: .//*[@class="a-price-fraction"] + attr: "" + next_stage: "" + remark: "" + - name: img + css: .s-image-square-aspect > img + xpath: "" + attr: src + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/amazon_config/config_spider/__init__.py b/backend/app/spiders/amazon_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/amazon_config/config_spider/items.py b/backend/app/spiders/amazon_config/config_spider/items.py new file mode 100755 index 00000000..79bf0adb --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/items.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + price = scrapy.Field() + price_fraction = scrapy.Field() + img = scrapy.Field() + diff --git a/backend/app/spiders/amazon_config/config_spider/middlewares.py b/backend/app/spiders/amazon_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. 
+ return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/amazon_config/config_spider/pipelines.py b/backend/app/spiders/amazon_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/amazon_config/config_spider/settings.py b/backend/app/spiders/amazon_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py b/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/spider.py b/backend/app/spiders/amazon_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..a7421df3 --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/spiders/spider.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.s-result-item'): + item = Item() + item['title'] = elem.css('span.a-text-normal::text').extract_first() + item['url'] = elem.css('.a-link-normal::attr("href")').extract_first() + item['price'] = elem.xpath('string(.//*[@class="a-price-whole"])').extract_first() + item['price_fraction'] = elem.xpath('string(.//*[@class="a-price-fraction"])').extract_first() + item['img'] = elem.css('.s-image-square-aspect > img::attr("src")').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + next_url = response.css('.a-last > a::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + diff --git a/backend/app/spiders/amazon_config/md5.txt b/backend/app/spiders/amazon_config/md5.txt new file mode 100755 index 00000000..52c5423f --- /dev/null +++ b/backend/app/spiders/amazon_config/md5.txt @@ -0,0 +1 @@ +4b716dd3c15b993ccb7a9f0be1cc0de9 diff --git a/backend/app/spiders/amazon_config/scrapy.cfg b/backend/app/spiders/amazon_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/amazon_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/autohome_config/Spiderfile b/backend/app/spiders/autohome_config/Spiderfile new file mode 100755 index 00000000..e69880cb --- /dev/null +++ b/backend/app/spiders/autohome_config/Spiderfile @@ -0,0 +1,57 @@ +name: "autohome_config" +display_name: "汽车之家(可配置)" +remark: "汽车之家文章,列表+详情+分页" +type: "configurable" +col: "results_autohome_config" +engine: scrapy 
+start_url: https://www.autohome.com.cn/all/ +start_stage: list +stages: +- name: list + is_list: true + list_css: ul.article > li + list_xpath: "" + page_css: a.page-item-next + page_xpath: "" + page_attr: href + fields: + - name: title + css: li > a > h3 + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: li > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: li > a > p + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: time + css: li > a .fn-left + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: views + css: li > a .fn-right > em:first-child + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: comments + css: li > a .fn-right > em:last-child + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/autohome_config/config_spider/__init__.py b/backend/app/spiders/autohome_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/autohome_config/config_spider/items.py b/backend/app/spiders/autohome_config/config_spider/items.py new file mode 100755 index 00000000..206203d5 --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + time = scrapy.Field() + views = scrapy.Field() + comments = scrapy.Field() + diff --git a/backend/app/spiders/autohome_config/config_spider/middlewares.py b/backend/app/spiders/autohome_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. 
+ pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/autohome_config/config_spider/pipelines.py b/backend/app/spiders/autohome_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/autohome_config/config_spider/settings.py b/backend/app/spiders/autohome_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only 
settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', 
setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py b/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/spider.py b/backend/app/spiders/autohome_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..83753f5a --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/spiders/spider.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://www.autohome.com.cn/all/', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('ul.article > li'): + item = Item() + item['title'] = elem.css('li > a > h3::text').extract_first() + item['url'] = elem.css('li > a::attr("href")').extract_first() + item['abstract'] = elem.css('li > a > p::text').extract_first() + item['time'] = elem.css('li > a .fn-left::text').extract_first() + item['views'] = elem.css('li > a .fn-right > em:first-child::text').extract_first() + item['comments'] = elem.css('li > a .fn-right > em:last-child::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + next_url = response.css('a.page-item-next::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + diff --git a/backend/app/spiders/autohome_config/md5.txt b/backend/app/spiders/autohome_config/md5.txt new file mode 100755 index 00000000..c4707adf --- /dev/null +++ b/backend/app/spiders/autohome_config/md5.txt @@ -0,0 +1 @@ +d784a11085e298eaf344eadc3a3e9411 diff --git a/backend/app/spiders/autohome_config/scrapy.cfg b/backend/app/spiders/autohome_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/autohome_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/baidu_config/Spiderfile b/backend/app/spiders/baidu_config/Spiderfile new file mode 100755 index 00000000..a29d4acb --- /dev/null +++ b/backend/app/spiders/baidu_config/Spiderfile @@ -0,0 +1,39 @@ +name: "baidu_config" +display_name: "百度搜索(可配置)" +remark: "百度搜索Crawlab,列表+分页" +type: "configurable" +col: "results_baidu_config" 
+engine: scrapy +start_url: http://www.baidu.com/s?wd=crawlab +start_stage: list +stages: +- name: list + is_list: true + list_css: ".result.c-container" + list_xpath: "" + page_css: "a.n" + page_xpath: "" + page_attr: href + fields: + - name: title + css: "" + xpath: .//h3/a + attr: "" + next_stage: "" + remark: "" + - name: url + css: "" + xpath: .//h3/a + attr: href + next_stage: "" + remark: "" + - name: abstract + css: "" + xpath: .//*[@class="c-abstract"] + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/baidu_config/config_spider/__init__.py b/backend/app/spiders/baidu_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/baidu_config/config_spider/items.py b/backend/app/spiders/baidu_config/config_spider/items.py new file mode 100755 index 00000000..9282765f --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/items.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + diff --git a/backend/app/spiders/baidu_config/config_spider/middlewares.py b/backend/app/spiders/baidu_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). 
+ for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/baidu_config/config_spider/pipelines.py b/backend/app/spiders/baidu_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/baidu_config/config_spider/settings.py b/backend/app/spiders/baidu_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py b/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/spider.py b/backend/app/spiders/baidu_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..e5fd793f --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/spiders/spider.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='http://www.baidu.com/s?wd=crawlab', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.result.c-container'): + item = Item() + item['title'] = elem.xpath('string(.//h3/a)').extract_first() + item['url'] = elem.xpath('.//h3/a/@href').extract_first() + item['abstract'] = elem.xpath('string(.//*[@class="c-abstract"])').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + next_url = response.css('a.n::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + diff --git a/backend/app/spiders/baidu_config/md5.txt b/backend/app/spiders/baidu_config/md5.txt new file mode 100755 index 00000000..32137b76 --- /dev/null +++ b/backend/app/spiders/baidu_config/md5.txt @@ -0,0 +1 @@ +ba25f6f3567b256473d3f0ec6af783fd diff --git a/backend/app/spiders/baidu_config/scrapy.cfg b/backend/app/spiders/baidu_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/baidu_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/bing_general/Spiderfile b/backend/app/spiders/bing_general/Spiderfile new file mode 100755 index 00000000..614c135e --- /dev/null +++ b/backend/app/spiders/bing_general/Spiderfile @@ -0,0 +1,6 @@ +name: "bing_general" +display_name: "必应搜索 (通用)" +remark: "必应搜索 Crawlab,列表+分页" +col: "results_bing_general" +type: "customized" +cmd: "python bing_spider.py" \ No newline at end of file diff --git a/backend/app/spiders/bing_general/bing_spider.py b/backend/app/spiders/bing_general/bing_spider.py new file mode 100755 index 00000000..e982e4ee --- /dev/null +++ b/backend/app/spiders/bing_general/bing_spider.py @@ -0,0 +1,41 @@ +import requests +from bs4 import 
BeautifulSoup as bs +from urllib.parse import urljoin, urlparse +import re +from crawlab import save_item + +s = requests.Session() + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +def start_requests(): + for i in range(0, 9): + fr = 'PERE' if not i else 'MORE' + url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}' + request_page(url) + +def request_page(url): + print(f'requesting {url}') + r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}) + parse_list(r) + +def parse_list(response): + soup = bs(response.content.decode('utf-8')) + for el in list(soup.select('#b_results > li')): + try: + save_item({ + 'title': el.select_one('h2').text, + 'url': el.select_one('h2 a').attrs.get('href'), + 'abstract': el.select_one('.b_caption p').text, + }) + except: + pass + +if __name__ == '__main__': + start_requests() \ No newline at end of file diff --git a/backend/app/spiders/bing_general/md5.txt b/backend/app/spiders/bing_general/md5.txt new file mode 100755 index 00000000..42fb6afd --- /dev/null +++ b/backend/app/spiders/bing_general/md5.txt @@ -0,0 +1 @@ +cf295b694a20c99c4857f838aa0402a7 diff --git a/backend/app/spiders/chinaz/Spiderfile b/backend/app/spiders/chinaz/Spiderfile new file mode 100755 index 00000000..2fb940bb --- /dev/null +++ b/backend/app/spiders/chinaz/Spiderfile @@ -0,0 +1,5 @@ +name: "chinaz" +display_name: "站长之家 (Scrapy)" +col: "results_chinaz" +type: "customized" +cmd: "scrapy crawl chinaz_spider" \ No newline at end of file diff --git a/backend/app/spiders/chinaz/chinaz/__init__.py b/backend/app/spiders/chinaz/chinaz/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/chinaz/chinaz/items.py b/backend/app/spiders/chinaz/chinaz/items.py new file mode 100755 index 00000000..1fdcac1b --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class ChinazItem(scrapy.Item): + # define the fields for your item here like: + _id = scrapy.Field() + task_id = scrapy.Field() + name = scrapy.Field() + domain = scrapy.Field() + description = scrapy.Field() + rank = scrapy.Field() + main_category = scrapy.Field() + category = scrapy.Field() + location = scrapy.Field() diff --git a/backend/app/spiders/chinaz/chinaz/middlewares.py b/backend/app/spiders/chinaz/chinaz/middlewares.py new file mode 100755 index 00000000..c98995d5 --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ChinazSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+ s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ChinazDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/chinaz/chinaz/pipelines.py b/backend/app/spiders/chinaz/chinaz/pipelines.py new file mode 100755 index 00000000..b29f9eb7 --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/pipelines.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + diff --git a/backend/app/spiders/chinaz/chinaz/settings.py b/backend/app/spiders/chinaz/chinaz/settings.py new file mode 100755 index 00000000..932ec9ed --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/settings.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for chinaz project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'chinaz' + +SPIDER_MODULES = ['chinaz.spiders'] +NEWSPIDER_MODULE = 'chinaz.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'chinaz (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'chinaz.middlewares.ChinazSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'chinaz.middlewares.ChinazDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests 
Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/chinaz/chinaz/spiders/__init__.py b/backend/app/spiders/chinaz/chinaz/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py b/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py new file mode 100755 index 00000000..28ad84e7 --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +import scrapy +from chinaz.items import ChinazItem + + +class ChinazSpiderSpider(scrapy.Spider): + name = 'chinaz_spider' + allowed_domains = ['chinaz.com'] + start_urls = ['http://top.chinaz.com/hangye/'] + + def parse(self, response): + for item in response.css('.listCentent > li'): + name = item.css('h3.rightTxtHead > a::text').extract_first() + href = item.css('h3.rightTxtHead > a::attr("href")').extract_first() + domain = item.css('h3.rightTxtHead > span::text').extract_first() + description = item.css('p.RtCInfo::text').extract_first() + rank = item.css('.RtCRateCent > strong::text').extract_first() + rank = int(rank) + item = ChinazItem( + _id=domain, + name=name, + domain=domain, + description=description, + rank=rank, + ) + yield scrapy.Request( + url='http://top.chinaz.com' + href, + callback=self.parse_item, + meta={ + 'item': item + } + ) + + # pagination + a_list = response.css('.ListPageWrap > a::attr("href")').extract() + url = 'http://top.chinaz.com/hangye/' + a_list[-1] + yield scrapy.Request(url=url, callback=self.parse) + + def parse_item(self, response): + item = response.meta['item'] + + # category info extraction + arr = response.css('.TopMainTag-show .SimSun') + res1 = arr[0].css('a::text').extract() + main_category = res1[0] + if len(res1) == 1: + category = '其他' + else: + category = res1[1] + + # location info extraction + res2 = arr[1].css('a::text').extract() + if len(res2) > 0: + location = res2[0] + else: + location = '其他' + + # assign values to item + item['main_category'] = main_category + item['category'] = category + item['location'] = location + + yield item diff --git a/backend/app/spiders/chinaz/md5.txt b/backend/app/spiders/chinaz/md5.txt new file mode 100755 index 00000000..f5e15fb9 --- /dev/null +++ b/backend/app/spiders/chinaz/md5.txt @@ -0,0 +1 @@ +1976593e49bf0238602ce35d051bd137 diff --git a/backend/app/spiders/chinaz/scrapy.cfg b/backend/app/spiders/chinaz/scrapy.cfg new file mode 100755 index 00000000..d3b44a1a --- /dev/null +++ b/backend/app/spiders/chinaz/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = chinaz.settings 
+ +[deploy] +#url = http://localhost:6800/ +project = chinaz diff --git a/backend/app/spiders/csdn_config/Spiderfile b/backend/app/spiders/csdn_config/Spiderfile new file mode 100755 index 00000000..67f4f8c5 --- /dev/null +++ b/backend/app/spiders/csdn_config/Spiderfile @@ -0,0 +1,60 @@ +name: "csdn_config" +display_name: "CSDN(可配置)" +remark: "CSDN Crawlab 文章,列表+详情+分页" +type: "configurable" +col: "results_csdn_config" +engine: scrapy +start_url: https://so.csdn.net/so/search/s.do?q=crawlab +start_stage: list +stages: +- name: list + is_list: true + list_css: .search-list-con > .search-list + list_xpath: "" + page_css: a.btn-next + page_xpath: "" + page_attr: href + fields: + - name: url + css: "" + xpath: .//*[@class="limit_width"]/a + attr: href + next_stage: detail + remark: "" +- name: detail + is_list: false + list_css: "" + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: content + css: "" + xpath: .//div[@id="content_views"] + attr: "" + next_stage: "" + remark: "" + - name: views + css: .read-count + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: title + css: .title-article + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: author + css: .follow-nickName + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + AUTOTHROTTLE_ENABLED: "false" + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/csdn_config/config_spider/__init__.py b/backend/app/spiders/csdn_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/csdn_config/config_spider/items.py b/backend/app/spiders/csdn_config/config_spider/items.py new file mode 100755 index 00000000..3c8e5e54 --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/items.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + url = scrapy.Field() + content = scrapy.Field() + views = scrapy.Field() + title = scrapy.Field() + author = scrapy.Field() + diff --git a/backend/app/spiders/csdn_config/config_spider/middlewares.py b/backend/app/spiders/csdn_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. 
+ return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/csdn_config/config_spider/pipelines.py b/backend/app/spiders/csdn_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/csdn_config/config_spider/settings.py b/backend/app/spiders/csdn_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
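# --- Editorial sketch (not part of the patch above) ---------------------------
# ConfigSpiderPipeline uses Collection.save(), which PyMongo deprecated in 3.0
# and removed in 4.0. An equivalent upsert on current PyMongo, assuming the
# same module-level `col` and `task_id` defined in pipelines.py, could be:

class ConfigSpiderPipelineUpsert(object):
    def process_item(self, item, spider):
        item['task_id'] = task_id
        doc = dict(item)
        if col is not None:
            if doc.get('_id') is not None:
                # Re-runs overwrite the same document instead of failing on a
                # duplicate _id.
                col.replace_one({'_id': doc['_id']}, doc, upsert=True)
            else:
                col.insert_one(doc)
        return item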
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py b/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/spider.py b/backend/app/spiders/csdn_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..9ecc4aae --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/spiders/spider.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://so.csdn.net/so/search/s.do?q=crawlab', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.search-list-con > .search-list'): + item = Item() + item['url'] = elem.xpath('.//*[@class="limit_width"]/a/@href').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) + next_url = response.css('a.btn-next::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + def parse_detail(self, response): + item = Item() if response.meta.get('item') is None else response.meta.get('item') + item['content'] = response.xpath('string(.//div[@id="content_views"])').extract_first() + item['views'] = response.css('.read-count::text').extract_first() + item['title'] = response.css('.title-article::text').extract_first() + item['author'] = response.css('.follow-nickName::text').extract_first() + yield item + + diff --git a/backend/app/spiders/csdn_config/md5.txt b/backend/app/spiders/csdn_config/md5.txt new file mode 100755 index 00000000..e169c42a --- /dev/null +++ b/backend/app/spiders/csdn_config/md5.txt @@ -0,0 +1 @@ +b6889c74e006a5e619b525d84db62ffd diff --git a/backend/app/spiders/csdn_config/scrapy.cfg b/backend/app/spiders/csdn_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/csdn_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/douban_config/Spiderfile b/backend/app/spiders/douban_config/Spiderfile new file mode 100755 index 00000000..84f0647a --- /dev/null +++ b/backend/app/spiders/douban_config/Spiderfile @@ -0,0 +1,57 @@ +name: "douban_config" 
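# --- Editorial sketch (not part of the patch above) ---------------------------
# The CRAWLAB_SETTING_* loop in config_spider/settings.py coerces values by
# shape: "true"/"false" become booleans, pure digits become ints, "{...}" and
# "[...]" are parsed as JSON, and anything else stays a string. The same rule,
# pulled out as a standalone helper purely for illustration:

import json
import re

def coerce_setting(value):
    if value.lower() == 'true':
        return True
    if value.lower() == 'false':
        return False
    if re.search(r'^\d+$', value):
        return int(value)
    if re.search(r'^\{.*\}$', value.strip()) or re.search(r'^\[.*\]$', value.strip()):
        return json.loads(value)
    return value

# e.g. CRAWLAB_SETTING_ROBOTSTXT_OBEY=false  ->  ROBOTSTXT_OBEY = False
#      CRAWLAB_SETTING_DOWNLOAD_DELAY=3      ->  DOWNLOAD_DELAY = 3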
+display_name: "豆瓣读书(可配置)" +remark: "豆瓣读书新书推荐,列表" +type: "configurable" +col: "results_douban_config" +engine: scrapy +start_url: https://book.douban.com/latest +start_stage: list +stages: +- name: list + is_list: true + list_css: ul.cover-col-4 > li + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: title + css: h2 > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: h2 > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: img + css: a.cover img + xpath: "" + attr: src + next_stage: "" + remark: "" + - name: rating + css: p.rating > .color-lightgray + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: abstract + css: p:last-child + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: info + css: .color-gray + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/douban_config/config_spider/__init__.py b/backend/app/spiders/douban_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/douban_config/config_spider/items.py b/backend/app/spiders/douban_config/config_spider/items.py new file mode 100755 index 00000000..d6959b8d --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + img = scrapy.Field() + rating = scrapy.Field() + abstract = scrapy.Field() + info = scrapy.Field() + diff --git a/backend/app/spiders/douban_config/config_spider/middlewares.py b/backend/app/spiders/douban_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. 
+ pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/douban_config/config_spider/pipelines.py b/backend/app/spiders/douban_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/douban_config/config_spider/settings.py b/backend/app/spiders/douban_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings 
considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', 
setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/douban_config/config_spider/spiders/__init__.py b/backend/app/spiders/douban_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/douban_config/config_spider/spiders/spider.py b/backend/app/spiders/douban_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..61bb648d --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/spiders/spider.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://book.douban.com/latest', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('ul.cover-col-4 > li'): + item = Item() + item['title'] = elem.css('h2 > a::text').extract_first() + item['url'] = elem.css('h2 > a::attr("href")').extract_first() + item['img'] = elem.css('a.cover img::attr("src")').extract_first() + item['rating'] = elem.css('p.rating > .color-lightgray::text').extract_first() + item['abstract'] = elem.css('p:last-child::text').extract_first() + item['info'] = elem.css('.color-gray::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + + diff --git a/backend/app/spiders/douban_config/md5.txt b/backend/app/spiders/douban_config/md5.txt new file mode 100755 index 00000000..374e3804 --- /dev/null +++ b/backend/app/spiders/douban_config/md5.txt @@ -0,0 +1 @@ +4d59a6c83b0e125d5321beae86bb93ce diff --git a/backend/app/spiders/douban_config/scrapy.cfg b/backend/app/spiders/douban_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/douban_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/jd/Spiderfile b/backend/app/spiders/jd/Spiderfile new file mode 100755 index 00000000..d090472b --- /dev/null +++ b/backend/app/spiders/jd/Spiderfile @@ -0,0 +1,5 @@ +name: "jd" +display_name: "京东 (Scrapy)" +col: "results_jd" +type: "customized" +cmd: "scrapy crawl jd_spider" \ No newline at end of file diff --git a/backend/app/spiders/jd/jd/__init__.py b/backend/app/spiders/jd/jd/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/jd/jd/items.py b/backend/app/spiders/jd/jd/items.py new file mode 100755 index 00000000..b2c5e647 
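# --- Editorial sketch (not part of the patch above) ---------------------------
# In the get_real_url() helper repeated across the config spiders in this diff,
# the protocol-relative branch returns `u.scheme + url`, which produces
# "https//host/..." without the ":". urljoin() already resolves "//host/path"
# against the response URL, so the helper can be reduced to:

import re
from urllib.parse import urljoin

def get_real_url(response, url):
    # Absolute URLs pass through unchanged; relative and protocol-relative
    # URLs are resolved against the page that yielded them.
    if re.search(r'^https?://', url):
        return url
    return urljoin(response.url, url)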
--- /dev/null +++ b/backend/app/spiders/jd/jd/items.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class JdItem(scrapy.Item): + # define the fields for your item here like: + name = scrapy.Field() + price = scrapy.Field() + url = scrapy.Field() diff --git a/backend/app/spiders/jd/jd/middlewares.py b/backend/app/spiders/jd/jd/middlewares.py new file mode 100755 index 00000000..6fceded5 --- /dev/null +++ b/backend/app/spiders/jd/jd/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class JdSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class JdDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+ + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/jd/jd/pipelines.py b/backend/app/spiders/jd/jd/pipelines.py new file mode 100755 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/app/spiders/jd/jd/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/jd/jd/settings.py b/backend/app/spiders/jd/jd/settings.py new file mode 100755 index 00000000..ef89ed0c --- /dev/null +++ b/backend/app/spiders/jd/jd/settings.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for jd project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'jd' + +SPIDER_MODULES = ['jd.spiders'] +NEWSPIDER_MODULE = 'jd.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'jd (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'jd.middlewares.JdSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'jd.middlewares.JdDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html 
+#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/jd/jd/spiders/__init__.py b/backend/app/spiders/jd/jd/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/jd/jd/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/jd/jd/spiders/jd_spider.py b/backend/app/spiders/jd/jd/spiders/jd_spider.py new file mode 100755 index 00000000..4ec94fa9 --- /dev/null +++ b/backend/app/spiders/jd/jd/spiders/jd_spider.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +import scrapy + +from jd.items import JdItem + + +class JdSpiderSpider(scrapy.Spider): + name = 'jd_spider' + allowed_domains = ['jd.com'] + + def start_requests(self): + for i in range(1, 50): + yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}') + + def parse(self, response): + for el in response.css('.gl-item'): + yield JdItem( + url=el.css('.p-name > a::attr("href")').extract_first(), + name=el.css('.p-name > a::attr("title")').extract_first(), + price=float(el.css('.p-price i::text').extract_first()), + ) diff --git a/backend/app/spiders/jd/md5.txt b/backend/app/spiders/jd/md5.txt new file mode 100755 index 00000000..dcd53f51 --- /dev/null +++ b/backend/app/spiders/jd/md5.txt @@ -0,0 +1 @@ +621486d31459514eb27a082d159d9b8c diff --git a/backend/app/spiders/jd/scrapy.cfg b/backend/app/spiders/jd/scrapy.cfg new file mode 100755 index 00000000..87cf0280 --- /dev/null +++ b/backend/app/spiders/jd/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = jd.settings + +[deploy] +#url = http://localhost:6800/ +project = jd diff --git a/backend/app/spiders/sinastock/Spiderfile b/backend/app/spiders/sinastock/Spiderfile new file mode 100755 index 00000000..b110cb48 --- /dev/null +++ b/backend/app/spiders/sinastock/Spiderfile @@ -0,0 +1,5 @@ +name: "sinastock" +display_name: "新浪股票 (Scrapy)" +type: "customized" +col: "results_sinastock" +cmd: "scrapy crawl sinastock_spider" \ No newline at end of file diff --git a/backend/app/spiders/sinastock/md5.txt b/backend/app/spiders/sinastock/md5.txt new file mode 100755 index 00000000..1e5d8ab9 --- /dev/null +++ b/backend/app/spiders/sinastock/md5.txt @@ -0,0 +1 @@ +80bc091fa45ef4a85c9f1a66c81a4ed7 diff --git a/backend/app/spiders/sinastock/scrapy.cfg b/backend/app/spiders/sinastock/scrapy.cfg new file mode 100755 index 00000000..4969ad96 --- /dev/null +++ b/backend/app/spiders/sinastock/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: 
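# --- Editorial sketch (not part of the patch above) ---------------------------
# In jd_spider.parse(), `float(el.css('.p-price i::text').extract_first())`
# raises TypeError for listings without a price node. A defensive version of
# the item construction, using the same JdItem fields; the helper name is
# illustrative only:

from jd.items import JdItem

def build_jd_item(el):
    price_raw = el.css('.p-price i::text').extract_first()
    return JdItem(
        url=el.css('.p-name > a::attr("href")').extract_first(),
        name=el.css('.p-name > a::attr("title")').extract_first(),
        price=float(price_raw) if price_raw else None,
    )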
scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = sinastock.settings + +[deploy] +#url = http://localhost:6800/ +project = sinastock diff --git a/backend/app/spiders/sinastock/sinastock/__init__.py b/backend/app/spiders/sinastock/sinastock/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/sinastock/sinastock/items.py b/backend/app/spiders/sinastock/sinastock/items.py new file mode 100755 index 00000000..6e3e5d8e --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class NewsItem(scrapy.Item): + # define the fields for your item here like: + _id = scrapy.Field() + title = scrapy.Field() + ts_str = scrapy.Field() + ts = scrapy.Field() + url = scrapy.Field() + text = scrapy.Field() + task_id = scrapy.Field() + source = scrapy.Field() + stocks = scrapy.Field() diff --git a/backend/app/spiders/sinastock/sinastock/middlewares.py b/backend/app/spiders/sinastock/sinastock/middlewares.py new file mode 100755 index 00000000..912b5e57 --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class SinastockSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class SinastockDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+ s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/sinastock/sinastock/pipelines.py b/backend/app/spiders/sinastock/sinastock/pipelines.py new file mode 100755 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/sinastock/sinastock/settings.py b/backend/app/spiders/sinastock/sinastock/settings.py new file mode 100755 index 00000000..3e01d3ca --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/settings.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for sinastock project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'sinastock' + +SPIDER_MODULES = ['sinastock.spiders'] +NEWSPIDER_MODULE = 'sinastock.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +# USER_AGENT = 'sinastock (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'sinastock.middlewares.SinastockSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'sinastock.middlewares.SinastockDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/sinastock/sinastock/spiders/__init__.py b/backend/app/spiders/sinastock/sinastock/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git a/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py b/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py new file mode 100755 index 00000000..54daf763 --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +import os +import re +from datetime import datetime + +import scrapy +from pymongo import MongoClient + +from sinastock.items import NewsItem + +class SinastockSpiderSpider(scrapy.Spider): + name = 'sinastock_spider' + allowed_domains = ['finance.sina.com.cn'] + mongo = MongoClient( + host=os.environ.get('MONGO_HOST') or 'localhost', + port=int(os.environ.get('MONGO_PORT') or 27017) + ) + db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test'] + col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news') + + def start_requests(self): + col = self.db['stocks'] + for s in col.find({}): + code, ex = s['ts_code'].split('.') + for i in range(10): + url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={ex.lower()}{code}&Page={i + 1}' + yield scrapy.Request( + url=url, + callback=self.parse, + meta={'ts_code': s['ts_code']} + ) + + def parse(self, response): + for a in response.css('.datelist > ul > a'): + url = a.css('a::attr("href")').extract_first() + item = NewsItem( + title=a.css('a::text').extract_first(), + url=url, + source='sina', + stocks=[response.meta['ts_code']] + ) + yield scrapy.Request( + url=url, + callback=self.parse_detail, + meta={'item': item} + ) + + def parse_detail(self, response): + item = response.meta['item'] + text = response.css('#artibody').extract_first() + pre = re.compile('>(.*?)<') + text = ''.join(pre.findall(text)) + item['text'] = text.replace('\u3000', '') + item['ts_str'] = response.css('.date::text').extract_first() + if item['text'] is None or item['ts_str'] is None: + pass + else: + item['ts'] = datetime.strptime(item['ts_str'], '%Y年%m月%d日 %H:%M') + yield item diff --git a/backend/app/spiders/v2ex_config/Spiderfile b/backend/app/spiders/v2ex_config/Spiderfile new file mode 100755 index 00000000..bb18d40a --- /dev/null +++ b/backend/app/spiders/v2ex_config/Spiderfile @@ -0,0 +1,54 @@ +name: "v2ex_config" +display_name: "V2ex(可配置)" +remark: "V2ex,列表+详情" +type: "configurable" +col: "results_v2ex_config" +engine: scrapy +start_url: https://v2ex.com/ +start_stage: list +stages: +- name: list + is_list: true + list_css: .cell.item + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: href + fields: + - name: title + css: a.topic-link + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: a.topic-link + xpath: "" + attr: href + next_stage: detail + remark: "" + - name: replies + css: .count_livid + xpath: "" + attr: "" + next_stage: "" + remark: "" +- name: detail + is_list: false + list_css: "" + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: content + css: "" + xpath: .//*[@class="markdown_body"] + attr: "" + next_stage: "" + remark: "" +settings: + AUTOTHROTTLE_ENABLED: "true" + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/v2ex_config/config_spider/__init__.py b/backend/app/spiders/v2ex_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/v2ex_config/config_spider/items.py 
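# --- Editorial sketch (not part of the patch above) ---------------------------
# sinastock_spider.parse_detail() strips tags from '#artibody' by regex-matching
# text between '>' and '<'. The string() conversion already used by the
# configurable spiders in this diff does the same job without the intermediate
# HTML round-trip; a sketch assuming the same '#artibody' container, with an
# illustrative helper name:

def extract_article_text(response):
    # string() concatenates every text node under #artibody; the full-width
    # spaces removed by the original code are stripped here as well.
    text = response.xpath('string(//*[@id="artibody"])').extract_first() or ''
    return text.replace('\u3000', '').strip()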
b/backend/app/spiders/v2ex_config/config_spider/items.py new file mode 100755 index 00000000..d2c01a06 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/items.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + replies = scrapy.Field() + content = scrapy.Field() + diff --git a/backend/app/spiders/v2ex_config/config_spider/middlewares.py b/backend/app/spiders/v2ex_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+ + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/v2ex_config/config_spider/pipelines.py b/backend/app/spiders/v2ex_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/v2ex_config/config_spider/settings.py b/backend/app/spiders/v2ex_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py b/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py b/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..4763e040 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://v2ex.com/', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.cell.item'): + item = Item() + item['title'] = elem.css('a.topic-link::text').extract_first() + item['url'] = elem.css('a.topic-link::attr("href")').extract_first() + item['replies'] = elem.css('.count_livid::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) + + def parse_detail(self, response): + item = Item() if response.meta.get('item') is None else response.meta.get('item') + item['content'] = response.xpath('string(.//*[@class="markdown_body"])').extract_first() + yield item + + diff --git a/backend/app/spiders/v2ex_config/md5.txt b/backend/app/spiders/v2ex_config/md5.txt new file mode 100755 index 00000000..5d725b2c --- /dev/null +++ b/backend/app/spiders/v2ex_config/md5.txt @@ -0,0 +1 @@ +402c0a07873ef74b9b574bc0f6b28423 diff --git a/backend/app/spiders/v2ex_config/scrapy.cfg b/backend/app/spiders/v2ex_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/v2ex_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/xueqiu/Spiderfile b/backend/app/spiders/xueqiu/Spiderfile new file mode 100755 index 00000000..38aa5dbe --- /dev/null +++ b/backend/app/spiders/xueqiu/Spiderfile @@ -0,0 +1,5 @@ +name: "xueqiu" +display_name: "雪球网 (Scrapy)" +type: "customized" +col: "results_xueqiu" +cmd: "scrapy crawl xueqiu_spider" \ No newline at end of file diff --git a/backend/app/spiders/xueqiu/md5.txt b/backend/app/spiders/xueqiu/md5.txt new file mode 100755 index 00000000..6a9a2072 --- /dev/null +++ b/backend/app/spiders/xueqiu/md5.txt @@ -0,0 +1 @@ 
+df177994199caa691d87fc0c5031326d diff --git a/backend/app/spiders/xueqiu/scrapy.cfg b/backend/app/spiders/xueqiu/scrapy.cfg new file mode 100755 index 00000000..2c5ce3b3 --- /dev/null +++ b/backend/app/spiders/xueqiu/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = xueqiu.settings + +[deploy] +#url = http://localhost:6800/ +project = xueqiu diff --git a/backend/app/spiders/xueqiu/xueqiu/__init__.py b/backend/app/spiders/xueqiu/xueqiu/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/xueqiu/xueqiu/items.py b/backend/app/spiders/xueqiu/xueqiu/items.py new file mode 100755 index 00000000..5471594d --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/items.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class XueqiuItem(scrapy.Item): + # define the fields for your item here like: + _id = scrapy.Field() + task_id = scrapy.Field() + id = scrapy.Field() + text = scrapy.Field() + url = scrapy.Field() + target = scrapy.Field() + view_count = scrapy.Field() + mark = scrapy.Field() + created_at = scrapy.Field() + ts = scrapy.Field() + source = scrapy.Field() diff --git a/backend/app/spiders/xueqiu/xueqiu/middlewares.py b/backend/app/spiders/xueqiu/xueqiu/middlewares.py new file mode 100755 index 00000000..f60102ce --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class XueqiuSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class XueqiuDownloaderMiddleware(object): + # Not all methods need to be defined. 
If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu/xueqiu/pipelines.py b/backend/app/spiders/xueqiu/xueqiu/pipelines.py new file mode 100755 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/xueqiu/xueqiu/settings.py b/backend/app/spiders/xueqiu/xueqiu/settings.py new file mode 100755 index 00000000..1d898e2f --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/settings.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for xueqiu project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'xueqiu' + +SPIDER_MODULES = ['xueqiu.spiders'] +NEWSPIDER_MODULE = 'xueqiu.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'xueqiu.middlewares.XueqiuSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'xueqiu.middlewares.XueqiuDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py b/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
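
Note on the config_spider/settings.py files added in this diff (the v2ex_config copy above, and the identical xueqiu_config and zongheng_config copies further below): each one ends with a loop that copies every CRAWLAB_SETTING_* environment variable into the Scrapy settings module, coercing "true"/"false" to booleans, digit-only strings to ints, and "{...}"/"[...]" strings to JSON before assigning them via locals(). This is how the ROBOTSTXT_OBEY and USER_AGENT values declared under settings: in each Spiderfile reach Scrapy at runtime. The standalone sketch below only mirrors that coercion logic for illustration; the helper name coerce_setting and the example values are illustrative and are not part of the diff.

    import json
    import re

    def coerce_setting(value: str):
        # Mirror the CRAWLAB_SETTING_* parsing used in config_spider/settings.py:
        # booleans, integers and JSON objects/arrays are converted,
        # anything else is kept as a plain string.
        if value.lower() == 'true':
            return True
        if value.lower() == 'false':
            return False
        if re.search(r'^\d+$', value):
            return int(value)
        if re.search(r'^\{.*\}$', value.strip()) or re.search(r'^\[.*\]$', value.strip()):
            return json.loads(value)
        return value

    # Example (hypothetical environment value): CRAWLAB_SETTING_ROBOTSTXT_OBEY=false
    assert coerce_setting('false') is False
    assert coerce_setting('16') == 16
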
diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py b/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py new file mode 100755 index 00000000..a746e156 --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +import json +from datetime import datetime +from time import sleep + +import scrapy + +from xueqiu.items import XueqiuItem + + +class XueqiuSpiderSpider(scrapy.Spider): + name = 'xueqiu_spider' + allowed_domains = ['xueqiu.com'] + + def start_requests(self): + return [scrapy.Request( + url='https://xueqiu.com', + callback=self.parse_home + )] + + def parse_home(self, response): + yield scrapy.Request( + url='https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=20&category=6' + ) + + def parse(self, response): + data = json.loads(response.body) + next_max_id = data.get('next_max_id') + sleep(1) + for row in data.get('list'): + d = json.loads(row.get('data')) + item = XueqiuItem( + id=d['id'], + text=d['text'], + mark=d['mark'], + url=d['target'], + created_at=d['created_at'], + ts=datetime.fromtimestamp(d['created_at'] / 1e3), + view_count=d['view_count'], + source='xueqiu' + ) + yield item + + yield scrapy.Request( + url=f'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id={next_max_id}&count=20&category=6' + ) diff --git a/backend/app/spiders/xueqiu_config/Spiderfile b/backend/app/spiders/xueqiu_config/Spiderfile new file mode 100755 index 00000000..0de50e9e --- /dev/null +++ b/backend/app/spiders/xueqiu_config/Spiderfile @@ -0,0 +1,39 @@ +name: "xueqiu_config" +display_name: "雪球网(可配置)" +remark: "雪球网新闻,列表" +type: "configurable" +col: "results_xueqiu_config" +engine: scrapy +start_url: https://xueqiu.com/ +start_stage: list +stages: +- name: list + is_list: true + list_css: "" + list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")] + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: title + css: h3 > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: h3 > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: p + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/xueqiu_config/config_spider/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/xueqiu_config/config_spider/items.py b/backend/app/spiders/xueqiu_config/config_spider/items.py new file mode 100755 index 00000000..9282765f --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/items.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + diff --git a/backend/app/spiders/xueqiu_config/config_spider/middlewares.py b/backend/app/spiders/xueqiu_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# 
Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu_config/config_spider/pipelines.py b/backend/app/spiders/xueqiu_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/xueqiu_config/config_spider/settings.py b/backend/app/spiders/xueqiu_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..79d4636b --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://xueqiu.com/', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.xpath('.//*[contains(@class, "AnonymousHome_home__timeline__item")]'): + item = Item() + item['title'] = elem.css('h3 > a::text').extract_first() + item['url'] = elem.css('h3 > a::attr("href")').extract_first() + item['abstract'] = elem.css('p::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + + diff --git a/backend/app/spiders/xueqiu_config/md5.txt b/backend/app/spiders/xueqiu_config/md5.txt new file mode 100755 index 00000000..39a6df77 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/md5.txt @@ -0,0 +1 @@ +e3da3aacb2d290cb179a79028fbfff9c diff --git a/backend/app/spiders/xueqiu_config/scrapy.cfg b/backend/app/spiders/xueqiu_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/zongheng_config/Spiderfile b/backend/app/spiders/zongheng_config/Spiderfile new file mode 100755 index 00000000..0163fac7 --- /dev/null +++ b/backend/app/spiders/zongheng_config/Spiderfile @@ -0,0 +1,45 @@ +name: "zongheng_config" +display_name: "纵横(可配置)" +remark: "纵横小说网,列表" +type: "configurable" +col: "results_zongheng_config" +engine: scrapy +start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1 +start_stage: list +stages: +- name: list + is_list: true + list_css: .rank_d_list + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: href + fields: + - name: title + css: .rank_d_b_name > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: .rank_d_b_name > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: body + xpath: "" + attr: "" + 
next_stage: "" + remark: "" + - name: votes + css: .rank_d_b_ticket + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/zongheng_config/config_spider/__init__.py b/backend/app/spiders/zongheng_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/zongheng_config/config_spider/items.py b/backend/app/spiders/zongheng_config/config_spider/items.py new file mode 100755 index 00000000..528c3187 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/items.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + votes = scrapy.Field() + diff --git a/backend/app/spiders/zongheng_config/config_spider/middlewares.py b/backend/app/spiders/zongheng_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+ s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/zongheng_config/config_spider/pipelines.py b/backend/app/spiders/zongheng_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/zongheng_config/config_spider/settings.py b/backend/app/spiders/zongheng_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py b/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py b/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..cf1b6a08 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='http://www.zongheng.com/rank/details.html?rt=1&d=1', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.rank_d_list'): + item = Item() + item['title'] = elem.css('.rank_d_b_name > a::text').extract_first() + item['url'] = elem.css('.rank_d_b_name > a::attr("href")').extract_first() + item['abstract'] = elem.css('body::text').extract_first() + item['votes'] = elem.css('.rank_d_b_ticket::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + + diff --git a/backend/app/spiders/zongheng_config/md5.txt b/backend/app/spiders/zongheng_config/md5.txt new file mode 100755 index 00000000..46fd3de6 --- /dev/null +++ b/backend/app/spiders/zongheng_config/md5.txt @@ -0,0 +1 @@ +82cb98a6103fb878501df81f191703ba diff --git a/backend/app/spiders/zongheng_config/scrapy.cfg b/backend/app/spiders/zongheng_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/zongheng_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/conf/config.yml b/backend/conf/config.yml index 17341e95..1c2c8507 100644 --- a/backend/conf/config.yml +++ b/backend/conf/config.yml @@ -46,6 +46,8 @@ setting: demoSpiders: "N" checkScrapy: "Y" autoInstall: "Y" + esClient: "" # Your ES client, for example, http://192.168.1.1:9200 or http://your-domain.com, if not use es, set empty + spiderLogIndex: "spider-log" # Index pattern for kibana, need to config on kibana notification: mail: server: '' diff --git a/backend/config/config.go b/backend/config/config.go index e4c4616c..79be808e 100644 --- a/backend/config/config.go +++ b/backend/config/config.go @@ -53,3 +53,5 @@ func InitConfig(cfg string) error { return nil } + + diff --git a/backend/database/es_base.go 
b/backend/database/es_base.go new file mode 100644 index 00000000..b255958a --- /dev/null +++ b/backend/database/es_base.go @@ -0,0 +1,44 @@ +package database + +import ( + "context" + "github.com/apex/log" + "github.com/olivere/elastic/v7" + "github.com/satori/go.uuid" + "github.com/spf13/viper" + "sync" + "time" +) + +var doOnce sync.Once +var ctx context.Context +var ESClient *elastic.Client + +func InitEsClient() { + esClientStr := viper.GetString("setting.esClient") + ctx = context.Background() + ESClient, _ = elastic.NewClient(elastic.SetURL(esClientStr), elastic.SetSniff(false)) +} + +// WriteMsg will write the msg and level into es +func WriteMsgToES(when time.Time, msg chan string, index string) { + doOnce.Do(InitEsClient) + vals := make(map[string]interface{}) + vals["@timestamp"] = when.Format(time.RFC3339) + for { + select { + case vals["@msg"] = <-msg: + uid := uuid.NewV4().String() + _, err := ESClient.Index().Index(index).Id(uid).BodyJson(vals).Refresh("wait_for").Do(ctx) + if err != nil { + log.Error(err.Error()) + log.Error("send msg log to es error") + return + } + case <-time.After(6 * time.Second): + return + } + } + + return +} diff --git a/backend/go.mod b/backend/go.mod index d91a1a84..7503389a 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -3,7 +3,10 @@ module crawlab go 1.12 require ( + github.com/Masterminds/semver v1.4.2 // indirect + github.com/Masterminds/sprig v2.16.0+incompatible // indirect github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd + github.com/aokoli/goutils v1.0.1 // indirect github.com/apex/log v1.1.1 github.com/dgrijalva/jwt-go v3.2.0+incompatible github.com/fsnotify/fsnotify v1.4.7 @@ -12,15 +15,21 @@ require ( github.com/go-playground/locales v0.12.1 // indirect github.com/go-playground/universal-translator v0.16.0 // indirect github.com/gomodule/redigo v2.0.0+incompatible + github.com/huandu/xstrings v1.2.0 // indirect + github.com/imdario/mergo v0.3.6 // indirect github.com/imroc/req v0.2.4 + github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0 // indirect github.com/leodido/go-urn v1.1.0 // indirect github.com/matcornic/hermes v1.2.0 - github.com/matcornic/hermes/v2 v2.0.2 // indirect - github.com/pkg/errors v0.8.1 - github.com/royeo/dingrobot v1.0.0 // indirect + github.com/mattn/go-runewidth v0.0.3 // indirect + github.com/olekukonko/tablewriter v0.0.1 // indirect + github.com/olivere/elastic/v7 v7.0.14 + github.com/pkg/errors v0.9.1 github.com/satori/go.uuid v1.2.0 + github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337 github.com/spf13/viper v1.4.0 + github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect gopkg.in/go-playground/validator.v9 v9.29.1 gopkg.in/gomail.v2 v2.0.0-20150902115704-41f357289737 diff --git a/backend/go.sum b/backend/go.sum index 463abbee..1a253f5d 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -8,9 +8,11 @@ github.com/Masterminds/sprig v2.16.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuN github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd h1:+CYOsXi89xOqBkj7CuEJjA2It+j+R3ngUZEydr6mtkw= github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd/go.mod h1:wngxua9XCNjvHjDiTiV26DaKDT+0c63QR6H5hjVUUxw= +github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7 h1:uSoVVbwJiQipAclBbw+8quDsfcvFjOpI5iCf4p/cqCs= 
github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7/go.mod h1:6zEj6s6u/ghQa61ZWa/C2Aw3RkjiTBOix7dkqa1VLIs= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 h1:kFOfPq6dUM1hTo4JG6LR5AXSUEsOjtdm0kw0FtQtMJA= github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c= github.com/aokoli/goutils v1.0.1 h1:7fpzNGoJ3VA8qcrm++XEE1QUe0mIwNeLa02Nwq7RDkg= github.com/aokoli/goutils v1.0.1/go.mod h1:SijmP0QR8LtwsmDs8Yii5Z/S4trXFGFC2oO5g9DP+DQ= @@ -19,8 +21,10 @@ github.com/apex/log v1.1.1/go.mod h1:Ls949n1HFtXfbDcjiTTFQqkVUrte0puoIBfO3SVgwOA github.com/aphistic/golf v0.0.0-20180712155816-02c07f170c5a/go.mod h1:3NqKYiepwy8kCu4PNA+aP7WUV72eXWJeP9/r3/K9aLE= github.com/aphistic/sweet v0.2.0/go.mod h1:fWDlIh/isSE9n6EPsRmC0det+whmX6dJid3stzu0Xys= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/aws/aws-sdk-go v1.20.6/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= +github.com/aws/aws-sdk-go v1.30.7/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0= github.com/aybabtme/rgbterm v0.0.0-20170906152045-cc83f3b3ce59/go.mod h1:q/89r3U2H7sSsE2t6Kca0lfwTK8JdoNGS/yzM/4iH5I= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= @@ -41,7 +45,10 @@ github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8 github.com/emirpasic/gods v1.12.0 h1:QAUIPSaCu4G+POclxeqb3F+WPpdKqFGlw36+yOzGlrg= github.com/emirpasic/gods v1.12.0/go.mod h1:YfzfFFoVP/catgzJb4IKIqXjX78Ha8FMSDh3ymbK86o= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 h1:BHsljHzVlRcyQhjrss6TZTdY2VfCqZPbv5k3iBFa2ZQ= github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc= +github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= +github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= @@ -49,6 +56,7 @@ github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3 h1:t8FVkw33L+wilf2 github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= github.com/gin-gonic/gin v1.4.0 h1:3tMoCCfM7ppqsR0ptz/wi1impNpT7/9wQtMZ8lr1mCQ= github.com/gin-gonic/gin v1.4.0/go.mod h1:OW2EZn3DO8Ln9oIKOvM++LBO+5UPHJJDH72/q/3rZdM= +github.com/gliderlabs/ssh v0.2.2 h1:6zsha5zo/TWhRhwqCD3+EarCAgZ2yN28ipRnGPnwkI0= github.com/gliderlabs/ssh v0.2.2/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= github.com/globalsign/mgo v0.0.0-20181015135952-eeefdecb41b8 
h1:DujepqpGd1hyOd7aW59XpK7Qymp8iy83xq74fLr21is= github.com/globalsign/mgo v0.0.0-20181015135952-eeefdecb41b8/go.mod h1:xkRDCp4j0OGD1HRkm4kmhM+pmpv3AKq5SU7GMg4oO/Q= @@ -59,11 +67,13 @@ github.com/go-playground/locales v0.12.1 h1:2FITxuFt/xuCNP1Acdhv62OzaCiviiE4kotf github.com/go-playground/locales v0.12.1/go.mod h1:IUMDtCfWo/w/mtMfIE/IG2K+Ey3ygWanZIBtBW0W2TM= github.com/go-playground/universal-translator v0.16.0 h1:X++omBR/4cE2MNg91AoC3rmGrCjJ8eAeUP/K/EKx4DM= github.com/go-playground/universal-translator v0.16.0/go.mod h1:1AnU7NaIRDWWzGEKwgtJRd2xk99HeFyHw3yid4rvQIY= +github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= @@ -72,8 +82,10 @@ github.com/gomodule/redigo v2.0.0+incompatible h1:K/R+8tc58AaqLkqG2Ol3Qk+DR/TlNu github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/uuid v1.1.1 h1:Gkbcsh/GbpXz7lPftLA3P6TYMwjCLYm83jiFQZF/3gY= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= @@ -97,6 +109,7 @@ github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOl github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo= github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= +github.com/jmespath/go-jmespath v0.3.0/go.mod h1:9QtRXoHjLGCJ5IBSaohpXITPlowMeeYCZ7fLUTSywik= github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= github.com/jpillora/backoff v0.0.0-20180909062703-3050d21c67d7/go.mod h1:2iMrUgbbvHEiQClaW2NsSzMyGHqN+rDFqY705q49KG0= github.com/json-iterator/go v1.1.6 h1:MrUvLMLTMxbqFJ9kzlvat/rYZqZnW3u4wkLzWTaFwKs= @@ -120,9 +133,10 @@ github.com/leodido/go-urn v1.1.0 h1:Sm1gr51B1kKyfD2BlRcLSiEkffoG96g6TPv6eRoEiB8= 
github.com/leodido/go-urn v1.1.0/go.mod h1:+cyI34gQWZcE1eQU7NVgKkkzdXDQHr1dBMtdAPozLkw=
github.com/magiconair/properties v1.8.0 h1:LLgXmsheXeRoUOBOjtwPQCWIYqM/LU1ayDtDePerRcY=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
+github.com/mailru/easyjson v0.7.1 h1:mdxE1MF9o53iCb2Ghj1VfWvh7ZOwHpnVG/xwXrV90U8=
+github.com/mailru/easyjson v0.7.1/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs=
github.com/matcornic/hermes v1.2.0 h1:AuqZpYcTOtTB7cahdevLfnhIpfzmpqw5Czv8vpdnFDU=
github.com/matcornic/hermes v1.2.0/go.mod h1:lujJomb016Xjv8wBnWlNvUdtmvowjjfkqri5J/+1hYc=
-github.com/matcornic/hermes/v2 v2.0.2/go.mod h1:iVsJWSIS4NtMNtgan22sy6lt7pImok7bATGPWCoaKNY=
github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ=
github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
@@ -145,14 +159,19 @@ github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRW
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
github.com/olekukonko/tablewriter v0.0.1 h1:b3iUnf1v+ppJiOfNX4yxxqfWKMQPZR5yoh8urCTFX88=
github.com/olekukonko/tablewriter v0.0.1/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo=
+github.com/olivere/elastic/v7 v7.0.14 h1:89dYPg6kD3WJx42ZtO4U6WDIzRy69FvQqz/yRiwekuM=
+github.com/olivere/elastic/v7 v7.0.14/go.mod h1:+FgncZ8ho1QF3NlBo77XbuoTKYHhvEOfFZKIAfHnnDE=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/gomega v1.5.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
+github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/pelletier/go-buffruneio v0.2.0/go.mod h1:JkE26KsDizTr40EUHkXVtNPvgGtbSNq5BcowyYOWdKo=
github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc=
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
@@ -166,9 +185,6 @@ github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7z
github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU=
github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg=
github.com/rogpeppe/fastuuid v1.1.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
-github.com/royeo/dingrobot v1.0.0 h1:K4GrF+fOecNX0yi+oBKpfh7z0XP/8TzaIIHu1B2kKUQ=
-github.com/royeo/dingrobot v1.0.0/go.mod h1:RqDM8E/hySCVwI2aUFRJAUGDcHHRnIhzNmbNG3bamQs=
-github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww=
github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ=
@@ -205,6 +221,9 @@ github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoH
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
+github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4=
+github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/tj/assert v0.0.0-20171129193455-018094318fb0/go.mod h1:mZ9/Rh9oLWpLLDRpvE+3b7gP/C2YyLFYxNmcLnPTMe0=
github.com/tj/go-elastic v0.0.0-20171221160941-36157cbbebc2/go.mod h1:WjeM0Oo1eNAjXGDx2yma7uG2XoyRZTq1uv3M/o7imD0=
github.com/tj/go-kinesis v0.0.0-20171128231115-08b17f58cb1b/go.mod h1:/yhzCV0xPfx6jb1bBgRFjl5lytqVqZXEaeqWP8lTEao=
@@ -217,24 +236,27 @@ github.com/xanzy/ssh-agent v0.2.1/go.mod h1:mLlQY/MoOhWBj+gOGMQkOeiEvkx+8pJSI+0B
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
+go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
-golang.org/x/crypto v0.0.0-20181029175232-7e6ffbd03851/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190219172222-a4c6cb3142f2/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734 h1:p/H982KKEjUnLJkM3tt/LemDnOc1GiZL5FCVlORJ5zo=
golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 h1:HuIa8hRrWRSrqYzx1qI49NNxhdi2PrY7gxVSq1JjLDc=
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
+golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
+golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
-golang.org/x/net v0.0.0-20181029044818-c44066c5c816/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
@@ -243,10 +265,13 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwL
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20190724013045-ca1201d0de80 h1:Ao/3l156eZf2AW5wK8a7/smtodRU+gha3+BeqJ69lRk=
golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI=
+golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -258,6 +283,7 @@ golang.org/x/sys v0.0.0-20190221075227-b4e8571b14e0/go.mod h1:STP8DvDyc/dI5b8T5h
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e h1:D5TXcfTk7xF7hvieo4QErS3qqCB4teTffacDWr7CI+0=
golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
@@ -268,12 +294,18 @@ golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxb
golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190729092621-ff9f1409240a/go.mod h1:jcCCGcm9btYwXyDqrUWc6MKQKKGJCWEQ3AfLSRIbEuI=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
+google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
+google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
+google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
google.golang.org/grpc v1.21.0/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc h1:2gGKlE2+asNV9m7xrywl36YYNnBG5ZQ0r/BOOxqPpmk=
@@ -295,6 +327,7 @@ gopkg.in/russross/blackfriday.v2 v2.0.0 h1:+FlnIV8DSQnT7NZ43hcVKcdJdzZoeCmJj4Ql8
gopkg.in/russross/blackfriday.v2 v2.0.0/go.mod h1:6sSBNz/GtOm/pJTuh5UmBK2ZHfmnxGbl2NZg1UliSOI=
gopkg.in/src-d/go-billy.v4 v4.3.2 h1:0SQA1pRztfTFx2miS8sA97XvooFeNOmvUenF4o0EcVg=
gopkg.in/src-d/go-billy.v4 v4.3.2/go.mod h1:nDjArDMp+XMs1aFAESLRjfGSgfvoYN0hDfzEk0GjC98=
+gopkg.in/src-d/go-git-fixtures.v3 v3.5.0 h1:ivZFOIltbce2Mo8IjzUHAFoq/IylO9WHhNOAJK+LsJg=
gopkg.in/src-d/go-git-fixtures.v3 v3.5.0/go.mod h1:dLBcvytrw/TYZsNTWCnkNF2DSIlzWYqTe3rJR56Ac7g=
gopkg.in/src-d/go-git.v4 v4.13.1 h1:SRtFyV8Kxc0UP7aCHcijOMQGPxHSmMOPrzulQWolkYE=
gopkg.in/src-d/go-git.v4 v4.13.1/go.mod h1:nx5NYcxdKxq5fpltdHnPa2Exj4Sx0EclMWZQbYDu2z8=
diff --git a/backend/model/task.go b/backend/model/task.go
index 35e738ab..0b2ed0a9 100644
--- a/backend/model/task.go
+++ b/backend/model/task.go
@@ -508,3 +508,4 @@ func UpdateTaskErrorLogs(taskId string, errorRegexPattern string) error {
	return nil
}
+
diff --git a/backend/services/task.go b/backend/services/task.go
index a0bb9a49..16278d24 100644
--- a/backend/services/task.go
+++ b/backend/services/task.go
@@ -16,7 +16,7 @@ import (
	"github.com/apex/log"
	"github.com/globalsign/mgo/bson"
	"github.com/imroc/req"
-	uuid "github.com/satori/go.uuid"
+	"github.com/satori/go.uuid"
	"github.com/spf13/viper"
	"net/http"
	"os"
@@ -166,7 +166,11 @@ func SetEnv(cmd *exec.Cmd, envs []model.Env, task model.Task, spider model.Spide
	return cmd
}

-func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error {
+func SetLogConfig(wg *sync.WaitGroup, cmd *exec.Cmd, t model.Task, u model.User) error {
+
+	esChan := make(chan string, 1)
+	esClientStr := viper.GetString("setting.esClient")
+	spiderLogIndex := viper.GetString("setting.spiderLogIndex")
	// get stdout reader
	stdout, err := cmd.StdoutPipe()
	readerStdout := bufio.NewReader(stdout)
@@ -191,7 +195,9 @@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error {
	isStderrFinished := false

	// periodically (1 sec) insert log items
+	wg.Add(3)
	go func() {
+		defer wg.Done()
		for {
			_ = model.AddLogItems(logs)
			logs = []model.LogItem{}
@@ -211,6 +217,7 @@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error {

	// read stdout
	go func() {
+		defer wg.Done()
		for {
			line, err := readerStdout.ReadString('\n')
			if err != nil {
@@ -227,12 +234,18 @@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error {
				Ts: time.Now(),
				ExpireTs: time.Now().Add(time.Duration(expireDuration) * time.Second),
			}
+			esChan <- l.Message
+			if esClientStr != "" {
+				go database.WriteMsgToES(time.Now(), esChan, spiderLogIndex)
+			}
+
			logs = append(logs, l)
		}
	}()

	// read stderr
	go func() {
+		defer wg.Done()
		for {
			line, err := readerStderr.ReadString('\n')
			if err != nil {
@@ -249,10 +262,15 @@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error {
				Ts: time.Now(),
				ExpireTs: time.Now().Add(time.Duration(expireDuration) * time.Second),
			}
+			esChan <- l.Message
+			if esClientStr != "" {
+				go database.WriteMsgToES(time.Now(), esChan, spiderLogIndex)
+			}
			logs = append(logs, l)
		}
	}()

+	wg.Wait()
	return nil
}

@@ -337,6 +355,8 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider, u
	log.Infof("cwd: %s", cwd)
	log.Infof("cmd: %s", cmdStr)

+	wg := &sync.WaitGroup{}
+
	// 生成执行命令
	var cmd *exec.Cmd
	if runtime.GOOS == constants.Windows {
@@ -349,9 +369,7 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider, u
	cmd.Dir = cwd

	// 日志配置
-	if err := SetLogConfig(cmd, t, u); err != nil {
-		return err
-	}
+	go SetLogConfig(wg, cmd, t, u)

	// 环境变量配置
	envs := s.Envs
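The services/task.go hunks above tie the three logging goroutines in SetLogConfig (the periodic flusher, the stdout reader, and the stderr reader) to a sync.WaitGroup: wg.Add(3) is called once, each goroutine runs defer wg.Done(), and wg.Wait() keeps SetLogConfig from returning until all three finish, which is why ExecuteShellCmd now launches it with `go` instead of calling it inline. Below is a minimal, self-contained sketch of that Add/Done/Wait pattern; the worker names are illustrative and are not Crawlab code.

```go
package main

import (
	"fmt"
	"sync"
)

// pump stands in for one of SetLogConfig's goroutines: it signals its
// completion via defer wg.Done().
func pump(wg *sync.WaitGroup, name string) {
	defer wg.Done()
	fmt.Println(name, "finished")
}

func main() {
	wg := &sync.WaitGroup{}

	// One Add(3) up front for the three workers, as in the diff.
	wg.Add(3)
	go pump(wg, "log flusher")
	go pump(wg, "stdout reader")
	go pump(wg, "stderr reader")

	// Wait blocks until all three have called Done; in SetLogConfig this
	// keeps the function running for the lifetime of the child process's
	// pipes, hence the `go SetLogConfig(...)` call in ExecuteShellCmd.
	wg.Wait()
}
```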
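Each log line is also handed to the Elasticsearch writer through esChan, a buffered channel of capacity 1. In Go, a send on a buffered channel only proceeds while there is free buffer space or a waiting receiver, so the channel's consumer has to keep draining it for the stdout/stderr readers to keep moving. A standalone sketch of that behaviour (not Crawlab code):

```go
package main

import "fmt"

func main() {
	ch := make(chan string, 1) // same capacity as esChan in the diff

	ch <- "first log line" // fits in the buffer, does not block
	fmt.Println(<-ch)      // a receive frees the buffer again

	// Without that receive, a second send would block the producer here;
	// in SetLogConfig, draining esChan is the role played by the
	// WriteMsgToES goroutine once an Elasticsearch client is configured.
	ch <- "second log line"
	fmt.Println(<-ch)
}
```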
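database.WriteMsgToES itself is not part of this diff, so its body and exact behaviour are not shown here. The sketch below is only an assumption of what such a writer could look like using the github.com/olivere/elastic/v7 client added in go.mod/go.sum: it takes one message off the channel and indexes it into the configured index. The document field names and the per-call client construction are illustrative, not the project's actual implementation.

```go
package database

import (
	"context"
	"time"

	elastic "github.com/olivere/elastic/v7"
	"github.com/spf13/viper"
)

// WriteMsgToES is a hypothetical sketch: it consumes a single log line from
// msgChan and indexes it as one document, mirroring how SetLogConfig spawns
// one writer goroutine per line when setting.esClient is configured.
func WriteMsgToES(ts time.Time, msgChan chan string, index string) error {
	// Building a client per call keeps the sketch self-contained; a real
	// implementation would presumably reuse a shared client.
	client, err := elastic.NewClient(
		elastic.SetURL(viper.GetString("setting.esClient")),
		elastic.SetSniff(false),
	)
	if err != nil {
		return err
	}

	msg := <-msgChan // the line pushed by SetLogConfig just before the call

	doc := map[string]interface{}{ // field names are illustrative
		"ts":      ts,
		"message": msg,
	}
	_, err = client.Index().Index(index).BodyJson(doc).Do(context.Background())
	return err
}
```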