From 14159c46a5ef5e4e777dab8ba404ad7b527b8fd6 Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 27 Apr 2020 15:59:06 +0800 Subject: [PATCH 1/4] support send log to ES --- .gitignore | 1 + backend/app/spiders/amazon_config/Spiderfile | 51 ++++++++ .../amazon_config/config_spider/__init__.py | 0 .../amazon_config/config_spider/items.py | 20 ++++ .../config_spider/middlewares.py | 103 ++++++++++++++++ .../amazon_config/config_spider/pipelines.py | 27 +++++ .../amazon_config/config_spider/settings.py | 111 ++++++++++++++++++ .../config_spider/spiders/__init__.py | 4 + .../config_spider/spiders/spider.py | 37 ++++++ backend/app/spiders/amazon_config/md5.txt | 1 + backend/app/spiders/amazon_config/scrapy.cfg | 11 ++ .../app/spiders/autohome_config/Spiderfile | 57 +++++++++ .../autohome_config/config_spider/__init__.py | 0 .../autohome_config/config_spider/items.py | 21 ++++ .../config_spider/middlewares.py | 103 ++++++++++++++++ .../config_spider/pipelines.py | 27 +++++ .../autohome_config/config_spider/settings.py | 111 ++++++++++++++++++ .../config_spider/spiders/__init__.py | 4 + .../config_spider/spiders/spider.py | 38 ++++++ backend/app/spiders/autohome_config/md5.txt | 1 + .../app/spiders/autohome_config/scrapy.cfg | 11 ++ backend/app/spiders/baidu_config/Spiderfile | 39 ++++++ .../baidu_config/config_spider/__init__.py | 0 .../baidu_config/config_spider/items.py | 18 +++ .../baidu_config/config_spider/middlewares.py | 103 ++++++++++++++++ .../baidu_config/config_spider/pipelines.py | 27 +++++ .../baidu_config/config_spider/settings.py | 111 ++++++++++++++++++ .../config_spider/spiders/__init__.py | 4 + .../config_spider/spiders/spider.py | 35 ++++++ backend/app/spiders/baidu_config/md5.txt | 1 + backend/app/spiders/baidu_config/scrapy.cfg | 11 ++ backend/app/spiders/bing_general/Spiderfile | 6 + .../app/spiders/bing_general/bing_spider.py | 41 +++++++ backend/app/spiders/bing_general/md5.txt | 1 + backend/app/spiders/chinaz/Spiderfile | 5 + backend/app/spiders/chinaz/chinaz/__init__.py | 0 backend/app/spiders/chinaz/chinaz/items.py | 21 ++++ .../app/spiders/chinaz/chinaz/middlewares.py | 103 ++++++++++++++++ .../app/spiders/chinaz/chinaz/pipelines.py | 7 ++ backend/app/spiders/chinaz/chinaz/settings.py | 90 ++++++++++++++ .../spiders/chinaz/chinaz/spiders/__init__.py | 4 + .../chinaz/chinaz/spiders/chinaz_spider.py | 63 ++++++++++ backend/app/spiders/chinaz/md5.txt | 1 + backend/app/spiders/chinaz/scrapy.cfg | 11 ++ backend/app/spiders/csdn_config/Spiderfile | 60 ++++++++++ .../csdn_config/config_spider/__init__.py | 0 .../csdn_config/config_spider/items.py | 20 ++++ .../csdn_config/config_spider/middlewares.py | 103 ++++++++++++++++ .../csdn_config/config_spider/pipelines.py | 27 +++++ .../csdn_config/config_spider/settings.py | 111 ++++++++++++++++++ .../config_spider/spiders/__init__.py | 4 + .../config_spider/spiders/spider.py | 41 +++++++ backend/app/spiders/csdn_config/md5.txt | 1 + backend/app/spiders/csdn_config/scrapy.cfg | 11 ++ backend/app/spiders/douban_config/Spiderfile | 57 +++++++++ .../douban_config/config_spider/__init__.py | 0 .../douban_config/config_spider/items.py | 21 ++++ .../config_spider/middlewares.py | 103 ++++++++++++++++ .../douban_config/config_spider/pipelines.py | 27 +++++ .../douban_config/config_spider/settings.py | 111 ++++++++++++++++++ .../config_spider/spiders/__init__.py | 4 + .../config_spider/spiders/spider.py | 36 ++++++ backend/app/spiders/douban_config/md5.txt | 1 + backend/app/spiders/douban_config/scrapy.cfg | 11 ++ 
backend/app/spiders/jd/Spiderfile | 5 + backend/app/spiders/jd/jd/__init__.py | 0 backend/app/spiders/jd/jd/items.py | 15 +++ backend/app/spiders/jd/jd/middlewares.py | 103 ++++++++++++++++ backend/app/spiders/jd/jd/pipelines.py | 6 + backend/app/spiders/jd/jd/settings.py | 90 ++++++++++++++ backend/app/spiders/jd/jd/spiders/__init__.py | 4 + .../app/spiders/jd/jd/spiders/jd_spider.py | 21 ++++ backend/app/spiders/jd/md5.txt | 1 + backend/app/spiders/jd/scrapy.cfg | 11 ++ backend/app/spiders/sinastock/Spiderfile | 5 + backend/app/spiders/sinastock/md5.txt | 1 + backend/app/spiders/sinastock/scrapy.cfg | 11 ++ .../spiders/sinastock/sinastock/__init__.py | 0 .../app/spiders/sinastock/sinastock/items.py | 21 ++++ .../sinastock/sinastock/middlewares.py | 103 ++++++++++++++++ .../spiders/sinastock/sinastock/pipelines.py | 6 + .../spiders/sinastock/sinastock/settings.py | 89 ++++++++++++++ .../sinastock/sinastock/spiders/__init__.py | 4 + .../sinastock/spiders/sinastock_spider.py | 59 ++++++++++ backend/app/spiders/v2ex_config/Spiderfile | 54 +++++++++ .../v2ex_config/config_spider/__init__.py | 0 .../v2ex_config/config_spider/items.py | 19 +++ .../v2ex_config/config_spider/middlewares.py | 103 ++++++++++++++++ .../v2ex_config/config_spider/pipelines.py | 27 +++++ .../v2ex_config/config_spider/settings.py | 111 ++++++++++++++++++ .../config_spider/spiders/__init__.py | 4 + .../config_spider/spiders/spider.py | 38 ++++++ backend/app/spiders/v2ex_config/md5.txt | 1 + backend/app/spiders/v2ex_config/scrapy.cfg | 11 ++ backend/app/spiders/xueqiu/Spiderfile | 5 + backend/app/spiders/xueqiu/md5.txt | 1 + backend/app/spiders/xueqiu/scrapy.cfg | 11 ++ backend/app/spiders/xueqiu/xueqiu/__init__.py | 0 backend/app/spiders/xueqiu/xueqiu/items.py | 23 ++++ .../app/spiders/xueqiu/xueqiu/middlewares.py | 103 ++++++++++++++++ .../app/spiders/xueqiu/xueqiu/pipelines.py | 6 + backend/app/spiders/xueqiu/xueqiu/settings.py | 89 ++++++++++++++ .../spiders/xueqiu/xueqiu/spiders/__init__.py | 4 + .../xueqiu/xueqiu/spiders/xueqiu_spider.py | 46 ++++++++ backend/app/spiders/xueqiu_config/Spiderfile | 39 ++++++ .../xueqiu_config/config_spider/__init__.py | 0 .../xueqiu_config/config_spider/items.py | 18 +++ .../config_spider/middlewares.py | 103 ++++++++++++++++ .../xueqiu_config/config_spider/pipelines.py | 27 +++++ .../xueqiu_config/config_spider/settings.py | 111 ++++++++++++++++++ .../config_spider/spiders/__init__.py | 4 + .../config_spider/spiders/spider.py | 33 ++++++ backend/app/spiders/xueqiu_config/md5.txt | 1 + backend/app/spiders/xueqiu_config/scrapy.cfg | 11 ++ .../app/spiders/zongheng_config/Spiderfile | 45 +++++++ .../zongheng_config/config_spider/__init__.py | 0 .../zongheng_config/config_spider/items.py | 19 +++ .../config_spider/middlewares.py | 103 ++++++++++++++++ .../config_spider/pipelines.py | 27 +++++ .../zongheng_config/config_spider/settings.py | 111 ++++++++++++++++++ .../config_spider/spiders/__init__.py | 4 + .../config_spider/spiders/spider.py | 34 ++++++ backend/app/spiders/zongheng_config/md5.txt | 1 + .../app/spiders/zongheng_config/scrapy.cfg | 11 ++ backend/conf/config.yml | 2 + backend/config/config.go | 2 + backend/database/es_base.go | 44 +++++++ backend/go.mod | 15 ++- backend/go.sum | 47 ++++++-- backend/model/task.go | 1 + backend/services/task.go | 28 ++++- 131 files changed, 4227 insertions(+), 15 deletions(-) create mode 100755 backend/app/spiders/amazon_config/Spiderfile create mode 100755 backend/app/spiders/amazon_config/config_spider/__init__.py create mode 100755 
backend/app/spiders/amazon_config/config_spider/items.py create mode 100755 backend/app/spiders/amazon_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/amazon_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/amazon_config/config_spider/settings.py create mode 100755 backend/app/spiders/amazon_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/amazon_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/amazon_config/md5.txt create mode 100755 backend/app/spiders/amazon_config/scrapy.cfg create mode 100755 backend/app/spiders/autohome_config/Spiderfile create mode 100755 backend/app/spiders/autohome_config/config_spider/__init__.py create mode 100755 backend/app/spiders/autohome_config/config_spider/items.py create mode 100755 backend/app/spiders/autohome_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/autohome_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/autohome_config/config_spider/settings.py create mode 100755 backend/app/spiders/autohome_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/autohome_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/autohome_config/md5.txt create mode 100755 backend/app/spiders/autohome_config/scrapy.cfg create mode 100755 backend/app/spiders/baidu_config/Spiderfile create mode 100755 backend/app/spiders/baidu_config/config_spider/__init__.py create mode 100755 backend/app/spiders/baidu_config/config_spider/items.py create mode 100755 backend/app/spiders/baidu_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/baidu_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/baidu_config/config_spider/settings.py create mode 100755 backend/app/spiders/baidu_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/baidu_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/baidu_config/md5.txt create mode 100755 backend/app/spiders/baidu_config/scrapy.cfg create mode 100755 backend/app/spiders/bing_general/Spiderfile create mode 100755 backend/app/spiders/bing_general/bing_spider.py create mode 100755 backend/app/spiders/bing_general/md5.txt create mode 100755 backend/app/spiders/chinaz/Spiderfile create mode 100755 backend/app/spiders/chinaz/chinaz/__init__.py create mode 100755 backend/app/spiders/chinaz/chinaz/items.py create mode 100755 backend/app/spiders/chinaz/chinaz/middlewares.py create mode 100755 backend/app/spiders/chinaz/chinaz/pipelines.py create mode 100755 backend/app/spiders/chinaz/chinaz/settings.py create mode 100755 backend/app/spiders/chinaz/chinaz/spiders/__init__.py create mode 100755 backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py create mode 100755 backend/app/spiders/chinaz/md5.txt create mode 100755 backend/app/spiders/chinaz/scrapy.cfg create mode 100755 backend/app/spiders/csdn_config/Spiderfile create mode 100755 backend/app/spiders/csdn_config/config_spider/__init__.py create mode 100755 backend/app/spiders/csdn_config/config_spider/items.py create mode 100755 backend/app/spiders/csdn_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/csdn_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/csdn_config/config_spider/settings.py create mode 100755 backend/app/spiders/csdn_config/config_spider/spiders/__init__.py create mode 100755 
backend/app/spiders/csdn_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/csdn_config/md5.txt create mode 100755 backend/app/spiders/csdn_config/scrapy.cfg create mode 100755 backend/app/spiders/douban_config/Spiderfile create mode 100755 backend/app/spiders/douban_config/config_spider/__init__.py create mode 100755 backend/app/spiders/douban_config/config_spider/items.py create mode 100755 backend/app/spiders/douban_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/douban_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/douban_config/config_spider/settings.py create mode 100755 backend/app/spiders/douban_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/douban_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/douban_config/md5.txt create mode 100755 backend/app/spiders/douban_config/scrapy.cfg create mode 100755 backend/app/spiders/jd/Spiderfile create mode 100755 backend/app/spiders/jd/jd/__init__.py create mode 100755 backend/app/spiders/jd/jd/items.py create mode 100755 backend/app/spiders/jd/jd/middlewares.py create mode 100755 backend/app/spiders/jd/jd/pipelines.py create mode 100755 backend/app/spiders/jd/jd/settings.py create mode 100755 backend/app/spiders/jd/jd/spiders/__init__.py create mode 100755 backend/app/spiders/jd/jd/spiders/jd_spider.py create mode 100755 backend/app/spiders/jd/md5.txt create mode 100755 backend/app/spiders/jd/scrapy.cfg create mode 100755 backend/app/spiders/sinastock/Spiderfile create mode 100755 backend/app/spiders/sinastock/md5.txt create mode 100755 backend/app/spiders/sinastock/scrapy.cfg create mode 100755 backend/app/spiders/sinastock/sinastock/__init__.py create mode 100755 backend/app/spiders/sinastock/sinastock/items.py create mode 100755 backend/app/spiders/sinastock/sinastock/middlewares.py create mode 100755 backend/app/spiders/sinastock/sinastock/pipelines.py create mode 100755 backend/app/spiders/sinastock/sinastock/settings.py create mode 100755 backend/app/spiders/sinastock/sinastock/spiders/__init__.py create mode 100755 backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py create mode 100755 backend/app/spiders/v2ex_config/Spiderfile create mode 100755 backend/app/spiders/v2ex_config/config_spider/__init__.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/items.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/settings.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/v2ex_config/md5.txt create mode 100755 backend/app/spiders/v2ex_config/scrapy.cfg create mode 100755 backend/app/spiders/xueqiu/Spiderfile create mode 100755 backend/app/spiders/xueqiu/md5.txt create mode 100755 backend/app/spiders/xueqiu/scrapy.cfg create mode 100755 backend/app/spiders/xueqiu/xueqiu/__init__.py create mode 100755 backend/app/spiders/xueqiu/xueqiu/items.py create mode 100755 backend/app/spiders/xueqiu/xueqiu/middlewares.py create mode 100755 backend/app/spiders/xueqiu/xueqiu/pipelines.py create mode 100755 backend/app/spiders/xueqiu/xueqiu/settings.py create mode 100755 backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py create mode 100755 
backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py create mode 100755 backend/app/spiders/xueqiu_config/Spiderfile create mode 100755 backend/app/spiders/xueqiu_config/config_spider/__init__.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/items.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/settings.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/xueqiu_config/md5.txt create mode 100755 backend/app/spiders/xueqiu_config/scrapy.cfg create mode 100755 backend/app/spiders/zongheng_config/Spiderfile create mode 100755 backend/app/spiders/zongheng_config/config_spider/__init__.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/items.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/settings.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/zongheng_config/md5.txt create mode 100755 backend/app/spiders/zongheng_config/scrapy.cfg create mode 100644 backend/database/es_base.go diff --git a/.gitignore b/.gitignore index 0b6328c9..6eb0d9f1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .idea/ +.vscode/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/backend/app/spiders/amazon_config/Spiderfile b/backend/app/spiders/amazon_config/Spiderfile new file mode 100755 index 00000000..eea8a538 --- /dev/null +++ b/backend/app/spiders/amazon_config/Spiderfile @@ -0,0 +1,51 @@ +name: "amazon_config" +display_name: "亚马逊中国(可配置)" +remark: "亚马逊中国搜索手机,列表+分页" +type: "configurable" +col: "results_amazon_config" +engine: scrapy +start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2 +start_stage: list +stages: +- name: list + is_list: true + list_css: .s-result-item + list_xpath: "" + page_css: .a-last > a + page_xpath: "" + page_attr: href + fields: + - name: title + css: span.a-text-normal + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: .a-link-normal + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: price + css: "" + xpath: .//*[@class="a-price-whole"] + attr: "" + next_stage: "" + remark: "" + - name: price_fraction + css: "" + xpath: .//*[@class="a-price-fraction"] + attr: "" + next_stage: "" + remark: "" + - name: img + css: .s-image-square-aspect > img + xpath: "" + attr: src + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/amazon_config/config_spider/__init__.py b/backend/app/spiders/amazon_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/amazon_config/config_spider/items.py b/backend/app/spiders/amazon_config/config_spider/items.py new file mode 100755 index 00000000..79bf0adb 
--- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/items.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + price = scrapy.Field() + price_fraction = scrapy.Field() + img = scrapy.Field() + diff --git a/backend/app/spiders/amazon_config/config_spider/middlewares.py b/backend/app/spiders/amazon_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+ + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/amazon_config/config_spider/pipelines.py b/backend/app/spiders/amazon_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/amazon_config/config_spider/settings.py b/backend/app/spiders/amazon_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py b/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/spider.py b/backend/app/spiders/amazon_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..a7421df3 --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/spiders/spider.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.s-result-item'): + item = Item() + item['title'] = elem.css('span.a-text-normal::text').extract_first() + item['url'] = elem.css('.a-link-normal::attr("href")').extract_first() + item['price'] = elem.xpath('string(.//*[@class="a-price-whole"])').extract_first() + item['price_fraction'] = elem.xpath('string(.//*[@class="a-price-fraction"])').extract_first() + item['img'] = elem.css('.s-image-square-aspect > img::attr("src")').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + next_url = response.css('.a-last > a::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + diff --git a/backend/app/spiders/amazon_config/md5.txt b/backend/app/spiders/amazon_config/md5.txt new file mode 100755 index 00000000..52c5423f --- /dev/null +++ b/backend/app/spiders/amazon_config/md5.txt @@ -0,0 +1 @@ +4b716dd3c15b993ccb7a9f0be1cc0de9 diff --git a/backend/app/spiders/amazon_config/scrapy.cfg b/backend/app/spiders/amazon_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/amazon_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/autohome_config/Spiderfile b/backend/app/spiders/autohome_config/Spiderfile new file mode 100755 index 00000000..e69880cb --- /dev/null +++ b/backend/app/spiders/autohome_config/Spiderfile @@ -0,0 +1,57 @@ +name: "autohome_config" +display_name: "汽车之家(可配置)" +remark: "汽车之家文章,列表+详情+分页" +type: "configurable" +col: "results_autohome_config" +engine: scrapy 
+start_url: https://www.autohome.com.cn/all/ +start_stage: list +stages: +- name: list + is_list: true + list_css: ul.article > li + list_xpath: "" + page_css: a.page-item-next + page_xpath: "" + page_attr: href + fields: + - name: title + css: li > a > h3 + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: li > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: li > a > p + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: time + css: li > a .fn-left + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: views + css: li > a .fn-right > em:first-child + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: comments + css: li > a .fn-right > em:last-child + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/autohome_config/config_spider/__init__.py b/backend/app/spiders/autohome_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/autohome_config/config_spider/items.py b/backend/app/spiders/autohome_config/config_spider/items.py new file mode 100755 index 00000000..206203d5 --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + time = scrapy.Field() + views = scrapy.Field() + comments = scrapy.Field() + diff --git a/backend/app/spiders/autohome_config/config_spider/middlewares.py b/backend/app/spiders/autohome_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. 
+ pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/autohome_config/config_spider/pipelines.py b/backend/app/spiders/autohome_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/autohome_config/config_spider/settings.py b/backend/app/spiders/autohome_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only 
settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', 
setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py b/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/spider.py b/backend/app/spiders/autohome_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..83753f5a --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/spiders/spider.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://www.autohome.com.cn/all/', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('ul.article > li'): + item = Item() + item['title'] = elem.css('li > a > h3::text').extract_first() + item['url'] = elem.css('li > a::attr("href")').extract_first() + item['abstract'] = elem.css('li > a > p::text').extract_first() + item['time'] = elem.css('li > a .fn-left::text').extract_first() + item['views'] = elem.css('li > a .fn-right > em:first-child::text').extract_first() + item['comments'] = elem.css('li > a .fn-right > em:last-child::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + next_url = response.css('a.page-item-next::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + diff --git a/backend/app/spiders/autohome_config/md5.txt b/backend/app/spiders/autohome_config/md5.txt new file mode 100755 index 00000000..c4707adf --- /dev/null +++ b/backend/app/spiders/autohome_config/md5.txt @@ -0,0 +1 @@ +d784a11085e298eaf344eadc3a3e9411 diff --git a/backend/app/spiders/autohome_config/scrapy.cfg b/backend/app/spiders/autohome_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/autohome_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/baidu_config/Spiderfile b/backend/app/spiders/baidu_config/Spiderfile new file mode 100755 index 00000000..a29d4acb --- /dev/null +++ b/backend/app/spiders/baidu_config/Spiderfile @@ -0,0 +1,39 @@ +name: "baidu_config" +display_name: "百度搜索(可配置)" +remark: "百度搜索Crawlab,列表+分页" +type: "configurable" +col: "results_baidu_config" 
+engine: scrapy +start_url: http://www.baidu.com/s?wd=crawlab +start_stage: list +stages: +- name: list + is_list: true + list_css: ".result.c-container" + list_xpath: "" + page_css: "a.n" + page_xpath: "" + page_attr: href + fields: + - name: title + css: "" + xpath: .//h3/a + attr: "" + next_stage: "" + remark: "" + - name: url + css: "" + xpath: .//h3/a + attr: href + next_stage: "" + remark: "" + - name: abstract + css: "" + xpath: .//*[@class="c-abstract"] + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/baidu_config/config_spider/__init__.py b/backend/app/spiders/baidu_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/baidu_config/config_spider/items.py b/backend/app/spiders/baidu_config/config_spider/items.py new file mode 100755 index 00000000..9282765f --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/items.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + diff --git a/backend/app/spiders/baidu_config/config_spider/middlewares.py b/backend/app/spiders/baidu_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). 
+ for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/baidu_config/config_spider/pipelines.py b/backend/app/spiders/baidu_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/baidu_config/config_spider/settings.py b/backend/app/spiders/baidu_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py b/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/spider.py b/backend/app/spiders/baidu_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..e5fd793f --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/spiders/spider.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='http://www.baidu.com/s?wd=crawlab', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.result.c-container'): + item = Item() + item['title'] = elem.xpath('string(.//h3/a)').extract_first() + item['url'] = elem.xpath('.//h3/a/@href').extract_first() + item['abstract'] = elem.xpath('string(.//*[@class="c-abstract"])').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + next_url = response.css('a.n::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + diff --git a/backend/app/spiders/baidu_config/md5.txt b/backend/app/spiders/baidu_config/md5.txt new file mode 100755 index 00000000..32137b76 --- /dev/null +++ b/backend/app/spiders/baidu_config/md5.txt @@ -0,0 +1 @@ +ba25f6f3567b256473d3f0ec6af783fd diff --git a/backend/app/spiders/baidu_config/scrapy.cfg b/backend/app/spiders/baidu_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/baidu_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/bing_general/Spiderfile b/backend/app/spiders/bing_general/Spiderfile new file mode 100755 index 00000000..614c135e --- /dev/null +++ b/backend/app/spiders/bing_general/Spiderfile @@ -0,0 +1,6 @@ +name: "bing_general" +display_name: "必应搜索 (通用)" +remark: "必应搜索 Crawlab,列表+分页" +col: "results_bing_general" +type: "customized" +cmd: "python bing_spider.py" \ No newline at end of file diff --git a/backend/app/spiders/bing_general/bing_spider.py b/backend/app/spiders/bing_general/bing_spider.py new file mode 100755 index 00000000..e982e4ee --- /dev/null +++ b/backend/app/spiders/bing_general/bing_spider.py @@ -0,0 +1,41 @@ +import requests +from bs4 import 
BeautifulSoup as bs +from urllib.parse import urljoin, urlparse +import re +from crawlab import save_item + +s = requests.Session() + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +def start_requests(): + for i in range(0, 9): + fr = 'PERE' if not i else 'MORE' + url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}' + request_page(url) + +def request_page(url): + print(f'requesting {url}') + r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}) + parse_list(r) + +def parse_list(response): + soup = bs(response.content.decode('utf-8')) + for el in list(soup.select('#b_results > li')): + try: + save_item({ + 'title': el.select_one('h2').text, + 'url': el.select_one('h2 a').attrs.get('href'), + 'abstract': el.select_one('.b_caption p').text, + }) + except: + pass + +if __name__ == '__main__': + start_requests() \ No newline at end of file diff --git a/backend/app/spiders/bing_general/md5.txt b/backend/app/spiders/bing_general/md5.txt new file mode 100755 index 00000000..42fb6afd --- /dev/null +++ b/backend/app/spiders/bing_general/md5.txt @@ -0,0 +1 @@ +cf295b694a20c99c4857f838aa0402a7 diff --git a/backend/app/spiders/chinaz/Spiderfile b/backend/app/spiders/chinaz/Spiderfile new file mode 100755 index 00000000..2fb940bb --- /dev/null +++ b/backend/app/spiders/chinaz/Spiderfile @@ -0,0 +1,5 @@ +name: "chinaz" +display_name: "站长之家 (Scrapy)" +col: "results_chinaz" +type: "customized" +cmd: "scrapy crawl chinaz_spider" \ No newline at end of file diff --git a/backend/app/spiders/chinaz/chinaz/__init__.py b/backend/app/spiders/chinaz/chinaz/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/chinaz/chinaz/items.py b/backend/app/spiders/chinaz/chinaz/items.py new file mode 100755 index 00000000..1fdcac1b --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class ChinazItem(scrapy.Item): + # define the fields for your item here like: + _id = scrapy.Field() + task_id = scrapy.Field() + name = scrapy.Field() + domain = scrapy.Field() + description = scrapy.Field() + rank = scrapy.Field() + main_category = scrapy.Field() + category = scrapy.Field() + location = scrapy.Field() diff --git a/backend/app/spiders/chinaz/chinaz/middlewares.py b/backend/app/spiders/chinaz/chinaz/middlewares.py new file mode 100755 index 00000000..c98995d5 --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ChinazSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+ s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ChinazDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/chinaz/chinaz/pipelines.py b/backend/app/spiders/chinaz/chinaz/pipelines.py new file mode 100755 index 00000000..b29f9eb7 --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/pipelines.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + diff --git a/backend/app/spiders/chinaz/chinaz/settings.py b/backend/app/spiders/chinaz/chinaz/settings.py new file mode 100755 index 00000000..932ec9ed --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/settings.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for chinaz project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'chinaz' + +SPIDER_MODULES = ['chinaz.spiders'] +NEWSPIDER_MODULE = 'chinaz.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'chinaz (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'chinaz.middlewares.ChinazSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'chinaz.middlewares.ChinazDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests 
Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/chinaz/chinaz/spiders/__init__.py b/backend/app/spiders/chinaz/chinaz/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py b/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py new file mode 100755 index 00000000..28ad84e7 --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +import scrapy +from chinaz.items import ChinazItem + + +class ChinazSpiderSpider(scrapy.Spider): + name = 'chinaz_spider' + allowed_domains = ['chinaz.com'] + start_urls = ['http://top.chinaz.com/hangye/'] + + def parse(self, response): + for item in response.css('.listCentent > li'): + name = item.css('h3.rightTxtHead > a::text').extract_first() + href = item.css('h3.rightTxtHead > a::attr("href")').extract_first() + domain = item.css('h3.rightTxtHead > span::text').extract_first() + description = item.css('p.RtCInfo::text').extract_first() + rank = item.css('.RtCRateCent > strong::text').extract_first() + rank = int(rank) + item = ChinazItem( + _id=domain, + name=name, + domain=domain, + description=description, + rank=rank, + ) + yield scrapy.Request( + url='http://top.chinaz.com' + href, + callback=self.parse_item, + meta={ + 'item': item + } + ) + + # pagination + a_list = response.css('.ListPageWrap > a::attr("href")').extract() + url = 'http://top.chinaz.com/hangye/' + a_list[-1] + yield scrapy.Request(url=url, callback=self.parse) + + def parse_item(self, response): + item = response.meta['item'] + + # category info extraction + arr = response.css('.TopMainTag-show .SimSun') + res1 = arr[0].css('a::text').extract() + main_category = res1[0] + if len(res1) == 1: + category = '其他' + else: + category = res1[1] + + # location info extraction + res2 = arr[1].css('a::text').extract() + if len(res2) > 0: + location = res2[0] + else: + location = '其他' + + # assign values to item + item['main_category'] = main_category + item['category'] = category + item['location'] = location + + yield item diff --git a/backend/app/spiders/chinaz/md5.txt b/backend/app/spiders/chinaz/md5.txt new file mode 100755 index 00000000..f5e15fb9 --- /dev/null +++ b/backend/app/spiders/chinaz/md5.txt @@ -0,0 +1 @@ +1976593e49bf0238602ce35d051bd137 diff --git a/backend/app/spiders/chinaz/scrapy.cfg b/backend/app/spiders/chinaz/scrapy.cfg new file mode 100755 index 00000000..d3b44a1a --- /dev/null +++ b/backend/app/spiders/chinaz/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = chinaz.settings 
+ +[deploy] +#url = http://localhost:6800/ +project = chinaz diff --git a/backend/app/spiders/csdn_config/Spiderfile b/backend/app/spiders/csdn_config/Spiderfile new file mode 100755 index 00000000..67f4f8c5 --- /dev/null +++ b/backend/app/spiders/csdn_config/Spiderfile @@ -0,0 +1,60 @@ +name: "csdn_config" +display_name: "CSDN(可配置)" +remark: "CSDN Crawlab 文章,列表+详情+分页" +type: "configurable" +col: "results_csdn_config" +engine: scrapy +start_url: https://so.csdn.net/so/search/s.do?q=crawlab +start_stage: list +stages: +- name: list + is_list: true + list_css: .search-list-con > .search-list + list_xpath: "" + page_css: a.btn-next + page_xpath: "" + page_attr: href + fields: + - name: url + css: "" + xpath: .//*[@class="limit_width"]/a + attr: href + next_stage: detail + remark: "" +- name: detail + is_list: false + list_css: "" + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: content + css: "" + xpath: .//div[@id="content_views"] + attr: "" + next_stage: "" + remark: "" + - name: views + css: .read-count + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: title + css: .title-article + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: author + css: .follow-nickName + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + AUTOTHROTTLE_ENABLED: "false" + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/csdn_config/config_spider/__init__.py b/backend/app/spiders/csdn_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/csdn_config/config_spider/items.py b/backend/app/spiders/csdn_config/config_spider/items.py new file mode 100755 index 00000000..3c8e5e54 --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/items.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + url = scrapy.Field() + content = scrapy.Field() + views = scrapy.Field() + title = scrapy.Field() + author = scrapy.Field() + diff --git a/backend/app/spiders/csdn_config/config_spider/middlewares.py b/backend/app/spiders/csdn_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. 
+ return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/csdn_config/config_spider/pipelines.py b/backend/app/spiders/csdn_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/csdn_config/config_spider/settings.py b/backend/app/spiders/csdn_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py b/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/spider.py b/backend/app/spiders/csdn_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..9ecc4aae --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/spiders/spider.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://so.csdn.net/so/search/s.do?q=crawlab', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.search-list-con > .search-list'): + item = Item() + item['url'] = elem.xpath('.//*[@class="limit_width"]/a/@href').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) + next_url = response.css('a.btn-next::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + def parse_detail(self, response): + item = Item() if response.meta.get('item') is None else response.meta.get('item') + item['content'] = response.xpath('string(.//div[@id="content_views"])').extract_first() + item['views'] = response.css('.read-count::text').extract_first() + item['title'] = response.css('.title-article::text').extract_first() + item['author'] = response.css('.follow-nickName::text').extract_first() + yield item + + diff --git a/backend/app/spiders/csdn_config/md5.txt b/backend/app/spiders/csdn_config/md5.txt new file mode 100755 index 00000000..e169c42a --- /dev/null +++ b/backend/app/spiders/csdn_config/md5.txt @@ -0,0 +1 @@ +b6889c74e006a5e619b525d84db62ffd diff --git a/backend/app/spiders/csdn_config/scrapy.cfg b/backend/app/spiders/csdn_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/csdn_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/douban_config/Spiderfile b/backend/app/spiders/douban_config/Spiderfile new file mode 100755 index 00000000..84f0647a --- /dev/null +++ b/backend/app/spiders/douban_config/Spiderfile @@ -0,0 +1,57 @@ +name: "douban_config" 
+display_name: "豆瓣读书(可配置)" +remark: "豆瓣读书新书推荐,列表" +type: "configurable" +col: "results_douban_config" +engine: scrapy +start_url: https://book.douban.com/latest +start_stage: list +stages: +- name: list + is_list: true + list_css: ul.cover-col-4 > li + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: title + css: h2 > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: h2 > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: img + css: a.cover img + xpath: "" + attr: src + next_stage: "" + remark: "" + - name: rating + css: p.rating > .color-lightgray + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: abstract + css: p:last-child + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: info + css: .color-gray + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/douban_config/config_spider/__init__.py b/backend/app/spiders/douban_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/douban_config/config_spider/items.py b/backend/app/spiders/douban_config/config_spider/items.py new file mode 100755 index 00000000..d6959b8d --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + img = scrapy.Field() + rating = scrapy.Field() + abstract = scrapy.Field() + info = scrapy.Field() + diff --git a/backend/app/spiders/douban_config/config_spider/middlewares.py b/backend/app/spiders/douban_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. 
+ pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/douban_config/config_spider/pipelines.py b/backend/app/spiders/douban_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/douban_config/config_spider/settings.py b/backend/app/spiders/douban_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings 
considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', 
setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/douban_config/config_spider/spiders/__init__.py b/backend/app/spiders/douban_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/douban_config/config_spider/spiders/spider.py b/backend/app/spiders/douban_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..61bb648d --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/spiders/spider.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://book.douban.com/latest', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('ul.cover-col-4 > li'): + item = Item() + item['title'] = elem.css('h2 > a::text').extract_first() + item['url'] = elem.css('h2 > a::attr("href")').extract_first() + item['img'] = elem.css('a.cover img::attr("src")').extract_first() + item['rating'] = elem.css('p.rating > .color-lightgray::text').extract_first() + item['abstract'] = elem.css('p:last-child::text').extract_first() + item['info'] = elem.css('.color-gray::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + + diff --git a/backend/app/spiders/douban_config/md5.txt b/backend/app/spiders/douban_config/md5.txt new file mode 100755 index 00000000..374e3804 --- /dev/null +++ b/backend/app/spiders/douban_config/md5.txt @@ -0,0 +1 @@ +4d59a6c83b0e125d5321beae86bb93ce diff --git a/backend/app/spiders/douban_config/scrapy.cfg b/backend/app/spiders/douban_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/douban_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/jd/Spiderfile b/backend/app/spiders/jd/Spiderfile new file mode 100755 index 00000000..d090472b --- /dev/null +++ b/backend/app/spiders/jd/Spiderfile @@ -0,0 +1,5 @@ +name: "jd" +display_name: "京东 (Scrapy)" +col: "results_jd" +type: "customized" +cmd: "scrapy crawl jd_spider" \ No newline at end of file diff --git a/backend/app/spiders/jd/jd/__init__.py b/backend/app/spiders/jd/jd/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/jd/jd/items.py b/backend/app/spiders/jd/jd/items.py new file mode 100755 index 00000000..b2c5e647 
--- /dev/null +++ b/backend/app/spiders/jd/jd/items.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class JdItem(scrapy.Item): + # define the fields for your item here like: + name = scrapy.Field() + price = scrapy.Field() + url = scrapy.Field() diff --git a/backend/app/spiders/jd/jd/middlewares.py b/backend/app/spiders/jd/jd/middlewares.py new file mode 100755 index 00000000..6fceded5 --- /dev/null +++ b/backend/app/spiders/jd/jd/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class JdSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class JdDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+ + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/jd/jd/pipelines.py b/backend/app/spiders/jd/jd/pipelines.py new file mode 100755 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/app/spiders/jd/jd/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/jd/jd/settings.py b/backend/app/spiders/jd/jd/settings.py new file mode 100755 index 00000000..ef89ed0c --- /dev/null +++ b/backend/app/spiders/jd/jd/settings.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for jd project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'jd' + +SPIDER_MODULES = ['jd.spiders'] +NEWSPIDER_MODULE = 'jd.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'jd (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'jd.middlewares.JdSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'jd.middlewares.JdDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html 
+#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/jd/jd/spiders/__init__.py b/backend/app/spiders/jd/jd/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/jd/jd/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/jd/jd/spiders/jd_spider.py b/backend/app/spiders/jd/jd/spiders/jd_spider.py new file mode 100755 index 00000000..4ec94fa9 --- /dev/null +++ b/backend/app/spiders/jd/jd/spiders/jd_spider.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +import scrapy + +from jd.items import JdItem + + +class JdSpiderSpider(scrapy.Spider): + name = 'jd_spider' + allowed_domains = ['jd.com'] + + def start_requests(self): + for i in range(1, 50): + yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}') + + def parse(self, response): + for el in response.css('.gl-item'): + yield JdItem( + url=el.css('.p-name > a::attr("href")').extract_first(), + name=el.css('.p-name > a::attr("title")').extract_first(), + price=float(el.css('.p-price i::text').extract_first()), + ) diff --git a/backend/app/spiders/jd/md5.txt b/backend/app/spiders/jd/md5.txt new file mode 100755 index 00000000..dcd53f51 --- /dev/null +++ b/backend/app/spiders/jd/md5.txt @@ -0,0 +1 @@ +621486d31459514eb27a082d159d9b8c diff --git a/backend/app/spiders/jd/scrapy.cfg b/backend/app/spiders/jd/scrapy.cfg new file mode 100755 index 00000000..87cf0280 --- /dev/null +++ b/backend/app/spiders/jd/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = jd.settings + +[deploy] +#url = http://localhost:6800/ +project = jd diff --git a/backend/app/spiders/sinastock/Spiderfile b/backend/app/spiders/sinastock/Spiderfile new file mode 100755 index 00000000..b110cb48 --- /dev/null +++ b/backend/app/spiders/sinastock/Spiderfile @@ -0,0 +1,5 @@ +name: "sinastock" +display_name: "新浪股票 (Scrapy)" +type: "customized" +col: "results_sinastock" +cmd: "scrapy crawl sinastock_spider" \ No newline at end of file diff --git a/backend/app/spiders/sinastock/md5.txt b/backend/app/spiders/sinastock/md5.txt new file mode 100755 index 00000000..1e5d8ab9 --- /dev/null +++ b/backend/app/spiders/sinastock/md5.txt @@ -0,0 +1 @@ +80bc091fa45ef4a85c9f1a66c81a4ed7 diff --git a/backend/app/spiders/sinastock/scrapy.cfg b/backend/app/spiders/sinastock/scrapy.cfg new file mode 100755 index 00000000..4969ad96 --- /dev/null +++ b/backend/app/spiders/sinastock/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: 
scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = sinastock.settings + +[deploy] +#url = http://localhost:6800/ +project = sinastock diff --git a/backend/app/spiders/sinastock/sinastock/__init__.py b/backend/app/spiders/sinastock/sinastock/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/sinastock/sinastock/items.py b/backend/app/spiders/sinastock/sinastock/items.py new file mode 100755 index 00000000..6e3e5d8e --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class NewsItem(scrapy.Item): + # define the fields for your item here like: + _id = scrapy.Field() + title = scrapy.Field() + ts_str = scrapy.Field() + ts = scrapy.Field() + url = scrapy.Field() + text = scrapy.Field() + task_id = scrapy.Field() + source = scrapy.Field() + stocks = scrapy.Field() diff --git a/backend/app/spiders/sinastock/sinastock/middlewares.py b/backend/app/spiders/sinastock/sinastock/middlewares.py new file mode 100755 index 00000000..912b5e57 --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class SinastockSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class SinastockDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+ s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/sinastock/sinastock/pipelines.py b/backend/app/spiders/sinastock/sinastock/pipelines.py new file mode 100755 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/sinastock/sinastock/settings.py b/backend/app/spiders/sinastock/sinastock/settings.py new file mode 100755 index 00000000..3e01d3ca --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/settings.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for sinastock project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'sinastock' + +SPIDER_MODULES = ['sinastock.spiders'] +NEWSPIDER_MODULE = 'sinastock.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +# USER_AGENT = 'sinastock (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'sinastock.middlewares.SinastockSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'sinastock.middlewares.SinastockDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/sinastock/sinastock/spiders/__init__.py b/backend/app/spiders/sinastock/sinastock/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
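The sinastock project above routes its items through 'crawlab.pipelines.CrawlabMongoPipeline' (see ITEM_PIPELINES in its settings.py). That pipeline ships with the Crawlab Python SDK and is not part of this patch; the sketch below only illustrates the pattern it follows, modeled on the ConfigSpiderPipeline that the *_config projects in this patch define: open a MongoDB connection from the CRAWLAB_MONGO_* environment variables, stamp each item with CRAWLAB_TASK_ID, and write it into CRAWLAB_COLLECTION. The class name and the use of insert_one() are assumptions for illustration, not the SDK's actual code.

    # Illustrative sketch only -- not the real crawlab.pipelines.CrawlabMongoPipeline.
    import os
    from pymongo import MongoClient

    class ExampleCrawlabMongoPipeline(object):
        def __init__(self):
            # Connection details come from the CRAWLAB_* environment variables,
            # mirroring the ConfigSpiderPipeline added elsewhere in this patch.
            mongo = MongoClient(
                host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
                port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017),
            )
            db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
            self.col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
            self.task_id = os.environ.get('CRAWLAB_TASK_ID')

        def process_item(self, item, spider):
            # Tag the item with the running task, persist it, then pass it on.
            item['task_id'] = self.task_id
            self.col.insert_one(dict(item))
            return item

Enabling such a pipeline looks exactly like the settings.py above: register the class path in ITEM_PIPELINES with a priority such as 300.
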
diff --git a/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py b/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py new file mode 100755 index 00000000..54daf763 --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +import os +import re +from datetime import datetime + +import scrapy +from pymongo import MongoClient + +from sinastock.items import NewsItem + +class SinastockSpiderSpider(scrapy.Spider): + name = 'sinastock_spider' + allowed_domains = ['finance.sina.com.cn'] + mongo = MongoClient( + host=os.environ.get('MONGO_HOST') or 'localhost', + port=int(os.environ.get('MONGO_PORT') or 27017) + ) + db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test'] + col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news') + + def start_requests(self): + col = self.db['stocks'] + for s in col.find({}): + code, ex = s['ts_code'].split('.') + for i in range(10): + url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={ex.lower()}{code}&Page={i + 1}' + yield scrapy.Request( + url=url, + callback=self.parse, + meta={'ts_code': s['ts_code']} + ) + + def parse(self, response): + for a in response.css('.datelist > ul > a'): + url = a.css('a::attr("href")').extract_first() + item = NewsItem( + title=a.css('a::text').extract_first(), + url=url, + source='sina', + stocks=[response.meta['ts_code']] + ) + yield scrapy.Request( + url=url, + callback=self.parse_detail, + meta={'item': item} + ) + + def parse_detail(self, response): + item = response.meta['item'] + text = response.css('#artibody').extract_first() + pre = re.compile('>(.*?)<') + text = ''.join(pre.findall(text)) + item['text'] = text.replace('\u3000', '') + item['ts_str'] = response.css('.date::text').extract_first() + if item['text'] is None or item['ts_str'] is None: + pass + else: + item['ts'] = datetime.strptime(item['ts_str'], '%Y年%m月%d日 %H:%M') + yield item diff --git a/backend/app/spiders/v2ex_config/Spiderfile b/backend/app/spiders/v2ex_config/Spiderfile new file mode 100755 index 00000000..bb18d40a --- /dev/null +++ b/backend/app/spiders/v2ex_config/Spiderfile @@ -0,0 +1,54 @@ +name: "v2ex_config" +display_name: "V2ex(可配置)" +remark: "V2ex,列表+详情" +type: "configurable" +col: "results_v2ex_config" +engine: scrapy +start_url: https://v2ex.com/ +start_stage: list +stages: +- name: list + is_list: true + list_css: .cell.item + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: href + fields: + - name: title + css: a.topic-link + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: a.topic-link + xpath: "" + attr: href + next_stage: detail + remark: "" + - name: replies + css: .count_livid + xpath: "" + attr: "" + next_stage: "" + remark: "" +- name: detail + is_list: false + list_css: "" + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: content + css: "" + xpath: .//*[@class="markdown_body"] + attr: "" + next_stage: "" + remark: "" +settings: + AUTOTHROTTLE_ENABLED: "true" + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/v2ex_config/config_spider/__init__.py b/backend/app/spiders/v2ex_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/v2ex_config/config_spider/items.py 
b/backend/app/spiders/v2ex_config/config_spider/items.py new file mode 100755 index 00000000..d2c01a06 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/items.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + replies = scrapy.Field() + content = scrapy.Field() + diff --git a/backend/app/spiders/v2ex_config/config_spider/middlewares.py b/backend/app/spiders/v2ex_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+ + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/v2ex_config/config_spider/pipelines.py b/backend/app/spiders/v2ex_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/v2ex_config/config_spider/settings.py b/backend/app/spiders/v2ex_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py b/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py b/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..4763e040 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://v2ex.com/', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.cell.item'): + item = Item() + item['title'] = elem.css('a.topic-link::text').extract_first() + item['url'] = elem.css('a.topic-link::attr("href")').extract_first() + item['replies'] = elem.css('.count_livid::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) + + def parse_detail(self, response): + item = Item() if response.meta.get('item') is None else response.meta.get('item') + item['content'] = response.xpath('string(.//*[@class="markdown_body"])').extract_first() + yield item + + diff --git a/backend/app/spiders/v2ex_config/md5.txt b/backend/app/spiders/v2ex_config/md5.txt new file mode 100755 index 00000000..5d725b2c --- /dev/null +++ b/backend/app/spiders/v2ex_config/md5.txt @@ -0,0 +1 @@ +402c0a07873ef74b9b574bc0f6b28423 diff --git a/backend/app/spiders/v2ex_config/scrapy.cfg b/backend/app/spiders/v2ex_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/v2ex_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/xueqiu/Spiderfile b/backend/app/spiders/xueqiu/Spiderfile new file mode 100755 index 00000000..38aa5dbe --- /dev/null +++ b/backend/app/spiders/xueqiu/Spiderfile @@ -0,0 +1,5 @@ +name: "xueqiu" +display_name: "雪球网 (Scrapy)" +type: "customized" +col: "results_xueqiu" +cmd: "scrapy crawl xueqiu_spider" \ No newline at end of file diff --git a/backend/app/spiders/xueqiu/md5.txt b/backend/app/spiders/xueqiu/md5.txt new file mode 100755 index 00000000..6a9a2072 --- /dev/null +++ b/backend/app/spiders/xueqiu/md5.txt @@ -0,0 +1 @@ 
+df177994199caa691d87fc0c5031326d diff --git a/backend/app/spiders/xueqiu/scrapy.cfg b/backend/app/spiders/xueqiu/scrapy.cfg new file mode 100755 index 00000000..2c5ce3b3 --- /dev/null +++ b/backend/app/spiders/xueqiu/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = xueqiu.settings + +[deploy] +#url = http://localhost:6800/ +project = xueqiu diff --git a/backend/app/spiders/xueqiu/xueqiu/__init__.py b/backend/app/spiders/xueqiu/xueqiu/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/xueqiu/xueqiu/items.py b/backend/app/spiders/xueqiu/xueqiu/items.py new file mode 100755 index 00000000..5471594d --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/items.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class XueqiuItem(scrapy.Item): + # define the fields for your item here like: + _id = scrapy.Field() + task_id = scrapy.Field() + id = scrapy.Field() + text = scrapy.Field() + url = scrapy.Field() + target = scrapy.Field() + view_count = scrapy.Field() + mark = scrapy.Field() + created_at = scrapy.Field() + ts = scrapy.Field() + source = scrapy.Field() diff --git a/backend/app/spiders/xueqiu/xueqiu/middlewares.py b/backend/app/spiders/xueqiu/xueqiu/middlewares.py new file mode 100755 index 00000000..f60102ce --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class XueqiuSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class XueqiuDownloaderMiddleware(object): + # Not all methods need to be defined. 
If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu/xueqiu/pipelines.py b/backend/app/spiders/xueqiu/xueqiu/pipelines.py new file mode 100755 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/xueqiu/xueqiu/settings.py b/backend/app/spiders/xueqiu/xueqiu/settings.py new file mode 100755 index 00000000..1d898e2f --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/settings.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for xueqiu project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'xueqiu' + +SPIDER_MODULES = ['xueqiu.spiders'] +NEWSPIDER_MODULE = 'xueqiu.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'xueqiu.middlewares.XueqiuSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'xueqiu.middlewares.XueqiuDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py b/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
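The configurable projects in this patch (v2ex_config above, xueqiu_config and zongheng_config below) are parameterised at run time: each config_spider/settings.py ends with a loop that promotes any CRAWLAB_SETTING_<NAME> environment variable to a Scrapy setting, converting 'true'/'false' to booleans, digit-only strings to integers, and JSON-looking values via json.loads. The standalone sketch below restates that conversion as a function so it can be read in isolation; the function name, and the assumption that Crawlab exports each Spiderfile settings: block as such variables, are illustrative rather than taken from this patch.

    # Standalone restatement of the CRAWLAB_SETTING_* conversion used by the
    # config_spider settings.py files in this patch (illustrative sketch).
    import json
    import os
    import re

    def load_crawlab_settings(environ=os.environ):
        settings = {}
        for env_name, raw in environ.items():
            if not env_name.startswith('CRAWLAB_SETTING_'):
                continue
            name = env_name.replace('CRAWLAB_SETTING_', '')
            if raw.lower() == 'true':
                value = True
            elif raw.lower() == 'false':
                value = False
            elif re.search(r'^\d+$', raw):
                value = int(raw)
            elif re.search(r'^\{.*\}$', raw.strip()) or re.search(r'^\[.*\]$', raw.strip()):
                value = json.loads(raw)
            else:
                value = raw
            settings[name] = value
        return settings

    # e.g. CRAWLAB_SETTING_ROBOTSTXT_OBEY=false  ->  {'ROBOTSTXT_OBEY': False}

So a Spiderfile entry such as ROBOTSTXT_OBEY: "false" presumably reaches Scrapy through this path and overrides the default hard-coded in settings.py.
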
diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py b/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py new file mode 100755 index 00000000..a746e156 --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +import json +from datetime import datetime +from time import sleep + +import scrapy + +from xueqiu.items import XueqiuItem + + +class XueqiuSpiderSpider(scrapy.Spider): + name = 'xueqiu_spider' + allowed_domains = ['xueqiu.com'] + + def start_requests(self): + return [scrapy.Request( + url='https://xueqiu.com', + callback=self.parse_home + )] + + def parse_home(self, response): + yield scrapy.Request( + url='https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=20&category=6' + ) + + def parse(self, response): + data = json.loads(response.body) + next_max_id = data.get('next_max_id') + sleep(1) + for row in data.get('list'): + d = json.loads(row.get('data')) + item = XueqiuItem( + id=d['id'], + text=d['text'], + mark=d['mark'], + url=d['target'], + created_at=d['created_at'], + ts=datetime.fromtimestamp(d['created_at'] / 1e3), + view_count=d['view_count'], + source='xueqiu' + ) + yield item + + yield scrapy.Request( + url=f'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id={next_max_id}&count=20&category=6' + ) diff --git a/backend/app/spiders/xueqiu_config/Spiderfile b/backend/app/spiders/xueqiu_config/Spiderfile new file mode 100755 index 00000000..0de50e9e --- /dev/null +++ b/backend/app/spiders/xueqiu_config/Spiderfile @@ -0,0 +1,39 @@ +name: "xueqiu_config" +display_name: "雪球网(可配置)" +remark: "雪球网新闻,列表" +type: "configurable" +col: "results_xueqiu_config" +engine: scrapy +start_url: https://xueqiu.com/ +start_stage: list +stages: +- name: list + is_list: true + list_css: "" + list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")] + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: title + css: h3 > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: h3 > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: p + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/xueqiu_config/config_spider/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/xueqiu_config/config_spider/items.py b/backend/app/spiders/xueqiu_config/config_spider/items.py new file mode 100755 index 00000000..9282765f --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/items.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + diff --git a/backend/app/spiders/xueqiu_config/config_spider/middlewares.py b/backend/app/spiders/xueqiu_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# 
Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu_config/config_spider/pipelines.py b/backend/app/spiders/xueqiu_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/xueqiu_config/config_spider/settings.py b/backend/app/spiders/xueqiu_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..79d4636b --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://xueqiu.com/', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.xpath('.//*[contains(@class, "AnonymousHome_home__timeline__item")]'): + item = Item() + item['title'] = elem.css('h3 > a::text').extract_first() + item['url'] = elem.css('h3 > a::attr("href")').extract_first() + item['abstract'] = elem.css('p::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + + diff --git a/backend/app/spiders/xueqiu_config/md5.txt b/backend/app/spiders/xueqiu_config/md5.txt new file mode 100755 index 00000000..39a6df77 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/md5.txt @@ -0,0 +1 @@ +e3da3aacb2d290cb179a79028fbfff9c diff --git a/backend/app/spiders/xueqiu_config/scrapy.cfg b/backend/app/spiders/xueqiu_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/zongheng_config/Spiderfile b/backend/app/spiders/zongheng_config/Spiderfile new file mode 100755 index 00000000..0163fac7 --- /dev/null +++ b/backend/app/spiders/zongheng_config/Spiderfile @@ -0,0 +1,45 @@ +name: "zongheng_config" +display_name: "纵横(可配置)" +remark: "纵横小说网,列表" +type: "configurable" +col: "results_zongheng_config" +engine: scrapy +start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1 +start_stage: list +stages: +- name: list + is_list: true + list_css: .rank_d_list + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: href + fields: + - name: title + css: .rank_d_b_name > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: .rank_d_b_name > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: body + xpath: "" + attr: "" + 
next_stage: "" + remark: "" + - name: votes + css: .rank_d_b_ticket + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/zongheng_config/config_spider/__init__.py b/backend/app/spiders/zongheng_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/zongheng_config/config_spider/items.py b/backend/app/spiders/zongheng_config/config_spider/items.py new file mode 100755 index 00000000..528c3187 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/items.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + votes = scrapy.Field() + diff --git a/backend/app/spiders/zongheng_config/config_spider/middlewares.py b/backend/app/spiders/zongheng_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+ s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/zongheng_config/config_spider/pipelines.py b/backend/app/spiders/zongheng_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/zongheng_config/config_spider/settings.py b/backend/app/spiders/zongheng_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py b/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py b/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..cf1b6a08 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='http://www.zongheng.com/rank/details.html?rt=1&d=1', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.rank_d_list'): + item = Item() + item['title'] = elem.css('.rank_d_b_name > a::text').extract_first() + item['url'] = elem.css('.rank_d_b_name > a::attr("href")').extract_first() + item['abstract'] = elem.css('body::text').extract_first() + item['votes'] = elem.css('.rank_d_b_ticket::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + + diff --git a/backend/app/spiders/zongheng_config/md5.txt b/backend/app/spiders/zongheng_config/md5.txt new file mode 100755 index 00000000..46fd3de6 --- /dev/null +++ b/backend/app/spiders/zongheng_config/md5.txt @@ -0,0 +1 @@ +82cb98a6103fb878501df81f191703ba diff --git a/backend/app/spiders/zongheng_config/scrapy.cfg b/backend/app/spiders/zongheng_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/zongheng_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/conf/config.yml b/backend/conf/config.yml index 17341e95..1c2c8507 100644 --- a/backend/conf/config.yml +++ b/backend/conf/config.yml @@ -46,6 +46,8 @@ setting: demoSpiders: "N" checkScrapy: "Y" autoInstall: "Y" + esClient: "" # Your ES client, for example, http://192.168.1.1:9200 or http://your-domain.com, if not use es, set empty + spiderLogIndex: "spider-log" # Index pattern for kibana, need to config on kibana notification: mail: server: '' diff --git a/backend/config/config.go b/backend/config/config.go index e4c4616c..79be808e 100644 --- a/backend/config/config.go +++ b/backend/config/config.go @@ -53,3 +53,5 @@ func InitConfig(cfg string) error { return nil } + + diff --git a/backend/database/es_base.go 
b/backend/database/es_base.go new file mode 100644 index 00000000..b255958a --- /dev/null +++ b/backend/database/es_base.go @@ -0,0 +1,44 @@ +package database + +import ( + "context" + "github.com/apex/log" + "github.com/olivere/elastic/v7" + "github.com/satori/go.uuid" + "github.com/spf13/viper" + "sync" + "time" +) + +var doOnce sync.Once +var ctx context.Context +var ESClient *elastic.Client + +func InitEsClient() { + esClientStr := viper.GetString("setting.esClient") + ctx = context.Background() + ESClient, _ = elastic.NewClient(elastic.SetURL(esClientStr), elastic.SetSniff(false)) +} + +// WriteMsg will write the msg and level into es +func WriteMsgToES(when time.Time, msg chan string, index string) { + doOnce.Do(InitEsClient) + vals := make(map[string]interface{}) + vals["@timestamp"] = when.Format(time.RFC3339) + for { + select { + case vals["@msg"] = <-msg: + uid := uuid.NewV4().String() + _, err := ESClient.Index().Index(index).Id(uid).BodyJson(vals).Refresh("wait_for").Do(ctx) + if err != nil { + log.Error(err.Error()) + log.Error("send msg log to es error") + return + } + case <-time.After(6 * time.Second): + return + } + } + + return +} diff --git a/backend/go.mod b/backend/go.mod index d91a1a84..7503389a 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -3,7 +3,10 @@ module crawlab go 1.12 require ( + github.com/Masterminds/semver v1.4.2 // indirect + github.com/Masterminds/sprig v2.16.0+incompatible // indirect github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd + github.com/aokoli/goutils v1.0.1 // indirect github.com/apex/log v1.1.1 github.com/dgrijalva/jwt-go v3.2.0+incompatible github.com/fsnotify/fsnotify v1.4.7 @@ -12,15 +15,21 @@ require ( github.com/go-playground/locales v0.12.1 // indirect github.com/go-playground/universal-translator v0.16.0 // indirect github.com/gomodule/redigo v2.0.0+incompatible + github.com/huandu/xstrings v1.2.0 // indirect + github.com/imdario/mergo v0.3.6 // indirect github.com/imroc/req v0.2.4 + github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0 // indirect github.com/leodido/go-urn v1.1.0 // indirect github.com/matcornic/hermes v1.2.0 - github.com/matcornic/hermes/v2 v2.0.2 // indirect - github.com/pkg/errors v0.8.1 - github.com/royeo/dingrobot v1.0.0 // indirect + github.com/mattn/go-runewidth v0.0.3 // indirect + github.com/olekukonko/tablewriter v0.0.1 // indirect + github.com/olivere/elastic/v7 v7.0.14 + github.com/pkg/errors v0.9.1 github.com/satori/go.uuid v1.2.0 + github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337 github.com/spf13/viper v1.4.0 + github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect gopkg.in/go-playground/validator.v9 v9.29.1 gopkg.in/gomail.v2 v2.0.0-20150902115704-41f357289737 diff --git a/backend/go.sum b/backend/go.sum index 463abbee..1a253f5d 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -8,9 +8,11 @@ github.com/Masterminds/sprig v2.16.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuN github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd h1:+CYOsXi89xOqBkj7CuEJjA2It+j+R3ngUZEydr6mtkw= github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd/go.mod h1:wngxua9XCNjvHjDiTiV26DaKDT+0c63QR6H5hjVUUxw= +github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7 h1:uSoVVbwJiQipAclBbw+8quDsfcvFjOpI5iCf4p/cqCs= 
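The new backend/database/es_base.go above lazily initialises a shared olivere/elastic/v7 client from setting.esClient and then drains a channel of log lines into the index named by the caller, storing each line as a document with "@timestamp" and "@msg" fields and returning after six idle seconds. The sketch below shows one way a caller might drive that writer; it is illustrative only — the import path crawlab/database is assumed from the `module crawlab` line in backend/go.mod, and the endpoint and index values are placeholders for setting.esClient and setting.spiderLogIndex from backend/conf/config.yml.

package main

import (
	"time"

	"github.com/spf13/viper"

	"crawlab/database" // import path assumed from `module crawlab` in backend/go.mod
)

func main() {
	// Placeholder values; in Crawlab these come from setting.esClient and
	// setting.spiderLogIndex in backend/conf/config.yml.
	viper.Set("setting.esClient", "http://localhost:9200")
	index := "spider-log"

	// WriteMsgToES lazily initialises the shared client (doOnce.Do(InitEsClient)),
	// then indexes every line received on the channel into the given index,
	// returning after six idle seconds or on an indexing error.
	msgs := make(chan string, 1)
	done := make(chan struct{})
	go func() {
		database.WriteMsgToES(time.Now(), msgs, index)
		close(done)
	}()

	msgs <- "task 42: crawled 10 items"
	<-done // unblocks once the writer hits its idle timeout
}

Because the writer exits on that idle timeout rather than living for the whole task, the task service changes later in this patch start it from the stdout/stderr readers instead of keeping one long-lived consumer.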
github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7/go.mod h1:6zEj6s6u/ghQa61ZWa/C2Aw3RkjiTBOix7dkqa1VLIs= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 h1:kFOfPq6dUM1hTo4JG6LR5AXSUEsOjtdm0kw0FtQtMJA= github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c= github.com/aokoli/goutils v1.0.1 h1:7fpzNGoJ3VA8qcrm++XEE1QUe0mIwNeLa02Nwq7RDkg= github.com/aokoli/goutils v1.0.1/go.mod h1:SijmP0QR8LtwsmDs8Yii5Z/S4trXFGFC2oO5g9DP+DQ= @@ -19,8 +21,10 @@ github.com/apex/log v1.1.1/go.mod h1:Ls949n1HFtXfbDcjiTTFQqkVUrte0puoIBfO3SVgwOA github.com/aphistic/golf v0.0.0-20180712155816-02c07f170c5a/go.mod h1:3NqKYiepwy8kCu4PNA+aP7WUV72eXWJeP9/r3/K9aLE= github.com/aphistic/sweet v0.2.0/go.mod h1:fWDlIh/isSE9n6EPsRmC0det+whmX6dJid3stzu0Xys= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/aws/aws-sdk-go v1.20.6/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= +github.com/aws/aws-sdk-go v1.30.7/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0= github.com/aybabtme/rgbterm v0.0.0-20170906152045-cc83f3b3ce59/go.mod h1:q/89r3U2H7sSsE2t6Kca0lfwTK8JdoNGS/yzM/4iH5I= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= @@ -41,7 +45,10 @@ github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8 github.com/emirpasic/gods v1.12.0 h1:QAUIPSaCu4G+POclxeqb3F+WPpdKqFGlw36+yOzGlrg= github.com/emirpasic/gods v1.12.0/go.mod h1:YfzfFFoVP/catgzJb4IKIqXjX78Ha8FMSDh3ymbK86o= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 h1:BHsljHzVlRcyQhjrss6TZTdY2VfCqZPbv5k3iBFa2ZQ= github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc= +github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= +github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= @@ -49,6 +56,7 @@ github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3 h1:t8FVkw33L+wilf2 github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= github.com/gin-gonic/gin v1.4.0 h1:3tMoCCfM7ppqsR0ptz/wi1impNpT7/9wQtMZ8lr1mCQ= github.com/gin-gonic/gin v1.4.0/go.mod h1:OW2EZn3DO8Ln9oIKOvM++LBO+5UPHJJDH72/q/3rZdM= +github.com/gliderlabs/ssh v0.2.2 h1:6zsha5zo/TWhRhwqCD3+EarCAgZ2yN28ipRnGPnwkI0= github.com/gliderlabs/ssh v0.2.2/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= github.com/globalsign/mgo v0.0.0-20181015135952-eeefdecb41b8 
h1:DujepqpGd1hyOd7aW59XpK7Qymp8iy83xq74fLr21is= github.com/globalsign/mgo v0.0.0-20181015135952-eeefdecb41b8/go.mod h1:xkRDCp4j0OGD1HRkm4kmhM+pmpv3AKq5SU7GMg4oO/Q= @@ -59,11 +67,13 @@ github.com/go-playground/locales v0.12.1 h1:2FITxuFt/xuCNP1Acdhv62OzaCiviiE4kotf github.com/go-playground/locales v0.12.1/go.mod h1:IUMDtCfWo/w/mtMfIE/IG2K+Ey3ygWanZIBtBW0W2TM= github.com/go-playground/universal-translator v0.16.0 h1:X++omBR/4cE2MNg91AoC3rmGrCjJ8eAeUP/K/EKx4DM= github.com/go-playground/universal-translator v0.16.0/go.mod h1:1AnU7NaIRDWWzGEKwgtJRd2xk99HeFyHw3yid4rvQIY= +github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= @@ -72,8 +82,10 @@ github.com/gomodule/redigo v2.0.0+incompatible h1:K/R+8tc58AaqLkqG2Ol3Qk+DR/TlNu github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/uuid v1.1.1 h1:Gkbcsh/GbpXz7lPftLA3P6TYMwjCLYm83jiFQZF/3gY= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= @@ -97,6 +109,7 @@ github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOl github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo= github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= +github.com/jmespath/go-jmespath v0.3.0/go.mod h1:9QtRXoHjLGCJ5IBSaohpXITPlowMeeYCZ7fLUTSywik= github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= github.com/jpillora/backoff v0.0.0-20180909062703-3050d21c67d7/go.mod h1:2iMrUgbbvHEiQClaW2NsSzMyGHqN+rDFqY705q49KG0= github.com/json-iterator/go v1.1.6 h1:MrUvLMLTMxbqFJ9kzlvat/rYZqZnW3u4wkLzWTaFwKs= @@ -120,9 +133,10 @@ github.com/leodido/go-urn v1.1.0 h1:Sm1gr51B1kKyfD2BlRcLSiEkffoG96g6TPv6eRoEiB8= 
github.com/leodido/go-urn v1.1.0/go.mod h1:+cyI34gQWZcE1eQU7NVgKkkzdXDQHr1dBMtdAPozLkw= github.com/magiconair/properties v1.8.0 h1:LLgXmsheXeRoUOBOjtwPQCWIYqM/LU1ayDtDePerRcY= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= +github.com/mailru/easyjson v0.7.1 h1:mdxE1MF9o53iCb2Ghj1VfWvh7ZOwHpnVG/xwXrV90U8= +github.com/mailru/easyjson v0.7.1/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs= github.com/matcornic/hermes v1.2.0 h1:AuqZpYcTOtTB7cahdevLfnhIpfzmpqw5Czv8vpdnFDU= github.com/matcornic/hermes v1.2.0/go.mod h1:lujJomb016Xjv8wBnWlNvUdtmvowjjfkqri5J/+1hYc= -github.com/matcornic/hermes/v2 v2.0.2/go.mod h1:iVsJWSIS4NtMNtgan22sy6lt7pImok7bATGPWCoaKNY= github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= @@ -145,14 +159,19 @@ github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRW github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/olekukonko/tablewriter v0.0.1 h1:b3iUnf1v+ppJiOfNX4yxxqfWKMQPZR5yoh8urCTFX88= github.com/olekukonko/tablewriter v0.0.1/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo= +github.com/olivere/elastic/v7 v7.0.14 h1:89dYPg6kD3WJx42ZtO4U6WDIzRy69FvQqz/yRiwekuM= +github.com/olivere/elastic/v7 v7.0.14/go.mod h1:+FgncZ8ho1QF3NlBo77XbuoTKYHhvEOfFZKIAfHnnDE= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/gomega v1.5.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= +github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/pelletier/go-buffruneio v0.2.0/go.mod h1:JkE26KsDizTr40EUHkXVtNPvgGtbSNq5BcowyYOWdKo= github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= @@ -166,9 +185,6 @@ github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7z github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/fastuuid v1.1.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= -github.com/royeo/dingrobot v1.0.0 h1:K4GrF+fOecNX0yi+oBKpfh7z0XP/8TzaIIHu1B2kKUQ= -github.com/royeo/dingrobot v1.0.0/go.mod h1:RqDM8E/hySCVwI2aUFRJAUGDcHHRnIhzNmbNG3bamQs= -github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= github.com/satori/go.uuid v1.2.0/go.mod 
h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ= @@ -205,6 +221,9 @@ github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoH github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/tj/assert v0.0.0-20171129193455-018094318fb0/go.mod h1:mZ9/Rh9oLWpLLDRpvE+3b7gP/C2YyLFYxNmcLnPTMe0= github.com/tj/go-elastic v0.0.0-20171221160941-36157cbbebc2/go.mod h1:WjeM0Oo1eNAjXGDx2yma7uG2XoyRZTq1uv3M/o7imD0= github.com/tj/go-kinesis v0.0.0-20171128231115-08b17f58cb1b/go.mod h1:/yhzCV0xPfx6jb1bBgRFjl5lytqVqZXEaeqWP8lTEao= @@ -217,24 +236,27 @@ github.com/xanzy/ssh-agent v0.2.1/go.mod h1:mLlQY/MoOhWBj+gOGMQkOeiEvkx+8pJSI+0B github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= +go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= -golang.org/x/crypto v0.0.0-20181029175232-7e6ffbd03851/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190219172222-a4c6cb3142f2/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734 h1:p/H982KKEjUnLJkM3tt/LemDnOc1GiZL5FCVlORJ5zo= golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 h1:HuIa8hRrWRSrqYzx1qI49NNxhdi2PrY7gxVSq1JjLDc= golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181029044818-c44066c5c816/go.mod 
h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= @@ -243,10 +265,13 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwL golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190724013045-ca1201d0de80 h1:Ao/3l156eZf2AW5wK8a7/smtodRU+gha3+BeqJ69lRk= golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -258,6 +283,7 @@ golang.org/x/sys v0.0.0-20190221075227-b4e8571b14e0/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e h1:D5TXcfTk7xF7hvieo4QErS3qqCB4teTffacDWr7CI+0= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= @@ -268,12 +294,18 @@ golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxb golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools 
v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190729092621-ff9f1409240a/go.mod h1:jcCCGcm9btYwXyDqrUWc6MKQKKGJCWEQ3AfLSRIbEuI= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.0/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc h1:2gGKlE2+asNV9m7xrywl36YYNnBG5ZQ0r/BOOxqPpmk= @@ -295,6 +327,7 @@ gopkg.in/russross/blackfriday.v2 v2.0.0 h1:+FlnIV8DSQnT7NZ43hcVKcdJdzZoeCmJj4Ql8 gopkg.in/russross/blackfriday.v2 v2.0.0/go.mod h1:6sSBNz/GtOm/pJTuh5UmBK2ZHfmnxGbl2NZg1UliSOI= gopkg.in/src-d/go-billy.v4 v4.3.2 h1:0SQA1pRztfTFx2miS8sA97XvooFeNOmvUenF4o0EcVg= gopkg.in/src-d/go-billy.v4 v4.3.2/go.mod h1:nDjArDMp+XMs1aFAESLRjfGSgfvoYN0hDfzEk0GjC98= +gopkg.in/src-d/go-git-fixtures.v3 v3.5.0 h1:ivZFOIltbce2Mo8IjzUHAFoq/IylO9WHhNOAJK+LsJg= gopkg.in/src-d/go-git-fixtures.v3 v3.5.0/go.mod h1:dLBcvytrw/TYZsNTWCnkNF2DSIlzWYqTe3rJR56Ac7g= gopkg.in/src-d/go-git.v4 v4.13.1 h1:SRtFyV8Kxc0UP7aCHcijOMQGPxHSmMOPrzulQWolkYE= gopkg.in/src-d/go-git.v4 v4.13.1/go.mod h1:nx5NYcxdKxq5fpltdHnPa2Exj4Sx0EclMWZQbYDu2z8= diff --git a/backend/model/task.go b/backend/model/task.go index 35e738ab..0b2ed0a9 100644 --- a/backend/model/task.go +++ b/backend/model/task.go @@ -508,3 +508,4 @@ func UpdateTaskErrorLogs(taskId string, errorRegexPattern string) error { return nil } + diff --git a/backend/services/task.go b/backend/services/task.go index a0bb9a49..16278d24 100644 --- a/backend/services/task.go +++ b/backend/services/task.go @@ -16,7 +16,7 @@ import ( "github.com/apex/log" "github.com/globalsign/mgo/bson" "github.com/imroc/req" - uuid "github.com/satori/go.uuid" + "github.com/satori/go.uuid" "github.com/spf13/viper" "net/http" "os" @@ -166,7 +166,11 @@ func SetEnv(cmd *exec.Cmd, envs []model.Env, task model.Task, spider model.Spide return cmd } -func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error { +func SetLogConfig(wg *sync.WaitGroup, cmd *exec.Cmd, t model.Task, u model.User) error { + + esChan := make(chan string, 1) + esClientStr := viper.GetString("setting.esClient") + spiderLogIndex := viper.GetString("setting.spiderLogIndex") // get stdout reader stdout, err := cmd.StdoutPipe() readerStdout := bufio.NewReader(stdout) @@ -191,7 +195,9 @@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error { isStderrFinished := false // periodically (1 sec) insert log items + wg.Add(3) go func() { + defer wg.Done() for { _ = model.AddLogItems(logs) logs = []model.LogItem{} @@ -211,6 +217,7 
@@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error { // read stdout go func() { + defer wg.Done() for { line, err := readerStdout.ReadString('\n') if err != nil { @@ -227,12 +234,18 @@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error { Ts: time.Now(), ExpireTs: time.Now().Add(time.Duration(expireDuration) * time.Second), } + esChan <- l.Message + if esClientStr != "" { + go database.WriteMsgToES(time.Now(), esChan, spiderLogIndex) + } + logs = append(logs, l) } }() // read stderr go func() { + defer wg.Done() for { line, err := readerStderr.ReadString('\n') if err != nil { @@ -249,10 +262,15 @@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error { Ts: time.Now(), ExpireTs: time.Now().Add(time.Duration(expireDuration) * time.Second), } + esChan <- l.Message + if esClientStr != "" { + go database.WriteMsgToES(time.Now(), esChan, spiderLogIndex) + } logs = append(logs, l) } }() + wg.Wait() return nil } @@ -337,6 +355,8 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider, u log.Infof("cwd: %s", cwd) log.Infof("cmd: %s", cmdStr) + wg := &sync.WaitGroup{} + // 生成执行命令 var cmd *exec.Cmd if runtime.GOOS == constants.Windows { @@ -349,9 +369,7 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider, u cmd.Dir = cwd // 日志配置 - if err := SetLogConfig(cmd, t, u); err != nil { - return err - } + go SetLogConfig(wg, cmd, t, u) // 环境变量配置 envs := s.Envs From 2858a48f9a99377b4c45fceac6582323a5b3297e Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 27 Apr 2020 16:01:24 +0800 Subject: [PATCH 2/4] support send log to ES --- backend/app/spiders/amazon_config/Spiderfile | 51 -------- .../amazon_config/config_spider/__init__.py | 0 .../amazon_config/config_spider/items.py | 20 ---- .../config_spider/middlewares.py | 103 ---------------- .../amazon_config/config_spider/pipelines.py | 27 ----- .../amazon_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 37 ------ backend/app/spiders/amazon_config/md5.txt | 1 - backend/app/spiders/amazon_config/scrapy.cfg | 11 -- .../app/spiders/autohome_config/Spiderfile | 57 --------- .../autohome_config/config_spider/__init__.py | 0 .../autohome_config/config_spider/items.py | 21 ---- .../config_spider/middlewares.py | 103 ---------------- .../config_spider/pipelines.py | 27 ----- .../autohome_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 38 ------ backend/app/spiders/autohome_config/md5.txt | 1 - .../app/spiders/autohome_config/scrapy.cfg | 11 -- backend/app/spiders/baidu_config/Spiderfile | 39 ------ .../baidu_config/config_spider/__init__.py | 0 .../baidu_config/config_spider/items.py | 18 --- .../baidu_config/config_spider/middlewares.py | 103 ---------------- .../baidu_config/config_spider/pipelines.py | 27 ----- .../baidu_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 35 ------ backend/app/spiders/baidu_config/md5.txt | 1 - backend/app/spiders/baidu_config/scrapy.cfg | 11 -- backend/app/spiders/bing_general/Spiderfile | 6 - .../app/spiders/bing_general/bing_spider.py | 41 ------- backend/app/spiders/bing_general/md5.txt | 1 - backend/app/spiders/chinaz/Spiderfile | 5 - backend/app/spiders/chinaz/chinaz/__init__.py | 0 backend/app/spiders/chinaz/chinaz/items.py | 21 ---- 
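The task.go hunks in the first patch above extend SetLogConfig so that every line read from the spider's stdout and stderr is buffered for MongoDB (flushed via model.AddLogItems) and, when setting.esClient is non-empty, also pushed through a buffered channel to database.WriteMsgToES, with a sync.WaitGroup keeping the reader goroutines alive now that ExecuteShellCmd launches the logging setup asynchronously. The following condensed sketch illustrates that per-line flow; it is not a copy of the patched function — logItem, indexLine and the sample input are stand-ins, and the channel send here sits inside the ES check so this standalone example cannot stall when ES is disabled (the patch performs the send before that check).

package main

import (
	"bufio"
	"fmt"
	"strings"
	"sync"
	"time"
)

// logItem stands in for model.LogItem; only the fields relevant here are kept.
type logItem struct {
	Message string
	Ts      time.Time
}

// indexLine stands in for database.WriteMsgToES: it drains the channel and
// gives up after six idle seconds, mirroring the timeout in es_base.go.
func indexLine(msgs chan string, index string) {
	for {
		select {
		case m := <-msgs:
			fmt.Printf("-> %s: %s\n", index, m)
		case <-time.After(6 * time.Second):
			return
		}
	}
}

func main() {
	// Stand-in for the command's stdout pipe read in SetLogConfig.
	reader := bufio.NewReader(strings.NewReader("crawled item 1\ncrawled item 2\n"))

	esEnabled := true // viper.GetString("setting.esClient") != "" in the real code
	esChan := make(chan string, 1)
	var logs []logItem
	var wg sync.WaitGroup

	for {
		line, err := reader.ReadString('\n')
		if err != nil {
			break // EOF: the spider process has finished
		}
		l := logItem{Message: strings.TrimSpace(line), Ts: time.Now()}
		if esEnabled {
			esChan <- l.Message
			wg.Add(1)
			go func() { // one short-lived indexer per line, as in the patched SetLogConfig
				defer wg.Done()
				indexLine(esChan, "spider-log")
			}()
		}
		logs = append(logs, l) // periodically persisted to MongoDB in the real code
	}

	wg.Wait() // each indexer returns after its idle timeout
	fmt.Println("collected", len(logs), "log items")
}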
.../app/spiders/chinaz/chinaz/middlewares.py | 103 ---------------- .../app/spiders/chinaz/chinaz/pipelines.py | 7 -- backend/app/spiders/chinaz/chinaz/settings.py | 90 -------------- .../spiders/chinaz/chinaz/spiders/__init__.py | 4 - .../chinaz/chinaz/spiders/chinaz_spider.py | 63 ---------- backend/app/spiders/chinaz/md5.txt | 1 - backend/app/spiders/chinaz/scrapy.cfg | 11 -- backend/app/spiders/csdn_config/Spiderfile | 60 ---------- .../csdn_config/config_spider/__init__.py | 0 .../csdn_config/config_spider/items.py | 20 ---- .../csdn_config/config_spider/middlewares.py | 103 ---------------- .../csdn_config/config_spider/pipelines.py | 27 ----- .../csdn_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 41 ------- backend/app/spiders/csdn_config/md5.txt | 1 - backend/app/spiders/csdn_config/scrapy.cfg | 11 -- backend/app/spiders/douban_config/Spiderfile | 57 --------- .../douban_config/config_spider/__init__.py | 0 .../douban_config/config_spider/items.py | 21 ---- .../config_spider/middlewares.py | 103 ---------------- .../douban_config/config_spider/pipelines.py | 27 ----- .../douban_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 36 ------ backend/app/spiders/douban_config/md5.txt | 1 - backend/app/spiders/douban_config/scrapy.cfg | 11 -- backend/app/spiders/jd/Spiderfile | 5 - backend/app/spiders/jd/jd/__init__.py | 0 backend/app/spiders/jd/jd/items.py | 15 --- backend/app/spiders/jd/jd/middlewares.py | 103 ---------------- backend/app/spiders/jd/jd/pipelines.py | 6 - backend/app/spiders/jd/jd/settings.py | 90 -------------- backend/app/spiders/jd/jd/spiders/__init__.py | 4 - .../app/spiders/jd/jd/spiders/jd_spider.py | 21 ---- backend/app/spiders/jd/md5.txt | 1 - backend/app/spiders/jd/scrapy.cfg | 11 -- backend/app/spiders/sinastock/Spiderfile | 5 - backend/app/spiders/sinastock/md5.txt | 1 - backend/app/spiders/sinastock/scrapy.cfg | 11 -- .../spiders/sinastock/sinastock/__init__.py | 0 .../app/spiders/sinastock/sinastock/items.py | 21 ---- .../sinastock/sinastock/middlewares.py | 103 ---------------- .../spiders/sinastock/sinastock/pipelines.py | 6 - .../spiders/sinastock/sinastock/settings.py | 89 -------------- .../sinastock/sinastock/spiders/__init__.py | 4 - .../sinastock/spiders/sinastock_spider.py | 59 ---------- backend/app/spiders/v2ex_config/Spiderfile | 54 --------- .../v2ex_config/config_spider/__init__.py | 0 .../v2ex_config/config_spider/items.py | 19 --- .../v2ex_config/config_spider/middlewares.py | 103 ---------------- .../v2ex_config/config_spider/pipelines.py | 27 ----- .../v2ex_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 38 ------ backend/app/spiders/v2ex_config/md5.txt | 1 - backend/app/spiders/v2ex_config/scrapy.cfg | 11 -- backend/app/spiders/xueqiu/Spiderfile | 5 - backend/app/spiders/xueqiu/md5.txt | 1 - backend/app/spiders/xueqiu/scrapy.cfg | 11 -- backend/app/spiders/xueqiu/xueqiu/__init__.py | 0 backend/app/spiders/xueqiu/xueqiu/items.py | 23 ---- .../app/spiders/xueqiu/xueqiu/middlewares.py | 103 ---------------- .../app/spiders/xueqiu/xueqiu/pipelines.py | 6 - backend/app/spiders/xueqiu/xueqiu/settings.py | 89 -------------- .../spiders/xueqiu/xueqiu/spiders/__init__.py | 4 - .../xueqiu/xueqiu/spiders/xueqiu_spider.py | 46 -------- backend/app/spiders/xueqiu_config/Spiderfile 
| 39 ------ .../xueqiu_config/config_spider/__init__.py | 0 .../xueqiu_config/config_spider/items.py | 18 --- .../config_spider/middlewares.py | 103 ---------------- .../xueqiu_config/config_spider/pipelines.py | 27 ----- .../xueqiu_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 33 ------ backend/app/spiders/xueqiu_config/md5.txt | 1 - backend/app/spiders/xueqiu_config/scrapy.cfg | 11 -- .../app/spiders/zongheng_config/Spiderfile | 45 ------- .../zongheng_config/config_spider/__init__.py | 0 .../zongheng_config/config_spider/items.py | 19 --- .../config_spider/middlewares.py | 103 ---------------- .../config_spider/pipelines.py | 27 ----- .../zongheng_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 34 ------ backend/app/spiders/zongheng_config/md5.txt | 1 - .../app/spiders/zongheng_config/scrapy.cfg | 11 -- 123 files changed, 4102 deletions(-) delete mode 100755 backend/app/spiders/amazon_config/Spiderfile delete mode 100755 backend/app/spiders/amazon_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/items.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/settings.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/amazon_config/md5.txt delete mode 100755 backend/app/spiders/amazon_config/scrapy.cfg delete mode 100755 backend/app/spiders/autohome_config/Spiderfile delete mode 100755 backend/app/spiders/autohome_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/items.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/settings.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/autohome_config/md5.txt delete mode 100755 backend/app/spiders/autohome_config/scrapy.cfg delete mode 100755 backend/app/spiders/baidu_config/Spiderfile delete mode 100755 backend/app/spiders/baidu_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/items.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/settings.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/baidu_config/md5.txt delete mode 100755 backend/app/spiders/baidu_config/scrapy.cfg delete mode 100755 backend/app/spiders/bing_general/Spiderfile delete mode 100755 backend/app/spiders/bing_general/bing_spider.py delete mode 100755 backend/app/spiders/bing_general/md5.txt delete mode 100755 
backend/app/spiders/chinaz/Spiderfile delete mode 100755 backend/app/spiders/chinaz/chinaz/__init__.py delete mode 100755 backend/app/spiders/chinaz/chinaz/items.py delete mode 100755 backend/app/spiders/chinaz/chinaz/middlewares.py delete mode 100755 backend/app/spiders/chinaz/chinaz/pipelines.py delete mode 100755 backend/app/spiders/chinaz/chinaz/settings.py delete mode 100755 backend/app/spiders/chinaz/chinaz/spiders/__init__.py delete mode 100755 backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py delete mode 100755 backend/app/spiders/chinaz/md5.txt delete mode 100755 backend/app/spiders/chinaz/scrapy.cfg delete mode 100755 backend/app/spiders/csdn_config/Spiderfile delete mode 100755 backend/app/spiders/csdn_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/items.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/settings.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/csdn_config/md5.txt delete mode 100755 backend/app/spiders/csdn_config/scrapy.cfg delete mode 100755 backend/app/spiders/douban_config/Spiderfile delete mode 100755 backend/app/spiders/douban_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/douban_config/config_spider/items.py delete mode 100755 backend/app/spiders/douban_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/douban_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/douban_config/config_spider/settings.py delete mode 100755 backend/app/spiders/douban_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/douban_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/douban_config/md5.txt delete mode 100755 backend/app/spiders/douban_config/scrapy.cfg delete mode 100755 backend/app/spiders/jd/Spiderfile delete mode 100755 backend/app/spiders/jd/jd/__init__.py delete mode 100755 backend/app/spiders/jd/jd/items.py delete mode 100755 backend/app/spiders/jd/jd/middlewares.py delete mode 100755 backend/app/spiders/jd/jd/pipelines.py delete mode 100755 backend/app/spiders/jd/jd/settings.py delete mode 100755 backend/app/spiders/jd/jd/spiders/__init__.py delete mode 100755 backend/app/spiders/jd/jd/spiders/jd_spider.py delete mode 100755 backend/app/spiders/jd/md5.txt delete mode 100755 backend/app/spiders/jd/scrapy.cfg delete mode 100755 backend/app/spiders/sinastock/Spiderfile delete mode 100755 backend/app/spiders/sinastock/md5.txt delete mode 100755 backend/app/spiders/sinastock/scrapy.cfg delete mode 100755 backend/app/spiders/sinastock/sinastock/__init__.py delete mode 100755 backend/app/spiders/sinastock/sinastock/items.py delete mode 100755 backend/app/spiders/sinastock/sinastock/middlewares.py delete mode 100755 backend/app/spiders/sinastock/sinastock/pipelines.py delete mode 100755 backend/app/spiders/sinastock/sinastock/settings.py delete mode 100755 backend/app/spiders/sinastock/sinastock/spiders/__init__.py delete mode 100755 backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py delete mode 100755 backend/app/spiders/v2ex_config/Spiderfile delete mode 100755 backend/app/spiders/v2ex_config/config_spider/__init__.py delete mode 100755 
backend/app/spiders/v2ex_config/config_spider/items.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/settings.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/v2ex_config/md5.txt delete mode 100755 backend/app/spiders/v2ex_config/scrapy.cfg delete mode 100755 backend/app/spiders/xueqiu/Spiderfile delete mode 100755 backend/app/spiders/xueqiu/md5.txt delete mode 100755 backend/app/spiders/xueqiu/scrapy.cfg delete mode 100755 backend/app/spiders/xueqiu/xueqiu/__init__.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/items.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/middlewares.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/pipelines.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/settings.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py delete mode 100755 backend/app/spiders/xueqiu_config/Spiderfile delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/items.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/settings.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/xueqiu_config/md5.txt delete mode 100755 backend/app/spiders/xueqiu_config/scrapy.cfg delete mode 100755 backend/app/spiders/zongheng_config/Spiderfile delete mode 100755 backend/app/spiders/zongheng_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/items.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/settings.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/zongheng_config/md5.txt delete mode 100755 backend/app/spiders/zongheng_config/scrapy.cfg diff --git a/backend/app/spiders/amazon_config/Spiderfile b/backend/app/spiders/amazon_config/Spiderfile deleted file mode 100755 index eea8a538..00000000 --- a/backend/app/spiders/amazon_config/Spiderfile +++ /dev/null @@ -1,51 +0,0 @@ -name: "amazon_config" -display_name: "亚马逊中国(可配置)" -remark: "亚马逊中国搜索手机,列表+分页" -type: "configurable" -col: "results_amazon_config" -engine: scrapy -start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2 -start_stage: list -stages: -- name: list - is_list: true - list_css: .s-result-item - list_xpath: "" - page_css: .a-last > a - page_xpath: "" - page_attr: href - fields: - - name: title - css: span.a-text-normal - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - 
css: .a-link-normal - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: price - css: "" - xpath: .//*[@class="a-price-whole"] - attr: "" - next_stage: "" - remark: "" - - name: price_fraction - css: "" - xpath: .//*[@class="a-price-fraction"] - attr: "" - next_stage: "" - remark: "" - - name: img - css: .s-image-square-aspect > img - xpath: "" - attr: src - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/amazon_config/config_spider/__init__.py b/backend/app/spiders/amazon_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/amazon_config/config_spider/items.py b/backend/app/spiders/amazon_config/config_spider/items.py deleted file mode 100755 index 79bf0adb..00000000 --- a/backend/app/spiders/amazon_config/config_spider/items.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - price = scrapy.Field() - price_fraction = scrapy.Field() - img = scrapy.Field() - diff --git a/backend/app/spiders/amazon_config/config_spider/middlewares.py b/backend/app/spiders/amazon_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/amazon_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). 
- for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/amazon_config/config_spider/pipelines.py b/backend/app/spiders/amazon_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/amazon_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/amazon_config/config_spider/settings.py b/backend/app/spiders/amazon_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/amazon_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py b/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/spider.py b/backend/app/spiders/amazon_config/config_spider/spiders/spider.py deleted file mode 100755 index a7421df3..00000000 --- a/backend/app/spiders/amazon_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.s-result-item'): - item = Item() - item['title'] = elem.css('span.a-text-normal::text').extract_first() - item['url'] = elem.css('.a-link-normal::attr("href")').extract_first() - item['price'] = elem.xpath('string(.//*[@class="a-price-whole"])').extract_first() - item['price_fraction'] = elem.xpath('string(.//*[@class="a-price-fraction"])').extract_first() - item['img'] = elem.css('.s-image-square-aspect > img::attr("src")').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - next_url = response.css('.a-last > a::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - diff --git a/backend/app/spiders/amazon_config/md5.txt b/backend/app/spiders/amazon_config/md5.txt deleted file mode 100755 index 52c5423f..00000000 --- a/backend/app/spiders/amazon_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -4b716dd3c15b993ccb7a9f0be1cc0de9 diff --git a/backend/app/spiders/amazon_config/scrapy.cfg b/backend/app/spiders/amazon_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/amazon_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/autohome_config/Spiderfile b/backend/app/spiders/autohome_config/Spiderfile deleted file mode 100755 index e69880cb..00000000 --- a/backend/app/spiders/autohome_config/Spiderfile +++ /dev/null @@ -1,57 +0,0 @@ -name: "autohome_config" -display_name: "汽车之家(可配置)" -remark: "汽车之家文章,列表+详情+分页" -type: "configurable" -col: 
"results_autohome_config" -engine: scrapy -start_url: https://www.autohome.com.cn/all/ -start_stage: list -stages: -- name: list - is_list: true - list_css: ul.article > li - list_xpath: "" - page_css: a.page-item-next - page_xpath: "" - page_attr: href - fields: - - name: title - css: li > a > h3 - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: li > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: abstract - css: li > a > p - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: time - css: li > a .fn-left - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: views - css: li > a .fn-right > em:first-child - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: comments - css: li > a .fn-right > em:last-child - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/autohome_config/config_spider/__init__.py b/backend/app/spiders/autohome_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/autohome_config/config_spider/items.py b/backend/app/spiders/autohome_config/config_spider/items.py deleted file mode 100755 index 206203d5..00000000 --- a/backend/app/spiders/autohome_config/config_spider/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - time = scrapy.Field() - views = scrapy.Field() - comments = scrapy.Field() - diff --git a/backend/app/spiders/autohome_config/config_spider/middlewares.py b/backend/app/spiders/autohome_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/autohome_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. 
- pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/autohome_config/config_spider/pipelines.py b/backend/app/spiders/autohome_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/autohome_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/autohome_config/config_spider/settings.py b/backend/app/spiders/autohome_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/autohome_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains 
only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', 
setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py b/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/spider.py b/backend/app/spiders/autohome_config/config_spider/spiders/spider.py deleted file mode 100755 index 83753f5a..00000000 --- a/backend/app/spiders/autohome_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://www.autohome.com.cn/all/', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('ul.article > li'): - item = Item() - item['title'] = elem.css('li > a > h3::text').extract_first() - item['url'] = elem.css('li > a::attr("href")').extract_first() - item['abstract'] = elem.css('li > a > p::text').extract_first() - item['time'] = elem.css('li > a .fn-left::text').extract_first() - item['views'] = elem.css('li > a .fn-right > em:first-child::text').extract_first() - item['comments'] = elem.css('li > a .fn-right > em:last-child::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - next_url = response.css('a.page-item-next::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - diff --git a/backend/app/spiders/autohome_config/md5.txt b/backend/app/spiders/autohome_config/md5.txt deleted file mode 100755 index c4707adf..00000000 --- a/backend/app/spiders/autohome_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -d784a11085e298eaf344eadc3a3e9411 diff --git a/backend/app/spiders/autohome_config/scrapy.cfg b/backend/app/spiders/autohome_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/autohome_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/baidu_config/Spiderfile b/backend/app/spiders/baidu_config/Spiderfile deleted file mode 100755 index a29d4acb..00000000 --- a/backend/app/spiders/baidu_config/Spiderfile +++ /dev/null @@ -1,39 +0,0 @@ -name: "baidu_config" -display_name: "百度搜索(可配置)" -remark: "百度搜索Crawlab,列表+分页" -type: "configurable" -col: 
"results_baidu_config" -engine: scrapy -start_url: http://www.baidu.com/s?wd=crawlab -start_stage: list -stages: -- name: list - is_list: true - list_css: ".result.c-container" - list_xpath: "" - page_css: "a.n" - page_xpath: "" - page_attr: href - fields: - - name: title - css: "" - xpath: .//h3/a - attr: "" - next_stage: "" - remark: "" - - name: url - css: "" - xpath: .//h3/a - attr: href - next_stage: "" - remark: "" - - name: abstract - css: "" - xpath: .//*[@class="c-abstract"] - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/baidu_config/config_spider/__init__.py b/backend/app/spiders/baidu_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/baidu_config/config_spider/items.py b/backend/app/spiders/baidu_config/config_spider/items.py deleted file mode 100755 index 9282765f..00000000 --- a/backend/app/spiders/baidu_config/config_spider/items.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - diff --git a/backend/app/spiders/baidu_config/config_spider/middlewares.py b/backend/app/spiders/baidu_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/baidu_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). 
- for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/baidu_config/config_spider/pipelines.py b/backend/app/spiders/baidu_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/baidu_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/baidu_config/config_spider/settings.py b/backend/app/spiders/baidu_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/baidu_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py b/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/spider.py b/backend/app/spiders/baidu_config/config_spider/spiders/spider.py deleted file mode 100755 index e5fd793f..00000000 --- a/backend/app/spiders/baidu_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='http://www.baidu.com/s?wd=crawlab', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.result.c-container'): - item = Item() - item['title'] = elem.xpath('string(.//h3/a)').extract_first() - item['url'] = elem.xpath('.//h3/a/@href').extract_first() - item['abstract'] = elem.xpath('string(.//*[@class="c-abstract"])').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - next_url = response.css('a.n::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - diff --git a/backend/app/spiders/baidu_config/md5.txt b/backend/app/spiders/baidu_config/md5.txt deleted file mode 100755 index 32137b76..00000000 --- a/backend/app/spiders/baidu_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -ba25f6f3567b256473d3f0ec6af783fd diff --git a/backend/app/spiders/baidu_config/scrapy.cfg b/backend/app/spiders/baidu_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/baidu_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/bing_general/Spiderfile b/backend/app/spiders/bing_general/Spiderfile deleted file mode 100755 index 614c135e..00000000 --- a/backend/app/spiders/bing_general/Spiderfile +++ /dev/null @@ -1,6 +0,0 @@ -name: "bing_general" -display_name: "必应搜索 (通用)" -remark: "必应搜索 Crawlab,列表+分页" -col: "results_bing_general" -type: "customized" -cmd: "python bing_spider.py" \ No newline at end of file diff --git a/backend/app/spiders/bing_general/bing_spider.py b/backend/app/spiders/bing_general/bing_spider.py deleted file mode 100755 index e982e4ee..00000000 --- a/backend/app/spiders/bing_general/bing_spider.py +++ /dev/null @@ -1,41 +0,0 @@ -import 
requests -from bs4 import BeautifulSoup as bs -from urllib.parse import urljoin, urlparse -import re -from crawlab import save_item - -s = requests.Session() - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -def start_requests(): - for i in range(0, 9): - fr = 'PERE' if not i else 'MORE' - url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}' - request_page(url) - -def request_page(url): - print(f'requesting {url}') - r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}) - parse_list(r) - -def parse_list(response): - soup = bs(response.content.decode('utf-8')) - for el in list(soup.select('#b_results > li')): - try: - save_item({ - 'title': el.select_one('h2').text, - 'url': el.select_one('h2 a').attrs.get('href'), - 'abstract': el.select_one('.b_caption p').text, - }) - except: - pass - -if __name__ == '__main__': - start_requests() \ No newline at end of file diff --git a/backend/app/spiders/bing_general/md5.txt b/backend/app/spiders/bing_general/md5.txt deleted file mode 100755 index 42fb6afd..00000000 --- a/backend/app/spiders/bing_general/md5.txt +++ /dev/null @@ -1 +0,0 @@ -cf295b694a20c99c4857f838aa0402a7 diff --git a/backend/app/spiders/chinaz/Spiderfile b/backend/app/spiders/chinaz/Spiderfile deleted file mode 100755 index 2fb940bb..00000000 --- a/backend/app/spiders/chinaz/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "chinaz" -display_name: "站长之家 (Scrapy)" -col: "results_chinaz" -type: "customized" -cmd: "scrapy crawl chinaz_spider" \ No newline at end of file diff --git a/backend/app/spiders/chinaz/chinaz/__init__.py b/backend/app/spiders/chinaz/chinaz/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/chinaz/chinaz/items.py b/backend/app/spiders/chinaz/chinaz/items.py deleted file mode 100755 index 1fdcac1b..00000000 --- a/backend/app/spiders/chinaz/chinaz/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class ChinazItem(scrapy.Item): - # define the fields for your item here like: - _id = scrapy.Field() - task_id = scrapy.Field() - name = scrapy.Field() - domain = scrapy.Field() - description = scrapy.Field() - rank = scrapy.Field() - main_category = scrapy.Field() - category = scrapy.Field() - location = scrapy.Field() diff --git a/backend/app/spiders/chinaz/chinaz/middlewares.py b/backend/app/spiders/chinaz/chinaz/middlewares.py deleted file mode 100755 index c98995d5..00000000 --- a/backend/app/spiders/chinaz/chinaz/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ChinazSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. 
- s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ChinazDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/chinaz/chinaz/pipelines.py b/backend/app/spiders/chinaz/chinaz/pipelines.py deleted file mode 100755 index b29f9eb7..00000000 --- a/backend/app/spiders/chinaz/chinaz/pipelines.py +++ /dev/null @@ -1,7 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html - diff --git a/backend/app/spiders/chinaz/chinaz/settings.py b/backend/app/spiders/chinaz/chinaz/settings.py deleted file mode 100755 index 932ec9ed..00000000 --- a/backend/app/spiders/chinaz/chinaz/settings.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for chinaz project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'chinaz' - -SPIDER_MODULES = ['chinaz.spiders'] -NEWSPIDER_MODULE = 'chinaz.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'chinaz (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'chinaz.middlewares.ChinazSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'chinaz.middlewares.ChinazDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of 
requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/chinaz/chinaz/spiders/__init__.py b/backend/app/spiders/chinaz/chinaz/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/chinaz/chinaz/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py b/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py deleted file mode 100755 index 28ad84e7..00000000 --- a/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -from chinaz.items import ChinazItem - - -class ChinazSpiderSpider(scrapy.Spider): - name = 'chinaz_spider' - allowed_domains = ['chinaz.com'] - start_urls = ['http://top.chinaz.com/hangye/'] - - def parse(self, response): - for item in response.css('.listCentent > li'): - name = item.css('h3.rightTxtHead > a::text').extract_first() - href = item.css('h3.rightTxtHead > a::attr("href")').extract_first() - domain = item.css('h3.rightTxtHead > span::text').extract_first() - description = item.css('p.RtCInfo::text').extract_first() - rank = item.css('.RtCRateCent > strong::text').extract_first() - rank = int(rank) - item = ChinazItem( - _id=domain, - name=name, - domain=domain, - description=description, - rank=rank, - ) - yield scrapy.Request( - url='http://top.chinaz.com' + href, - callback=self.parse_item, - meta={ - 'item': item - } - ) - - # pagination - a_list = response.css('.ListPageWrap > a::attr("href")').extract() - url = 'http://top.chinaz.com/hangye/' + a_list[-1] - yield scrapy.Request(url=url, callback=self.parse) - - def parse_item(self, response): - item = response.meta['item'] - - # category info extraction - arr = response.css('.TopMainTag-show .SimSun') - res1 = arr[0].css('a::text').extract() - main_category = res1[0] - if len(res1) == 1: - category = '其他' - else: - category = res1[1] - - # location info extraction - res2 = arr[1].css('a::text').extract() - if len(res2) > 0: - location = res2[0] - else: - location = '其他' - - # assign values to item - item['main_category'] = main_category - item['category'] = category - item['location'] = location - - yield item diff --git a/backend/app/spiders/chinaz/md5.txt b/backend/app/spiders/chinaz/md5.txt deleted file mode 100755 index f5e15fb9..00000000 --- a/backend/app/spiders/chinaz/md5.txt +++ /dev/null @@ -1 +0,0 @@ -1976593e49bf0238602ce35d051bd137 diff --git a/backend/app/spiders/chinaz/scrapy.cfg b/backend/app/spiders/chinaz/scrapy.cfg deleted file mode 100755 index d3b44a1a..00000000 --- a/backend/app/spiders/chinaz/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] 
-default = chinaz.settings - -[deploy] -#url = http://localhost:6800/ -project = chinaz diff --git a/backend/app/spiders/csdn_config/Spiderfile b/backend/app/spiders/csdn_config/Spiderfile deleted file mode 100755 index 67f4f8c5..00000000 --- a/backend/app/spiders/csdn_config/Spiderfile +++ /dev/null @@ -1,60 +0,0 @@ -name: "csdn_config" -display_name: "CSDN(可配置)" -remark: "CSDN Crawlab 文章,列表+详情+分页" -type: "configurable" -col: "results_csdn_config" -engine: scrapy -start_url: https://so.csdn.net/so/search/s.do?q=crawlab -start_stage: list -stages: -- name: list - is_list: true - list_css: .search-list-con > .search-list - list_xpath: "" - page_css: a.btn-next - page_xpath: "" - page_attr: href - fields: - - name: url - css: "" - xpath: .//*[@class="limit_width"]/a - attr: href - next_stage: detail - remark: "" -- name: detail - is_list: false - list_css: "" - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: content - css: "" - xpath: .//div[@id="content_views"] - attr: "" - next_stage: "" - remark: "" - - name: views - css: .read-count - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: title - css: .title-article - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: author - css: .follow-nickName - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - AUTOTHROTTLE_ENABLED: "false" - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/csdn_config/config_spider/__init__.py b/backend/app/spiders/csdn_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/csdn_config/config_spider/items.py b/backend/app/spiders/csdn_config/config_spider/items.py deleted file mode 100755 index 3c8e5e54..00000000 --- a/backend/app/spiders/csdn_config/config_spider/items.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - url = scrapy.Field() - content = scrapy.Field() - views = scrapy.Field() - title = scrapy.Field() - author = scrapy.Field() - diff --git a/backend/app/spiders/csdn_config/config_spider/middlewares.py b/backend/app/spiders/csdn_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/csdn_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. 
- return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/csdn_config/config_spider/pipelines.py b/backend/app/spiders/csdn_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/csdn_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/csdn_config/config_spider/settings.py b/backend/app/spiders/csdn_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/csdn_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py b/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/spider.py b/backend/app/spiders/csdn_config/config_spider/spiders/spider.py deleted file mode 100755 index 9ecc4aae..00000000 --- a/backend/app/spiders/csdn_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,41 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://so.csdn.net/so/search/s.do?q=crawlab', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.search-list-con > .search-list'): - item = Item() - item['url'] = elem.xpath('.//*[@class="limit_width"]/a/@href').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) - next_url = response.css('a.btn-next::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - def parse_detail(self, response): - item = Item() if response.meta.get('item') is None else response.meta.get('item') - item['content'] = response.xpath('string(.//div[@id="content_views"])').extract_first() - item['views'] = response.css('.read-count::text').extract_first() - item['title'] = response.css('.title-article::text').extract_first() - item['author'] = response.css('.follow-nickName::text').extract_first() - yield item - - diff --git a/backend/app/spiders/csdn_config/md5.txt b/backend/app/spiders/csdn_config/md5.txt deleted file mode 100755 index e169c42a..00000000 --- a/backend/app/spiders/csdn_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -b6889c74e006a5e619b525d84db62ffd diff --git a/backend/app/spiders/csdn_config/scrapy.cfg b/backend/app/spiders/csdn_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/csdn_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/douban_config/Spiderfile b/backend/app/spiders/douban_config/Spiderfile deleted file mode 100755 index 84f0647a..00000000 --- a/backend/app/spiders/douban_config/Spiderfile +++ /dev/null @@ -1,57 +0,0 @@ -name: 
"douban_config" -display_name: "豆瓣读书(可配置)" -remark: "豆瓣读书新书推荐,列表" -type: "configurable" -col: "results_douban_config" -engine: scrapy -start_url: https://book.douban.com/latest -start_stage: list -stages: -- name: list - is_list: true - list_css: ul.cover-col-4 > li - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: title - css: h2 > a - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: h2 > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: img - css: a.cover img - xpath: "" - attr: src - next_stage: "" - remark: "" - - name: rating - css: p.rating > .color-lightgray - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: abstract - css: p:last-child - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: info - css: .color-gray - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/douban_config/config_spider/__init__.py b/backend/app/spiders/douban_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/douban_config/config_spider/items.py b/backend/app/spiders/douban_config/config_spider/items.py deleted file mode 100755 index d6959b8d..00000000 --- a/backend/app/spiders/douban_config/config_spider/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - img = scrapy.Field() - rating = scrapy.Field() - abstract = scrapy.Field() - info = scrapy.Field() - diff --git a/backend/app/spiders/douban_config/config_spider/middlewares.py b/backend/app/spiders/douban_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/douban_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. 
- - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/douban_config/config_spider/pipelines.py b/backend/app/spiders/douban_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/douban_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/douban_config/config_spider/settings.py b/backend/app/spiders/douban_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/douban_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy 
settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not 
None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/douban_config/config_spider/spiders/__init__.py b/backend/app/spiders/douban_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/douban_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/douban_config/config_spider/spiders/spider.py b/backend/app/spiders/douban_config/config_spider/spiders/spider.py deleted file mode 100755 index 61bb648d..00000000 --- a/backend/app/spiders/douban_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://book.douban.com/latest', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('ul.cover-col-4 > li'): - item = Item() - item['title'] = elem.css('h2 > a::text').extract_first() - item['url'] = elem.css('h2 > a::attr("href")').extract_first() - item['img'] = elem.css('a.cover img::attr("src")').extract_first() - item['rating'] = elem.css('p.rating > .color-lightgray::text').extract_first() - item['abstract'] = elem.css('p:last-child::text').extract_first() - item['info'] = elem.css('.color-gray::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - - diff --git a/backend/app/spiders/douban_config/md5.txt b/backend/app/spiders/douban_config/md5.txt deleted file mode 100755 index 374e3804..00000000 --- a/backend/app/spiders/douban_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -4d59a6c83b0e125d5321beae86bb93ce diff --git a/backend/app/spiders/douban_config/scrapy.cfg b/backend/app/spiders/douban_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/douban_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/jd/Spiderfile b/backend/app/spiders/jd/Spiderfile deleted file mode 100755 index d090472b..00000000 --- a/backend/app/spiders/jd/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "jd" -display_name: "京东 (Scrapy)" -col: "results_jd" -type: "customized" -cmd: "scrapy crawl jd_spider" \ No newline at end of file diff --git a/backend/app/spiders/jd/jd/__init__.py b/backend/app/spiders/jd/jd/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git 
a/backend/app/spiders/jd/jd/items.py b/backend/app/spiders/jd/jd/items.py deleted file mode 100755 index b2c5e647..00000000 --- a/backend/app/spiders/jd/jd/items.py +++ /dev/null @@ -1,15 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class JdItem(scrapy.Item): - # define the fields for your item here like: - name = scrapy.Field() - price = scrapy.Field() - url = scrapy.Field() diff --git a/backend/app/spiders/jd/jd/middlewares.py b/backend/app/spiders/jd/jd/middlewares.py deleted file mode 100755 index 6fceded5..00000000 --- a/backend/app/spiders/jd/jd/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class JdSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class JdDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. 
- - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/jd/jd/pipelines.py b/backend/app/spiders/jd/jd/pipelines.py deleted file mode 100755 index 5a7d7cbf..00000000 --- a/backend/app/spiders/jd/jd/pipelines.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/jd/jd/settings.py b/backend/app/spiders/jd/jd/settings.py deleted file mode 100755 index ef89ed0c..00000000 --- a/backend/app/spiders/jd/jd/settings.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for jd project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'jd' - -SPIDER_MODULES = ['jd.spiders'] -NEWSPIDER_MODULE = 'jd.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'jd (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'jd.middlewares.JdSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'jd.middlewares.JdDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See 
https://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/jd/jd/spiders/__init__.py b/backend/app/spiders/jd/jd/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/jd/jd/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/jd/jd/spiders/jd_spider.py b/backend/app/spiders/jd/jd/spiders/jd_spider.py deleted file mode 100755 index 4ec94fa9..00000000 --- a/backend/app/spiders/jd/jd/spiders/jd_spider.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy - -from jd.items import JdItem - - -class JdSpiderSpider(scrapy.Spider): - name = 'jd_spider' - allowed_domains = ['jd.com'] - - def start_requests(self): - for i in range(1, 50): - yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}') - - def parse(self, response): - for el in response.css('.gl-item'): - yield JdItem( - url=el.css('.p-name > a::attr("href")').extract_first(), - name=el.css('.p-name > a::attr("title")').extract_first(), - price=float(el.css('.p-price i::text').extract_first()), - ) diff --git a/backend/app/spiders/jd/md5.txt b/backend/app/spiders/jd/md5.txt deleted file mode 100755 index dcd53f51..00000000 --- a/backend/app/spiders/jd/md5.txt +++ /dev/null @@ -1 +0,0 @@ -621486d31459514eb27a082d159d9b8c diff --git a/backend/app/spiders/jd/scrapy.cfg b/backend/app/spiders/jd/scrapy.cfg deleted file mode 100755 index 87cf0280..00000000 --- a/backend/app/spiders/jd/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = jd.settings - -[deploy] -#url = http://localhost:6800/ -project = jd diff --git a/backend/app/spiders/sinastock/Spiderfile b/backend/app/spiders/sinastock/Spiderfile deleted file mode 100755 index b110cb48..00000000 --- a/backend/app/spiders/sinastock/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "sinastock" -display_name: "新浪股票 (Scrapy)" -type: "customized" -col: "results_sinastock" -cmd: "scrapy crawl sinastock_spider" \ No newline at end of file diff --git a/backend/app/spiders/sinastock/md5.txt b/backend/app/spiders/sinastock/md5.txt deleted file mode 100755 index 1e5d8ab9..00000000 --- a/backend/app/spiders/sinastock/md5.txt +++ /dev/null @@ -1 +0,0 @@ -80bc091fa45ef4a85c9f1a66c81a4ed7 diff --git a/backend/app/spiders/sinastock/scrapy.cfg b/backend/app/spiders/sinastock/scrapy.cfg deleted file mode 100755 index 4969ad96..00000000 --- 
a/backend/app/spiders/sinastock/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = sinastock.settings - -[deploy] -#url = http://localhost:6800/ -project = sinastock diff --git a/backend/app/spiders/sinastock/sinastock/__init__.py b/backend/app/spiders/sinastock/sinastock/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/sinastock/sinastock/items.py b/backend/app/spiders/sinastock/sinastock/items.py deleted file mode 100755 index 6e3e5d8e..00000000 --- a/backend/app/spiders/sinastock/sinastock/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class NewsItem(scrapy.Item): - # define the fields for your item here like: - _id = scrapy.Field() - title = scrapy.Field() - ts_str = scrapy.Field() - ts = scrapy.Field() - url = scrapy.Field() - text = scrapy.Field() - task_id = scrapy.Field() - source = scrapy.Field() - stocks = scrapy.Field() diff --git a/backend/app/spiders/sinastock/sinastock/middlewares.py b/backend/app/spiders/sinastock/sinastock/middlewares.py deleted file mode 100755 index 912b5e57..00000000 --- a/backend/app/spiders/sinastock/sinastock/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class SinastockSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class SinastockDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. 
- - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/sinastock/sinastock/pipelines.py b/backend/app/spiders/sinastock/sinastock/pipelines.py deleted file mode 100755 index 5a7d7cbf..00000000 --- a/backend/app/spiders/sinastock/sinastock/pipelines.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/sinastock/sinastock/settings.py b/backend/app/spiders/sinastock/sinastock/settings.py deleted file mode 100755 index 3e01d3ca..00000000 --- a/backend/app/spiders/sinastock/sinastock/settings.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for sinastock project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'sinastock' - -SPIDER_MODULES = ['sinastock.spiders'] -NEWSPIDER_MODULE = 'sinastock.spiders' - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -# USER_AGENT = 'sinastock (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -# CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -# DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -# CONCURRENT_REQUESTS_PER_DOMAIN = 16 -# CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -# COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -# TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -# DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -# } - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -# SPIDER_MIDDLEWARES = { -# 'sinastock.middlewares.SinastockSpiderMiddleware': 543, -# } - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# DOWNLOADER_MIDDLEWARES = { -# 'sinastock.middlewares.SinastockDownloaderMiddleware': 543, -# } - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -# EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -# } - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -# AUTOTHROTTLE_ENABLED = True -# The initial download delay -# AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -# AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -# AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -# HTTPCACHE_ENABLED = True -# HTTPCACHE_EXPIRATION_SECS = 0 -# HTTPCACHE_DIR = 'httpcache' -# HTTPCACHE_IGNORE_HTTP_CODES = [] -# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/sinastock/sinastock/spiders/__init__.py b/backend/app/spiders/sinastock/sinastock/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/sinastock/sinastock/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
diff --git a/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py b/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py deleted file mode 100755 index 54daf763..00000000 --- a/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py +++ /dev/null @@ -1,59 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -from datetime import datetime - -import scrapy -from pymongo import MongoClient - -from sinastock.items import NewsItem - -class SinastockSpiderSpider(scrapy.Spider): - name = 'sinastock_spider' - allowed_domains = ['finance.sina.com.cn'] - mongo = MongoClient( - host=os.environ.get('MONGO_HOST') or 'localhost', - port=int(os.environ.get('MONGO_PORT') or 27017) - ) - db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test'] - col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news') - - def start_requests(self): - col = self.db['stocks'] - for s in col.find({}): - code, ex = s['ts_code'].split('.') - for i in range(10): - url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={ex.lower()}{code}&Page={i + 1}' - yield scrapy.Request( - url=url, - callback=self.parse, - meta={'ts_code': s['ts_code']} - ) - - def parse(self, response): - for a in response.css('.datelist > ul > a'): - url = a.css('a::attr("href")').extract_first() - item = NewsItem( - title=a.css('a::text').extract_first(), - url=url, - source='sina', - stocks=[response.meta['ts_code']] - ) - yield scrapy.Request( - url=url, - callback=self.parse_detail, - meta={'item': item} - ) - - def parse_detail(self, response): - item = response.meta['item'] - text = response.css('#artibody').extract_first() - pre = re.compile('>(.*?)<') - text = ''.join(pre.findall(text)) - item['text'] = text.replace('\u3000', '') - item['ts_str'] = response.css('.date::text').extract_first() - if item['text'] is None or item['ts_str'] is None: - pass - else: - item['ts'] = datetime.strptime(item['ts_str'], '%Y年%m月%d日 %H:%M') - yield item diff --git a/backend/app/spiders/v2ex_config/Spiderfile b/backend/app/spiders/v2ex_config/Spiderfile deleted file mode 100755 index bb18d40a..00000000 --- a/backend/app/spiders/v2ex_config/Spiderfile +++ /dev/null @@ -1,54 +0,0 @@ -name: "v2ex_config" -display_name: "V2ex(可配置)" -remark: "V2ex,列表+详情" -type: "configurable" -col: "results_v2ex_config" -engine: scrapy -start_url: https://v2ex.com/ -start_stage: list -stages: -- name: list - is_list: true - list_css: .cell.item - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: href - fields: - - name: title - css: a.topic-link - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: a.topic-link - xpath: "" - attr: href - next_stage: detail - remark: "" - - name: replies - css: .count_livid - xpath: "" - attr: "" - next_stage: "" - remark: "" -- name: detail - is_list: false - list_css: "" - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: content - css: "" - xpath: .//*[@class="markdown_body"] - attr: "" - next_stage: "" - remark: "" -settings: - AUTOTHROTTLE_ENABLED: "true" - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/v2ex_config/config_spider/__init__.py b/backend/app/spiders/v2ex_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/v2ex_config/config_spider/items.py 
b/backend/app/spiders/v2ex_config/config_spider/items.py deleted file mode 100755 index d2c01a06..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/items.py +++ /dev/null @@ -1,19 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - replies = scrapy.Field() - content = scrapy.Field() - diff --git a/backend/app/spiders/v2ex_config/config_spider/middlewares.py b/backend/app/spiders/v2ex_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. 
- - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/v2ex_config/config_spider/pipelines.py b/backend/app/spiders/v2ex_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/v2ex_config/config_spider/settings.py b/backend/app/spiders/v2ex_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py b/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py b/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py deleted file mode 100755 index 4763e040..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://v2ex.com/', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.cell.item'): - item = Item() - item['title'] = elem.css('a.topic-link::text').extract_first() - item['url'] = elem.css('a.topic-link::attr("href")').extract_first() - item['replies'] = elem.css('.count_livid::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) - - def parse_detail(self, response): - item = Item() if response.meta.get('item') is None else response.meta.get('item') - item['content'] = response.xpath('string(.//*[@class="markdown_body"])').extract_first() - yield item - - diff --git a/backend/app/spiders/v2ex_config/md5.txt b/backend/app/spiders/v2ex_config/md5.txt deleted file mode 100755 index 5d725b2c..00000000 --- a/backend/app/spiders/v2ex_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -402c0a07873ef74b9b574bc0f6b28423 diff --git a/backend/app/spiders/v2ex_config/scrapy.cfg b/backend/app/spiders/v2ex_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/v2ex_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/xueqiu/Spiderfile b/backend/app/spiders/xueqiu/Spiderfile deleted file mode 100755 index 38aa5dbe..00000000 --- a/backend/app/spiders/xueqiu/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "xueqiu" -display_name: "雪球网 (Scrapy)" -type: "customized" -col: "results_xueqiu" -cmd: "scrapy crawl xueqiu_spider" \ No newline at end of file diff --git a/backend/app/spiders/xueqiu/md5.txt b/backend/app/spiders/xueqiu/md5.txt deleted file mode 100755 index 6a9a2072..00000000 --- a/backend/app/spiders/xueqiu/md5.txt +++ /dev/null @@ -1 +0,0 
@@ -df177994199caa691d87fc0c5031326d diff --git a/backend/app/spiders/xueqiu/scrapy.cfg b/backend/app/spiders/xueqiu/scrapy.cfg deleted file mode 100755 index 2c5ce3b3..00000000 --- a/backend/app/spiders/xueqiu/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = xueqiu.settings - -[deploy] -#url = http://localhost:6800/ -project = xueqiu diff --git a/backend/app/spiders/xueqiu/xueqiu/__init__.py b/backend/app/spiders/xueqiu/xueqiu/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/xueqiu/xueqiu/items.py b/backend/app/spiders/xueqiu/xueqiu/items.py deleted file mode 100755 index 5471594d..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/items.py +++ /dev/null @@ -1,23 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class XueqiuItem(scrapy.Item): - # define the fields for your item here like: - _id = scrapy.Field() - task_id = scrapy.Field() - id = scrapy.Field() - text = scrapy.Field() - url = scrapy.Field() - target = scrapy.Field() - view_count = scrapy.Field() - mark = scrapy.Field() - created_at = scrapy.Field() - ts = scrapy.Field() - source = scrapy.Field() diff --git a/backend/app/spiders/xueqiu/xueqiu/middlewares.py b/backend/app/spiders/xueqiu/xueqiu/middlewares.py deleted file mode 100755 index f60102ce..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class XueqiuSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class XueqiuDownloaderMiddleware(object): - # Not all methods need to be defined. 
If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu/xueqiu/pipelines.py b/backend/app/spiders/xueqiu/xueqiu/pipelines.py deleted file mode 100755 index 5a7d7cbf..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/pipelines.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/xueqiu/xueqiu/settings.py b/backend/app/spiders/xueqiu/xueqiu/settings.py deleted file mode 100755 index 1d898e2f..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/settings.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for xueqiu project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'xueqiu' - -SPIDER_MODULES = ['xueqiu.spiders'] -NEWSPIDER_MODULE = 'xueqiu.spiders' - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -# CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -# DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -# CONCURRENT_REQUESTS_PER_DOMAIN = 16 -# CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -# COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -# TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -# DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -# } - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -# SPIDER_MIDDLEWARES = { -# 'xueqiu.middlewares.XueqiuSpiderMiddleware': 543, -# } - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# DOWNLOADER_MIDDLEWARES = { -# 'xueqiu.middlewares.XueqiuDownloaderMiddleware': 543, -# } - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -# EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -# } - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -# AUTOTHROTTLE_ENABLED = True -# The initial download delay -# AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -# AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -# AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -# HTTPCACHE_ENABLED = True -# HTTPCACHE_EXPIRATION_SECS = 0 -# HTTPCACHE_DIR = 'httpcache' -# HTTPCACHE_IGNORE_HTTP_CODES = [] -# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py b/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py b/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py deleted file mode 100755 index a746e156..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py +++ /dev/null @@ -1,46 +0,0 @@ -# -*- coding: utf-8 -*- -import json -from datetime import datetime -from time import sleep - -import scrapy - -from xueqiu.items import XueqiuItem - - -class XueqiuSpiderSpider(scrapy.Spider): - name = 'xueqiu_spider' - allowed_domains = ['xueqiu.com'] - - def start_requests(self): - return [scrapy.Request( - url='https://xueqiu.com', - callback=self.parse_home - )] - - def parse_home(self, response): - yield scrapy.Request( - url='https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=20&category=6' - ) - - def parse(self, response): - data = json.loads(response.body) - next_max_id = data.get('next_max_id') - sleep(1) - for row in data.get('list'): - d = json.loads(row.get('data')) - item = XueqiuItem( - id=d['id'], - text=d['text'], - mark=d['mark'], - url=d['target'], - created_at=d['created_at'], - ts=datetime.fromtimestamp(d['created_at'] / 1e3), - view_count=d['view_count'], - source='xueqiu' - ) - yield item - - yield scrapy.Request( - url=f'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id={next_max_id}&count=20&category=6' - ) diff --git a/backend/app/spiders/xueqiu_config/Spiderfile b/backend/app/spiders/xueqiu_config/Spiderfile deleted file mode 100755 index 0de50e9e..00000000 --- a/backend/app/spiders/xueqiu_config/Spiderfile +++ /dev/null @@ -1,39 +0,0 @@ -name: "xueqiu_config" -display_name: "雪球网(可配置)" -remark: "雪球网新闻,列表" -type: "configurable" -col: "results_xueqiu_config" -engine: scrapy -start_url: https://xueqiu.com/ -start_stage: list -stages: -- name: list - is_list: true - list_css: "" - list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")] - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: title - css: h3 > a - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: h3 > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: abstract - css: p - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/xueqiu_config/config_spider/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/xueqiu_config/config_spider/items.py b/backend/app/spiders/xueqiu_config/config_spider/items.py deleted file mode 100755 index 9282765f..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/items.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - diff --git a/backend/app/spiders/xueqiu_config/config_spider/middlewares.py b/backend/app/spiders/xueqiu_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- 
coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu_config/config_spider/pipelines.py b/backend/app/spiders/xueqiu_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/xueqiu_config/config_spider/settings.py b/backend/app/spiders/xueqiu_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py deleted file mode 100755 index 79d4636b..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,33 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://xueqiu.com/', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.xpath('.//*[contains(@class, "AnonymousHome_home__timeline__item")]'): - item = Item() - item['title'] = elem.css('h3 > a::text').extract_first() - item['url'] = elem.css('h3 > a::attr("href")').extract_first() - item['abstract'] = elem.css('p::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - - diff --git a/backend/app/spiders/xueqiu_config/md5.txt b/backend/app/spiders/xueqiu_config/md5.txt deleted file mode 100755 index 39a6df77..00000000 --- a/backend/app/spiders/xueqiu_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -e3da3aacb2d290cb179a79028fbfff9c diff --git a/backend/app/spiders/xueqiu_config/scrapy.cfg b/backend/app/spiders/xueqiu_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/xueqiu_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/zongheng_config/Spiderfile b/backend/app/spiders/zongheng_config/Spiderfile deleted file mode 100755 index 0163fac7..00000000 --- a/backend/app/spiders/zongheng_config/Spiderfile +++ /dev/null @@ -1,45 +0,0 @@ -name: "zongheng_config" -display_name: "纵横(可配置)" -remark: "纵横小说网,列表" -type: "configurable" -col: "results_zongheng_config" -engine: scrapy -start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1 -start_stage: list -stages: -- name: list - is_list: true - list_css: .rank_d_list - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: href - fields: - - name: title - css: .rank_d_b_name > a - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: .rank_d_b_name > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: abstract - css: body - xpath: 
"" - attr: "" - next_stage: "" - remark: "" - - name: votes - css: .rank_d_b_ticket - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/zongheng_config/config_spider/__init__.py b/backend/app/spiders/zongheng_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/zongheng_config/config_spider/items.py b/backend/app/spiders/zongheng_config/config_spider/items.py deleted file mode 100755 index 528c3187..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/items.py +++ /dev/null @@ -1,19 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - votes = scrapy.Field() - diff --git a/backend/app/spiders/zongheng_config/config_spider/middlewares.py b/backend/app/spiders/zongheng_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. 
- s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/zongheng_config/config_spider/pipelines.py b/backend/app/spiders/zongheng_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/zongheng_config/config_spider/settings.py b/backend/app/spiders/zongheng_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py b/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py b/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py deleted file mode 100755 index cf1b6a08..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='http://www.zongheng.com/rank/details.html?rt=1&d=1', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.rank_d_list'): - item = Item() - item['title'] = elem.css('.rank_d_b_name > a::text').extract_first() - item['url'] = elem.css('.rank_d_b_name > a::attr("href")').extract_first() - item['abstract'] = elem.css('body::text').extract_first() - item['votes'] = elem.css('.rank_d_b_ticket::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - - diff --git a/backend/app/spiders/zongheng_config/md5.txt b/backend/app/spiders/zongheng_config/md5.txt deleted file mode 100755 index 46fd3de6..00000000 --- a/backend/app/spiders/zongheng_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -82cb98a6103fb878501df81f191703ba diff --git a/backend/app/spiders/zongheng_config/scrapy.cfg b/backend/app/spiders/zongheng_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/zongheng_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider From 45a1cfa70b5b32849cabe25d7e3de1d01c6c1014 Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 27 Apr 2020 17:18:57 +0800 Subject: [PATCH 3/4] support crawlab runtime log to ES --- backend/conf/config.yml | 2 ++ backend/main.go | 10 ++++++ backend/middlewares/es_log.go | 57 +++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 backend/middlewares/es_log.go diff --git a/backend/conf/config.yml b/backend/conf/config.yml index 1c2c8507..686865f9 100644 --- a/backend/conf/config.yml +++ b/backend/conf/config.yml @@ -40,6 +40,8 @@ other: tmppath: "/tmp" version: 0.4.10 setting: + crawlabLogToES: "N" # Send crawlab runtime log to ES, open this option "Y", remember to set esClient + 
crawlabLogIndex: "crawlab-log" allowRegister: "N" enableTutorial: "N" runOnMaster: "Y" diff --git a/backend/main.go b/backend/main.go index 6ab022f4..7ca18335 100644 --- a/backend/main.go +++ b/backend/main.go @@ -14,6 +14,7 @@ import ( "github.com/apex/log" "github.com/gin-gonic/gin" "github.com/gin-gonic/gin/binding" + "github.com/olivere/elastic/v7" "github.com/spf13/viper" "net" "net/http" @@ -133,6 +134,15 @@ func main() { // 以下为主节点服务 if model.IsMaster() { // 中间件 + esClientStr := viper.GetString("setting.esClient") + if viper.GetString("setting.crawlabLogToES") == "Y" && esClientStr != "" { + ctx := context.Background() + esClient, err := elastic.NewClient(elastic.SetURL(esClientStr), elastic.SetSniff(false)) + if err != nil { + log.Error("Init es client Error:" + err.Error()) + } + app.Use(middlewares.EsLog(ctx, esClient)) + } app.Use(middlewares.CORSMiddleware()) anonymousGroup := app.Group("/") { diff --git a/backend/middlewares/es_log.go b/backend/middlewares/es_log.go new file mode 100644 index 00000000..c119816a --- /dev/null +++ b/backend/middlewares/es_log.go @@ -0,0 +1,57 @@ +package middlewares + +import ( + "bytes" + "context" + "fmt" + "github.com/gin-gonic/gin" + "github.com/olivere/elastic/v7" + "github.com/satori/go.uuid" + "github.com/spf13/viper" + "strconv" + "time" +) + +func EsLog(ctx context.Context, esClient *elastic.Client) gin.HandlerFunc { + + return func(c *gin.Context) { + // 开始时间 + crawlabIndex := viper.GetString("setting.crawlabLogIndex") + sig := make(chan struct{}, 1) + sig <- struct{}{} + start := time.Now() + // 处理请求 + c.Next() + // 结束时间 + end := time.Now() + //执行时间 + latency := strconv.FormatInt(int64(end.Sub(start).Milliseconds()), 10) + path := c.Request.URL.Path + + clientIP := c.ClientIP() + method := c.Request.Method + statusCode := strconv.Itoa(c.Writer.Status()) + buf := new(bytes.Buffer) + buf.ReadFrom(c.Request.Body) + b := buf.String() + accessLog := "costTime:" + latency + "ms--" + "StatusCode:" + statusCode + "--" + "Method:" + method + "--" + "ClientIp:" + clientIP + "--" + + "RequestURI:" + path + "--" + "Host:" + c.Request.Host + "--" + "UserAgent--" + c.Request.UserAgent() + "--RequestBody:" + + string(b) + WriteMsg(ctx, crawlabIndex, esClient, time.Now(), accessLog, sig) + } + +} + +// WriteMsg will write the msg and level into es +func WriteMsg(ctx context.Context, crawlabIndex string, es *elastic.Client, when time.Time, msg string, sig chan struct{}) error { + <-sig + vals := make(map[string]interface{}) + vals["@timestamp"] = when.Format(time.RFC3339) + vals["@msg"] = msg + uid := uuid.NewV4().String() + _, err := es.Index().Index(crawlabIndex).Id(uid).BodyJson(vals).Refresh("wait_for").Do(ctx) + if err != nil { + fmt.Println(err) + } + return err +} From 7add8ea36ff9463774325b3bb434b34dc92c6bc2 Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 27 Apr 2020 19:24:10 +0800 Subject: [PATCH 4/4] bugfix --- backend/middlewares/es_log.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/backend/middlewares/es_log.go b/backend/middlewares/es_log.go index c119816a..1dff344e 100644 --- a/backend/middlewares/es_log.go +++ b/backend/middlewares/es_log.go @@ -17,8 +17,6 @@ func EsLog(ctx context.Context, esClient *elastic.Client) gin.HandlerFunc { return func(c *gin.Context) { // 开始时间 crawlabIndex := viper.GetString("setting.crawlabLogIndex") - sig := make(chan struct{}, 1) - sig <- struct{}{} start := time.Now() // 处理请求 c.Next() @@ -37,14 +35,13 @@ func EsLog(ctx context.Context, esClient *elastic.Client) 
gin.HandlerFunc {
 		accessLog := "costTime:" + latency + "ms--" + "StatusCode:" + statusCode + "--" + "Method:" + method + "--" + "ClientIp:" + clientIP + "--" +
 			"RequestURI:" + path + "--" + "Host:" + c.Request.Host + "--" + "UserAgent--" + c.Request.UserAgent() + "--RequestBody:" +
 			string(b)
-		WriteMsg(ctx, crawlabIndex, esClient, time.Now(), accessLog, sig)
+		WriteMsg(ctx, crawlabIndex, esClient, time.Now(), accessLog)
 	}
 
 }
 
 // WriteMsg will write the msg and level into es
-func WriteMsg(ctx context.Context, crawlabIndex string, es *elastic.Client, when time.Time, msg string, sig chan struct{}) error {
-	<-sig
+func WriteMsg(ctx context.Context, crawlabIndex string, es *elastic.Client, when time.Time, msg string) error {
 	vals := make(map[string]interface{})
 	vals["@timestamp"] = when.Format(time.RFC3339)
 	vals["@msg"] = msg
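
Taken together, [PATCH 3/4] and [PATCH 4/4] wire up ES access logging as follows: when setting.crawlabLogToES is "Y" and setting.esClient is set, main.go builds an olivere/elastic client and registers the EsLog middleware; after each request is handled, the middleware assembles one access-log line (latency, status code, method, client IP, URI, host, user agent, request body) and indexes it as a document with @timestamp and @msg fields into the index named by setting.crawlabLogIndex ("crawlab-log" by default). The sketch below is a minimal standalone approximation of that flow, not the Crawlab code itself: the ES URL, listen port and /ping route are placeholders, viper and the satori/go.uuid document ID are left out (ES auto-generates the ID here), and the request-body and user-agent fields are dropped for brevity.

// Minimal sketch of the EsLog idea from the patches above: a Gin middleware that
// times each request and indexes one access-log document into Elasticsearch.
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/olivere/elastic/v7"
)

// writeMsg indexes one log document; unlike the patch, it lets ES auto-generate
// the document ID instead of using a satori/go.uuid UUID.
func writeMsg(ctx context.Context, es *elastic.Client, index string, when time.Time, msg string) error {
	doc := map[string]interface{}{
		"@timestamp": when.Format(time.RFC3339),
		"@msg":       msg,
	}
	_, err := es.Index().Index(index).BodyJson(doc).Refresh("wait_for").Do(ctx)
	return err
}

// esLog mirrors the shape of EsLog after [PATCH 4/4]: record the start time,
// let the handler run, then build one log line and ship it to ES.
func esLog(ctx context.Context, es *elastic.Client, index string) gin.HandlerFunc {
	return func(c *gin.Context) {
		start := time.Now()
		c.Next() // run the actual handler first
		accessLog := fmt.Sprintf("costTime:%dms--StatusCode:%d--Method:%s--ClientIp:%s--RequestURI:%s--Host:%s",
			time.Since(start).Milliseconds(), c.Writer.Status(), c.Request.Method,
			c.ClientIP(), c.Request.URL.Path, c.Request.Host)
		if err := writeMsg(ctx, es, index, time.Now(), accessLog); err != nil {
			fmt.Println("write access log to ES failed:", err)
		}
	}
}

func main() {
	// Stand-ins for setting.esClient and setting.crawlabLogIndex from config.yml.
	esURL := "http://localhost:9200"
	index := "crawlab-log"

	ctx := context.Background()
	es, err := elastic.NewClient(elastic.SetURL(esURL), elastic.SetSniff(false))
	if err != nil {
		panic(err)
	}

	app := gin.Default()
	app.Use(esLog(ctx, es, index)) // corresponds to the conditional app.Use in main.go
	app.GET("/ping", func(c *gin.Context) { c.String(200, "pong") })
	_ = app.Run(":8000")
}

Two details of the patches are worth noting. The sig channel removed in [PATCH 4/4] was a per-request buffered channel of capacity 1 that was filled immediately and drained exactly once inside WriteMsg, so it could never block and provided no synchronization; dropping it is a pure simplification. Also, WriteMsg indexes synchronously with Refresh("wait_for"), so every API request waits for the ES write (and a refresh) to complete; that is fine for light traffic, but if request latency matters this is a likely place to move the write into a goroutine or batch it.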