From 81c9ef7daafcf94744382304fd4d83a390eef770 Mon Sep 17 00:00:00 2001
From: hantmac
Date: Mon, 27 Apr 2020 16:01:24 +0800
Subject: [PATCH] support send log to ES

---
 backend/app/spiders/amazon_config/Spiderfile | 51 --------
 .../amazon_config/config_spider/__init__.py | 0
 .../amazon_config/config_spider/items.py | 20 ----
 .../config_spider/middlewares.py | 103 ----------------
 .../amazon_config/config_spider/pipelines.py | 27 -----
 .../amazon_config/config_spider/settings.py | 111 ------------------
 .../config_spider/spiders/__init__.py | 4 -
 .../config_spider/spiders/spider.py | 37 ------
 backend/app/spiders/amazon_config/md5.txt | 1 -
 backend/app/spiders/amazon_config/scrapy.cfg | 11 --
 .../app/spiders/autohome_config/Spiderfile | 57 ---------
 .../autohome_config/config_spider/__init__.py | 0
 .../autohome_config/config_spider/items.py | 21 ----
 .../config_spider/middlewares.py | 103 ----------------
 .../config_spider/pipelines.py | 27 -----
 .../autohome_config/config_spider/settings.py | 111 ------------------
 .../config_spider/spiders/__init__.py | 4 -
 .../config_spider/spiders/spider.py | 38 ------
 backend/app/spiders/autohome_config/md5.txt | 1 -
 .../app/spiders/autohome_config/scrapy.cfg | 11 --
 backend/app/spiders/baidu_config/Spiderfile | 39 ------
 .../baidu_config/config_spider/__init__.py | 0
 .../baidu_config/config_spider/items.py | 18 ---
 .../baidu_config/config_spider/middlewares.py | 103 ----------------
 .../baidu_config/config_spider/pipelines.py | 27 -----
 .../baidu_config/config_spider/settings.py | 111 ------------------
 .../config_spider/spiders/__init__.py | 4 -
 .../config_spider/spiders/spider.py | 35 ------
 backend/app/spiders/baidu_config/md5.txt | 1 -
 backend/app/spiders/baidu_config/scrapy.cfg | 11 --
 backend/app/spiders/bing_general/Spiderfile | 6 -
 .../app/spiders/bing_general/bing_spider.py | 41 -------
 backend/app/spiders/bing_general/md5.txt | 1 -
 backend/app/spiders/chinaz/Spiderfile | 5 -
 backend/app/spiders/chinaz/chinaz/__init__.py | 0
 backend/app/spiders/chinaz/chinaz/items.py | 21 ----
 .../app/spiders/chinaz/chinaz/middlewares.py | 103 ----------------
 .../app/spiders/chinaz/chinaz/pipelines.py | 7 --
 backend/app/spiders/chinaz/chinaz/settings.py | 90 --------------
 .../spiders/chinaz/chinaz/spiders/__init__.py | 4 -
 .../chinaz/chinaz/spiders/chinaz_spider.py | 63 ----------
 backend/app/spiders/chinaz/md5.txt | 1 -
 backend/app/spiders/chinaz/scrapy.cfg | 11 --
 backend/app/spiders/csdn_config/Spiderfile | 60 ----------
 .../csdn_config/config_spider/__init__.py | 0
 .../csdn_config/config_spider/items.py | 20 ----
 .../csdn_config/config_spider/middlewares.py | 103 ----------------
 .../csdn_config/config_spider/pipelines.py | 27 -----
 .../csdn_config/config_spider/settings.py | 111 ------------------
 .../config_spider/spiders/__init__.py | 4 -
 .../config_spider/spiders/spider.py | 41 -------
 backend/app/spiders/csdn_config/md5.txt | 1 -
 backend/app/spiders/csdn_config/scrapy.cfg | 11 --
 backend/app/spiders/douban_config/Spiderfile | 57 ---------
 .../douban_config/config_spider/__init__.py | 0
 .../douban_config/config_spider/items.py | 21 ----
 .../config_spider/middlewares.py | 103 ----------------
 .../douban_config/config_spider/pipelines.py | 27 -----
 .../douban_config/config_spider/settings.py | 111 ------------------
 .../config_spider/spiders/__init__.py | 4 -
 .../config_spider/spiders/spider.py | 36 ------
 backend/app/spiders/douban_config/md5.txt | 1 -
 backend/app/spiders/douban_config/scrapy.cfg | 11 --
 backend/app/spiders/jd/Spiderfile | 5 -
backend/app/spiders/jd/jd/__init__.py | 0 backend/app/spiders/jd/jd/items.py | 15 --- backend/app/spiders/jd/jd/middlewares.py | 103 ---------------- backend/app/spiders/jd/jd/pipelines.py | 6 - backend/app/spiders/jd/jd/settings.py | 90 -------------- backend/app/spiders/jd/jd/spiders/__init__.py | 4 - .../app/spiders/jd/jd/spiders/jd_spider.py | 21 ---- backend/app/spiders/jd/md5.txt | 1 - backend/app/spiders/jd/scrapy.cfg | 11 -- backend/app/spiders/sinastock/Spiderfile | 5 - backend/app/spiders/sinastock/md5.txt | 1 - backend/app/spiders/sinastock/scrapy.cfg | 11 -- .../spiders/sinastock/sinastock/__init__.py | 0 .../app/spiders/sinastock/sinastock/items.py | 21 ---- .../sinastock/sinastock/middlewares.py | 103 ---------------- .../spiders/sinastock/sinastock/pipelines.py | 6 - .../spiders/sinastock/sinastock/settings.py | 89 -------------- .../sinastock/sinastock/spiders/__init__.py | 4 - .../sinastock/spiders/sinastock_spider.py | 59 ---------- backend/app/spiders/v2ex_config/Spiderfile | 54 --------- .../v2ex_config/config_spider/__init__.py | 0 .../v2ex_config/config_spider/items.py | 19 --- .../v2ex_config/config_spider/middlewares.py | 103 ---------------- .../v2ex_config/config_spider/pipelines.py | 27 ----- .../v2ex_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 38 ------ backend/app/spiders/v2ex_config/md5.txt | 1 - backend/app/spiders/v2ex_config/scrapy.cfg | 11 -- backend/app/spiders/xueqiu/Spiderfile | 5 - backend/app/spiders/xueqiu/md5.txt | 1 - backend/app/spiders/xueqiu/scrapy.cfg | 11 -- backend/app/spiders/xueqiu/xueqiu/__init__.py | 0 backend/app/spiders/xueqiu/xueqiu/items.py | 23 ---- .../app/spiders/xueqiu/xueqiu/middlewares.py | 103 ---------------- .../app/spiders/xueqiu/xueqiu/pipelines.py | 6 - backend/app/spiders/xueqiu/xueqiu/settings.py | 89 -------------- .../spiders/xueqiu/xueqiu/spiders/__init__.py | 4 - .../xueqiu/xueqiu/spiders/xueqiu_spider.py | 46 -------- backend/app/spiders/xueqiu_config/Spiderfile | 39 ------ .../xueqiu_config/config_spider/__init__.py | 0 .../xueqiu_config/config_spider/items.py | 18 --- .../config_spider/middlewares.py | 103 ---------------- .../xueqiu_config/config_spider/pipelines.py | 27 ----- .../xueqiu_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 33 ------ backend/app/spiders/xueqiu_config/md5.txt | 1 - backend/app/spiders/xueqiu_config/scrapy.cfg | 11 -- .../app/spiders/zongheng_config/Spiderfile | 45 ------- .../zongheng_config/config_spider/__init__.py | 0 .../zongheng_config/config_spider/items.py | 19 --- .../config_spider/middlewares.py | 103 ---------------- .../config_spider/pipelines.py | 27 ----- .../zongheng_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 34 ------ backend/app/spiders/zongheng_config/md5.txt | 1 - .../app/spiders/zongheng_config/scrapy.cfg | 11 -- 123 files changed, 4102 deletions(-) delete mode 100755 backend/app/spiders/amazon_config/Spiderfile delete mode 100755 backend/app/spiders/amazon_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/items.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/pipelines.py delete mode 100755 
backend/app/spiders/amazon_config/config_spider/settings.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/amazon_config/md5.txt delete mode 100755 backend/app/spiders/amazon_config/scrapy.cfg delete mode 100755 backend/app/spiders/autohome_config/Spiderfile delete mode 100755 backend/app/spiders/autohome_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/items.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/settings.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/autohome_config/md5.txt delete mode 100755 backend/app/spiders/autohome_config/scrapy.cfg delete mode 100755 backend/app/spiders/baidu_config/Spiderfile delete mode 100755 backend/app/spiders/baidu_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/items.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/settings.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/baidu_config/md5.txt delete mode 100755 backend/app/spiders/baidu_config/scrapy.cfg delete mode 100755 backend/app/spiders/bing_general/Spiderfile delete mode 100755 backend/app/spiders/bing_general/bing_spider.py delete mode 100755 backend/app/spiders/bing_general/md5.txt delete mode 100755 backend/app/spiders/chinaz/Spiderfile delete mode 100755 backend/app/spiders/chinaz/chinaz/__init__.py delete mode 100755 backend/app/spiders/chinaz/chinaz/items.py delete mode 100755 backend/app/spiders/chinaz/chinaz/middlewares.py delete mode 100755 backend/app/spiders/chinaz/chinaz/pipelines.py delete mode 100755 backend/app/spiders/chinaz/chinaz/settings.py delete mode 100755 backend/app/spiders/chinaz/chinaz/spiders/__init__.py delete mode 100755 backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py delete mode 100755 backend/app/spiders/chinaz/md5.txt delete mode 100755 backend/app/spiders/chinaz/scrapy.cfg delete mode 100755 backend/app/spiders/csdn_config/Spiderfile delete mode 100755 backend/app/spiders/csdn_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/items.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/settings.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/csdn_config/md5.txt delete mode 100755 backend/app/spiders/csdn_config/scrapy.cfg delete mode 100755 backend/app/spiders/douban_config/Spiderfile delete mode 100755 
backend/app/spiders/douban_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/douban_config/config_spider/items.py delete mode 100755 backend/app/spiders/douban_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/douban_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/douban_config/config_spider/settings.py delete mode 100755 backend/app/spiders/douban_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/douban_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/douban_config/md5.txt delete mode 100755 backend/app/spiders/douban_config/scrapy.cfg delete mode 100755 backend/app/spiders/jd/Spiderfile delete mode 100755 backend/app/spiders/jd/jd/__init__.py delete mode 100755 backend/app/spiders/jd/jd/items.py delete mode 100755 backend/app/spiders/jd/jd/middlewares.py delete mode 100755 backend/app/spiders/jd/jd/pipelines.py delete mode 100755 backend/app/spiders/jd/jd/settings.py delete mode 100755 backend/app/spiders/jd/jd/spiders/__init__.py delete mode 100755 backend/app/spiders/jd/jd/spiders/jd_spider.py delete mode 100755 backend/app/spiders/jd/md5.txt delete mode 100755 backend/app/spiders/jd/scrapy.cfg delete mode 100755 backend/app/spiders/sinastock/Spiderfile delete mode 100755 backend/app/spiders/sinastock/md5.txt delete mode 100755 backend/app/spiders/sinastock/scrapy.cfg delete mode 100755 backend/app/spiders/sinastock/sinastock/__init__.py delete mode 100755 backend/app/spiders/sinastock/sinastock/items.py delete mode 100755 backend/app/spiders/sinastock/sinastock/middlewares.py delete mode 100755 backend/app/spiders/sinastock/sinastock/pipelines.py delete mode 100755 backend/app/spiders/sinastock/sinastock/settings.py delete mode 100755 backend/app/spiders/sinastock/sinastock/spiders/__init__.py delete mode 100755 backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py delete mode 100755 backend/app/spiders/v2ex_config/Spiderfile delete mode 100755 backend/app/spiders/v2ex_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/items.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/settings.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/v2ex_config/md5.txt delete mode 100755 backend/app/spiders/v2ex_config/scrapy.cfg delete mode 100755 backend/app/spiders/xueqiu/Spiderfile delete mode 100755 backend/app/spiders/xueqiu/md5.txt delete mode 100755 backend/app/spiders/xueqiu/scrapy.cfg delete mode 100755 backend/app/spiders/xueqiu/xueqiu/__init__.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/items.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/middlewares.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/pipelines.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/settings.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py delete mode 100755 backend/app/spiders/xueqiu_config/Spiderfile delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/items.py delete 
mode 100755 backend/app/spiders/xueqiu_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/settings.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/xueqiu_config/md5.txt delete mode 100755 backend/app/spiders/xueqiu_config/scrapy.cfg delete mode 100755 backend/app/spiders/zongheng_config/Spiderfile delete mode 100755 backend/app/spiders/zongheng_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/items.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/settings.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/zongheng_config/md5.txt delete mode 100755 backend/app/spiders/zongheng_config/scrapy.cfg diff --git a/backend/app/spiders/amazon_config/Spiderfile b/backend/app/spiders/amazon_config/Spiderfile deleted file mode 100755 index eea8a538..00000000 --- a/backend/app/spiders/amazon_config/Spiderfile +++ /dev/null @@ -1,51 +0,0 @@ -name: "amazon_config" -display_name: "亚马逊中国(可配置)" -remark: "亚马逊中国搜索手机,列表+分页" -type: "configurable" -col: "results_amazon_config" -engine: scrapy -start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2 -start_stage: list -stages: -- name: list - is_list: true - list_css: .s-result-item - list_xpath: "" - page_css: .a-last > a - page_xpath: "" - page_attr: href - fields: - - name: title - css: span.a-text-normal - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: .a-link-normal - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: price - css: "" - xpath: .//*[@class="a-price-whole"] - attr: "" - next_stage: "" - remark: "" - - name: price_fraction - css: "" - xpath: .//*[@class="a-price-fraction"] - attr: "" - next_stage: "" - remark: "" - - name: img - css: .s-image-square-aspect > img - xpath: "" - attr: src - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/amazon_config/config_spider/__init__.py b/backend/app/spiders/amazon_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/amazon_config/config_spider/items.py b/backend/app/spiders/amazon_config/config_spider/items.py deleted file mode 100755 index 79bf0adb..00000000 --- a/backend/app/spiders/amazon_config/config_spider/items.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - price = scrapy.Field() - price_fraction = scrapy.Field() - img = scrapy.Field() - diff 
--git a/backend/app/spiders/amazon_config/config_spider/middlewares.py b/backend/app/spiders/amazon_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/amazon_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/amazon_config/config_spider/pipelines.py b/backend/app/spiders/amazon_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/amazon_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/amazon_config/config_spider/settings.py b/backend/app/spiders/amazon_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/amazon_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py b/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/spider.py b/backend/app/spiders/amazon_config/config_spider/spiders/spider.py deleted file mode 100755 index a7421df3..00000000 --- a/backend/app/spiders/amazon_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.s-result-item'): - item = Item() - item['title'] = elem.css('span.a-text-normal::text').extract_first() - item['url'] = elem.css('.a-link-normal::attr("href")').extract_first() - item['price'] = elem.xpath('string(.//*[@class="a-price-whole"])').extract_first() - item['price_fraction'] = elem.xpath('string(.//*[@class="a-price-fraction"])').extract_first() - item['img'] = elem.css('.s-image-square-aspect > img::attr("src")').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - next_url = response.css('.a-last > a::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - diff --git a/backend/app/spiders/amazon_config/md5.txt b/backend/app/spiders/amazon_config/md5.txt deleted file mode 100755 index 52c5423f..00000000 --- a/backend/app/spiders/amazon_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -4b716dd3c15b993ccb7a9f0be1cc0de9 diff --git a/backend/app/spiders/amazon_config/scrapy.cfg b/backend/app/spiders/amazon_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/amazon_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/autohome_config/Spiderfile b/backend/app/spiders/autohome_config/Spiderfile deleted file mode 100755 index e69880cb..00000000 --- a/backend/app/spiders/autohome_config/Spiderfile +++ /dev/null @@ -1,57 +0,0 @@ -name: "autohome_config" -display_name: "汽车之家(可配置)" -remark: "汽车之家文章,列表+详情+分页" -type: "configurable" -col: 
"results_autohome_config" -engine: scrapy -start_url: https://www.autohome.com.cn/all/ -start_stage: list -stages: -- name: list - is_list: true - list_css: ul.article > li - list_xpath: "" - page_css: a.page-item-next - page_xpath: "" - page_attr: href - fields: - - name: title - css: li > a > h3 - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: li > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: abstract - css: li > a > p - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: time - css: li > a .fn-left - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: views - css: li > a .fn-right > em:first-child - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: comments - css: li > a .fn-right > em:last-child - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/autohome_config/config_spider/__init__.py b/backend/app/spiders/autohome_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/autohome_config/config_spider/items.py b/backend/app/spiders/autohome_config/config_spider/items.py deleted file mode 100755 index 206203d5..00000000 --- a/backend/app/spiders/autohome_config/config_spider/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - time = scrapy.Field() - views = scrapy.Field() - comments = scrapy.Field() - diff --git a/backend/app/spiders/autohome_config/config_spider/middlewares.py b/backend/app/spiders/autohome_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/autohome_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. 
- pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/autohome_config/config_spider/pipelines.py b/backend/app/spiders/autohome_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/autohome_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/autohome_config/config_spider/settings.py b/backend/app/spiders/autohome_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/autohome_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains 
only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', 
setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py b/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/spider.py b/backend/app/spiders/autohome_config/config_spider/spiders/spider.py deleted file mode 100755 index 83753f5a..00000000 --- a/backend/app/spiders/autohome_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://www.autohome.com.cn/all/', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('ul.article > li'): - item = Item() - item['title'] = elem.css('li > a > h3::text').extract_first() - item['url'] = elem.css('li > a::attr("href")').extract_first() - item['abstract'] = elem.css('li > a > p::text').extract_first() - item['time'] = elem.css('li > a .fn-left::text').extract_first() - item['views'] = elem.css('li > a .fn-right > em:first-child::text').extract_first() - item['comments'] = elem.css('li > a .fn-right > em:last-child::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - next_url = response.css('a.page-item-next::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - diff --git a/backend/app/spiders/autohome_config/md5.txt b/backend/app/spiders/autohome_config/md5.txt deleted file mode 100755 index c4707adf..00000000 --- a/backend/app/spiders/autohome_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -d784a11085e298eaf344eadc3a3e9411 diff --git a/backend/app/spiders/autohome_config/scrapy.cfg b/backend/app/spiders/autohome_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/autohome_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/baidu_config/Spiderfile b/backend/app/spiders/baidu_config/Spiderfile deleted file mode 100755 index a29d4acb..00000000 --- a/backend/app/spiders/baidu_config/Spiderfile +++ /dev/null @@ -1,39 +0,0 @@ -name: "baidu_config" -display_name: "百度搜索(可配置)" -remark: "百度搜索Crawlab,列表+分页" -type: "configurable" -col: 
"results_baidu_config" -engine: scrapy -start_url: http://www.baidu.com/s?wd=crawlab -start_stage: list -stages: -- name: list - is_list: true - list_css: ".result.c-container" - list_xpath: "" - page_css: "a.n" - page_xpath: "" - page_attr: href - fields: - - name: title - css: "" - xpath: .//h3/a - attr: "" - next_stage: "" - remark: "" - - name: url - css: "" - xpath: .//h3/a - attr: href - next_stage: "" - remark: "" - - name: abstract - css: "" - xpath: .//*[@class="c-abstract"] - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/baidu_config/config_spider/__init__.py b/backend/app/spiders/baidu_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/baidu_config/config_spider/items.py b/backend/app/spiders/baidu_config/config_spider/items.py deleted file mode 100755 index 9282765f..00000000 --- a/backend/app/spiders/baidu_config/config_spider/items.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - diff --git a/backend/app/spiders/baidu_config/config_spider/middlewares.py b/backend/app/spiders/baidu_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/baidu_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). 
- for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/baidu_config/config_spider/pipelines.py b/backend/app/spiders/baidu_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/baidu_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/baidu_config/config_spider/settings.py b/backend/app/spiders/baidu_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/baidu_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py b/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/spider.py b/backend/app/spiders/baidu_config/config_spider/spiders/spider.py deleted file mode 100755 index e5fd793f..00000000 --- a/backend/app/spiders/baidu_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='http://www.baidu.com/s?wd=crawlab', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.result.c-container'): - item = Item() - item['title'] = elem.xpath('string(.//h3/a)').extract_first() - item['url'] = elem.xpath('.//h3/a/@href').extract_first() - item['abstract'] = elem.xpath('string(.//*[@class="c-abstract"])').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - next_url = response.css('a.n::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - diff --git a/backend/app/spiders/baidu_config/md5.txt b/backend/app/spiders/baidu_config/md5.txt deleted file mode 100755 index 32137b76..00000000 --- a/backend/app/spiders/baidu_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -ba25f6f3567b256473d3f0ec6af783fd diff --git a/backend/app/spiders/baidu_config/scrapy.cfg b/backend/app/spiders/baidu_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/baidu_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/bing_general/Spiderfile b/backend/app/spiders/bing_general/Spiderfile deleted file mode 100755 index 614c135e..00000000 --- a/backend/app/spiders/bing_general/Spiderfile +++ /dev/null @@ -1,6 +0,0 @@ -name: "bing_general" -display_name: "必应搜索 (通用)" -remark: "必应搜索 Crawlab,列表+分页" -col: "results_bing_general" -type: "customized" -cmd: "python bing_spider.py" \ No newline at end of file diff --git a/backend/app/spiders/bing_general/bing_spider.py b/backend/app/spiders/bing_general/bing_spider.py deleted file mode 100755 index e982e4ee..00000000 --- a/backend/app/spiders/bing_general/bing_spider.py +++ /dev/null @@ -1,41 +0,0 @@ -import 
requests -from bs4 import BeautifulSoup as bs -from urllib.parse import urljoin, urlparse -import re -from crawlab import save_item - -s = requests.Session() - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -def start_requests(): - for i in range(0, 9): - fr = 'PERE' if not i else 'MORE' - url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}' - request_page(url) - -def request_page(url): - print(f'requesting {url}') - r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}) - parse_list(r) - -def parse_list(response): - soup = bs(response.content.decode('utf-8')) - for el in list(soup.select('#b_results > li')): - try: - save_item({ - 'title': el.select_one('h2').text, - 'url': el.select_one('h2 a').attrs.get('href'), - 'abstract': el.select_one('.b_caption p').text, - }) - except: - pass - -if __name__ == '__main__': - start_requests() \ No newline at end of file diff --git a/backend/app/spiders/bing_general/md5.txt b/backend/app/spiders/bing_general/md5.txt deleted file mode 100755 index 42fb6afd..00000000 --- a/backend/app/spiders/bing_general/md5.txt +++ /dev/null @@ -1 +0,0 @@ -cf295b694a20c99c4857f838aa0402a7 diff --git a/backend/app/spiders/chinaz/Spiderfile b/backend/app/spiders/chinaz/Spiderfile deleted file mode 100755 index 2fb940bb..00000000 --- a/backend/app/spiders/chinaz/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "chinaz" -display_name: "站长之家 (Scrapy)" -col: "results_chinaz" -type: "customized" -cmd: "scrapy crawl chinaz_spider" \ No newline at end of file diff --git a/backend/app/spiders/chinaz/chinaz/__init__.py b/backend/app/spiders/chinaz/chinaz/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/chinaz/chinaz/items.py b/backend/app/spiders/chinaz/chinaz/items.py deleted file mode 100755 index 1fdcac1b..00000000 --- a/backend/app/spiders/chinaz/chinaz/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class ChinazItem(scrapy.Item): - # define the fields for your item here like: - _id = scrapy.Field() - task_id = scrapy.Field() - name = scrapy.Field() - domain = scrapy.Field() - description = scrapy.Field() - rank = scrapy.Field() - main_category = scrapy.Field() - category = scrapy.Field() - location = scrapy.Field() diff --git a/backend/app/spiders/chinaz/chinaz/middlewares.py b/backend/app/spiders/chinaz/chinaz/middlewares.py deleted file mode 100755 index c98995d5..00000000 --- a/backend/app/spiders/chinaz/chinaz/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ChinazSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. 
- s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ChinazDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/chinaz/chinaz/pipelines.py b/backend/app/spiders/chinaz/chinaz/pipelines.py deleted file mode 100755 index b29f9eb7..00000000 --- a/backend/app/spiders/chinaz/chinaz/pipelines.py +++ /dev/null @@ -1,7 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html - diff --git a/backend/app/spiders/chinaz/chinaz/settings.py b/backend/app/spiders/chinaz/chinaz/settings.py deleted file mode 100755 index 932ec9ed..00000000 --- a/backend/app/spiders/chinaz/chinaz/settings.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for chinaz project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'chinaz' - -SPIDER_MODULES = ['chinaz.spiders'] -NEWSPIDER_MODULE = 'chinaz.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'chinaz (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'chinaz.middlewares.ChinazSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'chinaz.middlewares.ChinazDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of 
requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/chinaz/chinaz/spiders/__init__.py b/backend/app/spiders/chinaz/chinaz/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/chinaz/chinaz/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py b/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py deleted file mode 100755 index 28ad84e7..00000000 --- a/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -from chinaz.items import ChinazItem - - -class ChinazSpiderSpider(scrapy.Spider): - name = 'chinaz_spider' - allowed_domains = ['chinaz.com'] - start_urls = ['http://top.chinaz.com/hangye/'] - - def parse(self, response): - for item in response.css('.listCentent > li'): - name = item.css('h3.rightTxtHead > a::text').extract_first() - href = item.css('h3.rightTxtHead > a::attr("href")').extract_first() - domain = item.css('h3.rightTxtHead > span::text').extract_first() - description = item.css('p.RtCInfo::text').extract_first() - rank = item.css('.RtCRateCent > strong::text').extract_first() - rank = int(rank) - item = ChinazItem( - _id=domain, - name=name, - domain=domain, - description=description, - rank=rank, - ) - yield scrapy.Request( - url='http://top.chinaz.com' + href, - callback=self.parse_item, - meta={ - 'item': item - } - ) - - # pagination - a_list = response.css('.ListPageWrap > a::attr("href")').extract() - url = 'http://top.chinaz.com/hangye/' + a_list[-1] - yield scrapy.Request(url=url, callback=self.parse) - - def parse_item(self, response): - item = response.meta['item'] - - # category info extraction - arr = response.css('.TopMainTag-show .SimSun') - res1 = arr[0].css('a::text').extract() - main_category = res1[0] - if len(res1) == 1: - category = '其他' - else: - category = res1[1] - - # location info extraction - res2 = arr[1].css('a::text').extract() - if len(res2) > 0: - location = res2[0] - else: - location = '其他' - - # assign values to item - item['main_category'] = main_category - item['category'] = category - item['location'] = location - - yield item diff --git a/backend/app/spiders/chinaz/md5.txt b/backend/app/spiders/chinaz/md5.txt deleted file mode 100755 index f5e15fb9..00000000 --- a/backend/app/spiders/chinaz/md5.txt +++ /dev/null @@ -1 +0,0 @@ -1976593e49bf0238602ce35d051bd137 diff --git a/backend/app/spiders/chinaz/scrapy.cfg b/backend/app/spiders/chinaz/scrapy.cfg deleted file mode 100755 index d3b44a1a..00000000 --- a/backend/app/spiders/chinaz/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] 
-default = chinaz.settings - -[deploy] -#url = http://localhost:6800/ -project = chinaz diff --git a/backend/app/spiders/csdn_config/Spiderfile b/backend/app/spiders/csdn_config/Spiderfile deleted file mode 100755 index 67f4f8c5..00000000 --- a/backend/app/spiders/csdn_config/Spiderfile +++ /dev/null @@ -1,60 +0,0 @@ -name: "csdn_config" -display_name: "CSDN(可配置)" -remark: "CSDN Crawlab 文章,列表+详情+分页" -type: "configurable" -col: "results_csdn_config" -engine: scrapy -start_url: https://so.csdn.net/so/search/s.do?q=crawlab -start_stage: list -stages: -- name: list - is_list: true - list_css: .search-list-con > .search-list - list_xpath: "" - page_css: a.btn-next - page_xpath: "" - page_attr: href - fields: - - name: url - css: "" - xpath: .//*[@class="limit_width"]/a - attr: href - next_stage: detail - remark: "" -- name: detail - is_list: false - list_css: "" - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: content - css: "" - xpath: .//div[@id="content_views"] - attr: "" - next_stage: "" - remark: "" - - name: views - css: .read-count - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: title - css: .title-article - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: author - css: .follow-nickName - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - AUTOTHROTTLE_ENABLED: "false" - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/csdn_config/config_spider/__init__.py b/backend/app/spiders/csdn_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/csdn_config/config_spider/items.py b/backend/app/spiders/csdn_config/config_spider/items.py deleted file mode 100755 index 3c8e5e54..00000000 --- a/backend/app/spiders/csdn_config/config_spider/items.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - url = scrapy.Field() - content = scrapy.Field() - views = scrapy.Field() - title = scrapy.Field() - author = scrapy.Field() - diff --git a/backend/app/spiders/csdn_config/config_spider/middlewares.py b/backend/app/spiders/csdn_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/csdn_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. 
- return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/csdn_config/config_spider/pipelines.py b/backend/app/spiders/csdn_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/csdn_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/csdn_config/config_spider/settings.py b/backend/app/spiders/csdn_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/csdn_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py b/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/spider.py b/backend/app/spiders/csdn_config/config_spider/spiders/spider.py deleted file mode 100755 index 9ecc4aae..00000000 --- a/backend/app/spiders/csdn_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,41 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://so.csdn.net/so/search/s.do?q=crawlab', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.search-list-con > .search-list'): - item = Item() - item['url'] = elem.xpath('.//*[@class="limit_width"]/a/@href').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) - next_url = response.css('a.btn-next::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - def parse_detail(self, response): - item = Item() if response.meta.get('item') is None else response.meta.get('item') - item['content'] = response.xpath('string(.//div[@id="content_views"])').extract_first() - item['views'] = response.css('.read-count::text').extract_first() - item['title'] = response.css('.title-article::text').extract_first() - item['author'] = response.css('.follow-nickName::text').extract_first() - yield item - - diff --git a/backend/app/spiders/csdn_config/md5.txt b/backend/app/spiders/csdn_config/md5.txt deleted file mode 100755 index e169c42a..00000000 --- a/backend/app/spiders/csdn_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -b6889c74e006a5e619b525d84db62ffd diff --git a/backend/app/spiders/csdn_config/scrapy.cfg b/backend/app/spiders/csdn_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/csdn_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/douban_config/Spiderfile b/backend/app/spiders/douban_config/Spiderfile deleted file mode 100755 index 84f0647a..00000000 --- a/backend/app/spiders/douban_config/Spiderfile +++ /dev/null @@ -1,57 +0,0 @@ -name: 
"douban_config" -display_name: "豆瓣读书(可配置)" -remark: "豆瓣读书新书推荐,列表" -type: "configurable" -col: "results_douban_config" -engine: scrapy -start_url: https://book.douban.com/latest -start_stage: list -stages: -- name: list - is_list: true - list_css: ul.cover-col-4 > li - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: title - css: h2 > a - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: h2 > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: img - css: a.cover img - xpath: "" - attr: src - next_stage: "" - remark: "" - - name: rating - css: p.rating > .color-lightgray - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: abstract - css: p:last-child - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: info - css: .color-gray - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/douban_config/config_spider/__init__.py b/backend/app/spiders/douban_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/douban_config/config_spider/items.py b/backend/app/spiders/douban_config/config_spider/items.py deleted file mode 100755 index d6959b8d..00000000 --- a/backend/app/spiders/douban_config/config_spider/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - img = scrapy.Field() - rating = scrapy.Field() - abstract = scrapy.Field() - info = scrapy.Field() - diff --git a/backend/app/spiders/douban_config/config_spider/middlewares.py b/backend/app/spiders/douban_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/douban_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. 
- - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/douban_config/config_spider/pipelines.py b/backend/app/spiders/douban_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/douban_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/douban_config/config_spider/settings.py b/backend/app/spiders/douban_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/douban_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy 
settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not 
None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/douban_config/config_spider/spiders/__init__.py b/backend/app/spiders/douban_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/douban_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/douban_config/config_spider/spiders/spider.py b/backend/app/spiders/douban_config/config_spider/spiders/spider.py deleted file mode 100755 index 61bb648d..00000000 --- a/backend/app/spiders/douban_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://book.douban.com/latest', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('ul.cover-col-4 > li'): - item = Item() - item['title'] = elem.css('h2 > a::text').extract_first() - item['url'] = elem.css('h2 > a::attr("href")').extract_first() - item['img'] = elem.css('a.cover img::attr("src")').extract_first() - item['rating'] = elem.css('p.rating > .color-lightgray::text').extract_first() - item['abstract'] = elem.css('p:last-child::text').extract_first() - item['info'] = elem.css('.color-gray::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - - diff --git a/backend/app/spiders/douban_config/md5.txt b/backend/app/spiders/douban_config/md5.txt deleted file mode 100755 index 374e3804..00000000 --- a/backend/app/spiders/douban_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -4d59a6c83b0e125d5321beae86bb93ce diff --git a/backend/app/spiders/douban_config/scrapy.cfg b/backend/app/spiders/douban_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/douban_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/jd/Spiderfile b/backend/app/spiders/jd/Spiderfile deleted file mode 100755 index d090472b..00000000 --- a/backend/app/spiders/jd/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "jd" -display_name: "京东 (Scrapy)" -col: "results_jd" -type: "customized" -cmd: "scrapy crawl jd_spider" \ No newline at end of file diff --git a/backend/app/spiders/jd/jd/__init__.py b/backend/app/spiders/jd/jd/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git 
a/backend/app/spiders/jd/jd/items.py b/backend/app/spiders/jd/jd/items.py deleted file mode 100755 index b2c5e647..00000000 --- a/backend/app/spiders/jd/jd/items.py +++ /dev/null @@ -1,15 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class JdItem(scrapy.Item): - # define the fields for your item here like: - name = scrapy.Field() - price = scrapy.Field() - url = scrapy.Field() diff --git a/backend/app/spiders/jd/jd/middlewares.py b/backend/app/spiders/jd/jd/middlewares.py deleted file mode 100755 index 6fceded5..00000000 --- a/backend/app/spiders/jd/jd/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class JdSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class JdDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. 
- - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/jd/jd/pipelines.py b/backend/app/spiders/jd/jd/pipelines.py deleted file mode 100755 index 5a7d7cbf..00000000 --- a/backend/app/spiders/jd/jd/pipelines.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/jd/jd/settings.py b/backend/app/spiders/jd/jd/settings.py deleted file mode 100755 index ef89ed0c..00000000 --- a/backend/app/spiders/jd/jd/settings.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for jd project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'jd' - -SPIDER_MODULES = ['jd.spiders'] -NEWSPIDER_MODULE = 'jd.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'jd (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'jd.middlewares.JdSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'jd.middlewares.JdDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See 
https://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/jd/jd/spiders/__init__.py b/backend/app/spiders/jd/jd/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/jd/jd/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/jd/jd/spiders/jd_spider.py b/backend/app/spiders/jd/jd/spiders/jd_spider.py deleted file mode 100755 index 4ec94fa9..00000000 --- a/backend/app/spiders/jd/jd/spiders/jd_spider.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy - -from jd.items import JdItem - - -class JdSpiderSpider(scrapy.Spider): - name = 'jd_spider' - allowed_domains = ['jd.com'] - - def start_requests(self): - for i in range(1, 50): - yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}') - - def parse(self, response): - for el in response.css('.gl-item'): - yield JdItem( - url=el.css('.p-name > a::attr("href")').extract_first(), - name=el.css('.p-name > a::attr("title")').extract_first(), - price=float(el.css('.p-price i::text').extract_first()), - ) diff --git a/backend/app/spiders/jd/md5.txt b/backend/app/spiders/jd/md5.txt deleted file mode 100755 index dcd53f51..00000000 --- a/backend/app/spiders/jd/md5.txt +++ /dev/null @@ -1 +0,0 @@ -621486d31459514eb27a082d159d9b8c diff --git a/backend/app/spiders/jd/scrapy.cfg b/backend/app/spiders/jd/scrapy.cfg deleted file mode 100755 index 87cf0280..00000000 --- a/backend/app/spiders/jd/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = jd.settings - -[deploy] -#url = http://localhost:6800/ -project = jd diff --git a/backend/app/spiders/sinastock/Spiderfile b/backend/app/spiders/sinastock/Spiderfile deleted file mode 100755 index b110cb48..00000000 --- a/backend/app/spiders/sinastock/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "sinastock" -display_name: "新浪股票 (Scrapy)" -type: "customized" -col: "results_sinastock" -cmd: "scrapy crawl sinastock_spider" \ No newline at end of file diff --git a/backend/app/spiders/sinastock/md5.txt b/backend/app/spiders/sinastock/md5.txt deleted file mode 100755 index 1e5d8ab9..00000000 --- a/backend/app/spiders/sinastock/md5.txt +++ /dev/null @@ -1 +0,0 @@ -80bc091fa45ef4a85c9f1a66c81a4ed7 diff --git a/backend/app/spiders/sinastock/scrapy.cfg b/backend/app/spiders/sinastock/scrapy.cfg deleted file mode 100755 index 4969ad96..00000000 --- 
a/backend/app/spiders/sinastock/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = sinastock.settings - -[deploy] -#url = http://localhost:6800/ -project = sinastock diff --git a/backend/app/spiders/sinastock/sinastock/__init__.py b/backend/app/spiders/sinastock/sinastock/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/sinastock/sinastock/items.py b/backend/app/spiders/sinastock/sinastock/items.py deleted file mode 100755 index 6e3e5d8e..00000000 --- a/backend/app/spiders/sinastock/sinastock/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class NewsItem(scrapy.Item): - # define the fields for your item here like: - _id = scrapy.Field() - title = scrapy.Field() - ts_str = scrapy.Field() - ts = scrapy.Field() - url = scrapy.Field() - text = scrapy.Field() - task_id = scrapy.Field() - source = scrapy.Field() - stocks = scrapy.Field() diff --git a/backend/app/spiders/sinastock/sinastock/middlewares.py b/backend/app/spiders/sinastock/sinastock/middlewares.py deleted file mode 100755 index 912b5e57..00000000 --- a/backend/app/spiders/sinastock/sinastock/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class SinastockSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class SinastockDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. 
- - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/sinastock/sinastock/pipelines.py b/backend/app/spiders/sinastock/sinastock/pipelines.py deleted file mode 100755 index 5a7d7cbf..00000000 --- a/backend/app/spiders/sinastock/sinastock/pipelines.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/sinastock/sinastock/settings.py b/backend/app/spiders/sinastock/sinastock/settings.py deleted file mode 100755 index 3e01d3ca..00000000 --- a/backend/app/spiders/sinastock/sinastock/settings.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for sinastock project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'sinastock' - -SPIDER_MODULES = ['sinastock.spiders'] -NEWSPIDER_MODULE = 'sinastock.spiders' - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -# USER_AGENT = 'sinastock (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -# CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -# DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -# CONCURRENT_REQUESTS_PER_DOMAIN = 16 -# CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -# COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -# TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -# DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -# } - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -# SPIDER_MIDDLEWARES = { -# 'sinastock.middlewares.SinastockSpiderMiddleware': 543, -# } - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# DOWNLOADER_MIDDLEWARES = { -# 'sinastock.middlewares.SinastockDownloaderMiddleware': 543, -# } - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -# EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -# } - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -# AUTOTHROTTLE_ENABLED = True -# The initial download delay -# AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -# AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -# AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -# HTTPCACHE_ENABLED = True -# HTTPCACHE_EXPIRATION_SECS = 0 -# HTTPCACHE_DIR = 'httpcache' -# HTTPCACHE_IGNORE_HTTP_CODES = [] -# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/sinastock/sinastock/spiders/__init__.py b/backend/app/spiders/sinastock/sinastock/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/sinastock/sinastock/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
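
For readers trying to reproduce what these removed projects did: the sinastock settings above rely on the shared crawlab.pipelines.CrawlabMongoPipeline rather than a project-local pipeline, and the spider in the next hunk reads its stock codes straight from MongoDB. A minimal local run might look like the sketch below. This is an illustration under assumptions, not part of this patch: it presumes a MongoDB instance on localhost and that the working directory is backend/app/spiders/sinastock, where scrapy.cfg lives; the local_run.py name is made up.

# local_run.py -- hypothetical helper, not present in the repository
import os
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Environment variables read by sinastock_spider.py (the defaults here are assumptions)
os.environ.setdefault('MONGO_HOST', 'localhost')
os.environ.setdefault('MONGO_PORT', '27017')
os.environ.setdefault('MONGO_DB', 'crawlab_test')
os.environ.setdefault('CRAWLAB_COLLECTION', 'stock_news')

process = CrawlerProcess(get_project_settings())
process.crawl('sinastock_spider')  # spider name declared in the hunk that follows
process.start()
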
diff --git a/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py b/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py deleted file mode 100755 index 54daf763..00000000 --- a/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py +++ /dev/null @@ -1,59 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -from datetime import datetime - -import scrapy -from pymongo import MongoClient - -from sinastock.items import NewsItem - -class SinastockSpiderSpider(scrapy.Spider): - name = 'sinastock_spider' - allowed_domains = ['finance.sina.com.cn'] - mongo = MongoClient( - host=os.environ.get('MONGO_HOST') or 'localhost', - port=int(os.environ.get('MONGO_PORT') or 27017) - ) - db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test'] - col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news') - - def start_requests(self): - col = self.db['stocks'] - for s in col.find({}): - code, ex = s['ts_code'].split('.') - for i in range(10): - url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={ex.lower()}{code}&Page={i + 1}' - yield scrapy.Request( - url=url, - callback=self.parse, - meta={'ts_code': s['ts_code']} - ) - - def parse(self, response): - for a in response.css('.datelist > ul > a'): - url = a.css('a::attr("href")').extract_first() - item = NewsItem( - title=a.css('a::text').extract_first(), - url=url, - source='sina', - stocks=[response.meta['ts_code']] - ) - yield scrapy.Request( - url=url, - callback=self.parse_detail, - meta={'item': item} - ) - - def parse_detail(self, response): - item = response.meta['item'] - text = response.css('#artibody').extract_first() - pre = re.compile('>(.*?)<') - text = ''.join(pre.findall(text)) - item['text'] = text.replace('\u3000', '') - item['ts_str'] = response.css('.date::text').extract_first() - if item['text'] is None or item['ts_str'] is None: - pass - else: - item['ts'] = datetime.strptime(item['ts_str'], '%Y年%m月%d日 %H:%M') - yield item diff --git a/backend/app/spiders/v2ex_config/Spiderfile b/backend/app/spiders/v2ex_config/Spiderfile deleted file mode 100755 index bb18d40a..00000000 --- a/backend/app/spiders/v2ex_config/Spiderfile +++ /dev/null @@ -1,54 +0,0 @@ -name: "v2ex_config" -display_name: "V2ex(可配置)" -remark: "V2ex,列表+详情" -type: "configurable" -col: "results_v2ex_config" -engine: scrapy -start_url: https://v2ex.com/ -start_stage: list -stages: -- name: list - is_list: true - list_css: .cell.item - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: href - fields: - - name: title - css: a.topic-link - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: a.topic-link - xpath: "" - attr: href - next_stage: detail - remark: "" - - name: replies - css: .count_livid - xpath: "" - attr: "" - next_stage: "" - remark: "" -- name: detail - is_list: false - list_css: "" - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: content - css: "" - xpath: .//*[@class="markdown_body"] - attr: "" - next_stage: "" - remark: "" -settings: - AUTOTHROTTLE_ENABLED: "true" - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/v2ex_config/config_spider/__init__.py b/backend/app/spiders/v2ex_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/v2ex_config/config_spider/items.py 
b/backend/app/spiders/v2ex_config/config_spider/items.py deleted file mode 100755 index d2c01a06..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/items.py +++ /dev/null @@ -1,19 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - replies = scrapy.Field() - content = scrapy.Field() - diff --git a/backend/app/spiders/v2ex_config/config_spider/middlewares.py b/backend/app/spiders/v2ex_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. 
- - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/v2ex_config/config_spider/pipelines.py b/backend/app/spiders/v2ex_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/v2ex_config/config_spider/settings.py b/backend/app/spiders/v2ex_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py b/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py b/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py deleted file mode 100755 index 4763e040..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://v2ex.com/', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.cell.item'): - item = Item() - item['title'] = elem.css('a.topic-link::text').extract_first() - item['url'] = elem.css('a.topic-link::attr("href")').extract_first() - item['replies'] = elem.css('.count_livid::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) - - def parse_detail(self, response): - item = Item() if response.meta.get('item') is None else response.meta.get('item') - item['content'] = response.xpath('string(.//*[@class="markdown_body"])').extract_first() - yield item - - diff --git a/backend/app/spiders/v2ex_config/md5.txt b/backend/app/spiders/v2ex_config/md5.txt deleted file mode 100755 index 5d725b2c..00000000 --- a/backend/app/spiders/v2ex_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -402c0a07873ef74b9b574bc0f6b28423 diff --git a/backend/app/spiders/v2ex_config/scrapy.cfg b/backend/app/spiders/v2ex_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/v2ex_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/xueqiu/Spiderfile b/backend/app/spiders/xueqiu/Spiderfile deleted file mode 100755 index 38aa5dbe..00000000 --- a/backend/app/spiders/xueqiu/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "xueqiu" -display_name: "雪球网 (Scrapy)" -type: "customized" -col: "results_xueqiu" -cmd: "scrapy crawl xueqiu_spider" \ No newline at end of file diff --git a/backend/app/spiders/xueqiu/md5.txt b/backend/app/spiders/xueqiu/md5.txt deleted file mode 100755 index 6a9a2072..00000000 --- a/backend/app/spiders/xueqiu/md5.txt +++ /dev/null @@ -1 +0,0 
@@ -df177994199caa691d87fc0c5031326d diff --git a/backend/app/spiders/xueqiu/scrapy.cfg b/backend/app/spiders/xueqiu/scrapy.cfg deleted file mode 100755 index 2c5ce3b3..00000000 --- a/backend/app/spiders/xueqiu/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = xueqiu.settings - -[deploy] -#url = http://localhost:6800/ -project = xueqiu diff --git a/backend/app/spiders/xueqiu/xueqiu/__init__.py b/backend/app/spiders/xueqiu/xueqiu/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/xueqiu/xueqiu/items.py b/backend/app/spiders/xueqiu/xueqiu/items.py deleted file mode 100755 index 5471594d..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/items.py +++ /dev/null @@ -1,23 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class XueqiuItem(scrapy.Item): - # define the fields for your item here like: - _id = scrapy.Field() - task_id = scrapy.Field() - id = scrapy.Field() - text = scrapy.Field() - url = scrapy.Field() - target = scrapy.Field() - view_count = scrapy.Field() - mark = scrapy.Field() - created_at = scrapy.Field() - ts = scrapy.Field() - source = scrapy.Field() diff --git a/backend/app/spiders/xueqiu/xueqiu/middlewares.py b/backend/app/spiders/xueqiu/xueqiu/middlewares.py deleted file mode 100755 index f60102ce..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class XueqiuSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class XueqiuDownloaderMiddleware(object): - # Not all methods need to be defined. 
If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu/xueqiu/pipelines.py b/backend/app/spiders/xueqiu/xueqiu/pipelines.py deleted file mode 100755 index 5a7d7cbf..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/pipelines.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/xueqiu/xueqiu/settings.py b/backend/app/spiders/xueqiu/xueqiu/settings.py deleted file mode 100755 index 1d898e2f..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/settings.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for xueqiu project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'xueqiu' - -SPIDER_MODULES = ['xueqiu.spiders'] -NEWSPIDER_MODULE = 'xueqiu.spiders' - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -# CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -# DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -# CONCURRENT_REQUESTS_PER_DOMAIN = 16 -# CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -# COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -# TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -# DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -# } - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -# SPIDER_MIDDLEWARES = { -# 'xueqiu.middlewares.XueqiuSpiderMiddleware': 543, -# } - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# DOWNLOADER_MIDDLEWARES = { -# 'xueqiu.middlewares.XueqiuDownloaderMiddleware': 543, -# } - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -# EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -# } - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -# AUTOTHROTTLE_ENABLED = True -# The initial download delay -# AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -# AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -# AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -# HTTPCACHE_ENABLED = True -# HTTPCACHE_EXPIRATION_SECS = 0 -# HTTPCACHE_DIR = 'httpcache' -# HTTPCACHE_IGNORE_HTTP_CODES = [] -# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py b/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
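
A note on the configurable projects in this patch: v2ex_config above and xueqiu_config / zongheng_config below all ship the same ConfigSpiderPipeline, which tags each item with CRAWLAB_TASK_ID and persists it via col.save(item). Collection.save() was deprecated in PyMongo 3.0 and removed in 4.0, so anyone resurrecting this code would need something along these lines. The sketch mirrors the pipeline shown in this patch but is not part of it; authentication options are omitted for brevity.

# Hypothetical PyMongo >= 3 variant of the ConfigSpiderPipeline removed here
import os
from pymongo import MongoClient

mongo = MongoClient(
    host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
    port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017),
)
col = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'][os.environ.get('CRAWLAB_COLLECTION') or 'test']
task_id = os.environ.get('CRAWLAB_TASK_ID')

class ConfigSpiderPipeline(object):
    def process_item(self, item, spider):
        item['task_id'] = task_id
        # insert_one replaces the removed Collection.save(); items without a
        # preset _id simply get one generated by MongoDB.
        col.insert_one(dict(item))
        return item
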
diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py b/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py deleted file mode 100755 index a746e156..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py +++ /dev/null @@ -1,46 +0,0 @@ -# -*- coding: utf-8 -*- -import json -from datetime import datetime -from time import sleep - -import scrapy - -from xueqiu.items import XueqiuItem - - -class XueqiuSpiderSpider(scrapy.Spider): - name = 'xueqiu_spider' - allowed_domains = ['xueqiu.com'] - - def start_requests(self): - return [scrapy.Request( - url='https://xueqiu.com', - callback=self.parse_home - )] - - def parse_home(self, response): - yield scrapy.Request( - url='https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=20&category=6' - ) - - def parse(self, response): - data = json.loads(response.body) - next_max_id = data.get('next_max_id') - sleep(1) - for row in data.get('list'): - d = json.loads(row.get('data')) - item = XueqiuItem( - id=d['id'], - text=d['text'], - mark=d['mark'], - url=d['target'], - created_at=d['created_at'], - ts=datetime.fromtimestamp(d['created_at'] / 1e3), - view_count=d['view_count'], - source='xueqiu' - ) - yield item - - yield scrapy.Request( - url=f'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id={next_max_id}&count=20&category=6' - ) diff --git a/backend/app/spiders/xueqiu_config/Spiderfile b/backend/app/spiders/xueqiu_config/Spiderfile deleted file mode 100755 index 0de50e9e..00000000 --- a/backend/app/spiders/xueqiu_config/Spiderfile +++ /dev/null @@ -1,39 +0,0 @@ -name: "xueqiu_config" -display_name: "雪球网(可配置)" -remark: "雪球网新闻,列表" -type: "configurable" -col: "results_xueqiu_config" -engine: scrapy -start_url: https://xueqiu.com/ -start_stage: list -stages: -- name: list - is_list: true - list_css: "" - list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")] - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: title - css: h3 > a - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: h3 > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: abstract - css: p - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/xueqiu_config/config_spider/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/xueqiu_config/config_spider/items.py b/backend/app/spiders/xueqiu_config/config_spider/items.py deleted file mode 100755 index 9282765f..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/items.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - diff --git a/backend/app/spiders/xueqiu_config/config_spider/middlewares.py b/backend/app/spiders/xueqiu_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- 
coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu_config/config_spider/pipelines.py b/backend/app/spiders/xueqiu_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/xueqiu_config/config_spider/settings.py b/backend/app/spiders/xueqiu_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py deleted file mode 100755 index 79d4636b..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,33 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://xueqiu.com/', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.xpath('.//*[contains(@class, "AnonymousHome_home__timeline__item")]'): - item = Item() - item['title'] = elem.css('h3 > a::text').extract_first() - item['url'] = elem.css('h3 > a::attr("href")').extract_first() - item['abstract'] = elem.css('p::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - - diff --git a/backend/app/spiders/xueqiu_config/md5.txt b/backend/app/spiders/xueqiu_config/md5.txt deleted file mode 100755 index 39a6df77..00000000 --- a/backend/app/spiders/xueqiu_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -e3da3aacb2d290cb179a79028fbfff9c diff --git a/backend/app/spiders/xueqiu_config/scrapy.cfg b/backend/app/spiders/xueqiu_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/xueqiu_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/zongheng_config/Spiderfile b/backend/app/spiders/zongheng_config/Spiderfile deleted file mode 100755 index 0163fac7..00000000 --- a/backend/app/spiders/zongheng_config/Spiderfile +++ /dev/null @@ -1,45 +0,0 @@ -name: "zongheng_config" -display_name: "纵横(可配置)" -remark: "纵横小说网,列表" -type: "configurable" -col: "results_zongheng_config" -engine: scrapy -start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1 -start_stage: list -stages: -- name: list - is_list: true - list_css: .rank_d_list - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: href - fields: - - name: title - css: .rank_d_b_name > a - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: .rank_d_b_name > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: abstract - css: body - xpath: 
"" - attr: "" - next_stage: "" - remark: "" - - name: votes - css: .rank_d_b_ticket - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/zongheng_config/config_spider/__init__.py b/backend/app/spiders/zongheng_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/zongheng_config/config_spider/items.py b/backend/app/spiders/zongheng_config/config_spider/items.py deleted file mode 100755 index 528c3187..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/items.py +++ /dev/null @@ -1,19 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - votes = scrapy.Field() - diff --git a/backend/app/spiders/zongheng_config/config_spider/middlewares.py b/backend/app/spiders/zongheng_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. 
- s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/zongheng_config/config_spider/pipelines.py b/backend/app/spiders/zongheng_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/zongheng_config/config_spider/settings.py b/backend/app/spiders/zongheng_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py b/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py b/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py deleted file mode 100755 index cf1b6a08..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='http://www.zongheng.com/rank/details.html?rt=1&d=1', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.rank_d_list'): - item = Item() - item['title'] = elem.css('.rank_d_b_name > a::text').extract_first() - item['url'] = elem.css('.rank_d_b_name > a::attr("href")').extract_first() - item['abstract'] = elem.css('body::text').extract_first() - item['votes'] = elem.css('.rank_d_b_ticket::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - - diff --git a/backend/app/spiders/zongheng_config/md5.txt b/backend/app/spiders/zongheng_config/md5.txt deleted file mode 100755 index 46fd3de6..00000000 --- a/backend/app/spiders/zongheng_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -82cb98a6103fb878501df81f191703ba diff --git a/backend/app/spiders/zongheng_config/scrapy.cfg b/backend/app/spiders/zongheng_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/zongheng_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider
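
Each removed config_spider/settings.py ends with the same loop that overrides Scrapy settings from CRAWLAB_SETTING_* environment variables, presumably exported by the Crawlab runner from the Spiderfile's settings block; the final locals()[setting_name] assignment works because at module level locals() is the module's globals. The standalone sketch below replays that coercion logic outside Scrapy so the behaviour is easy to see. The example variable values are made up; the parsing rules are taken from the settings.py hunks above.

# Illustration of the CRAWLAB_SETTING_ coercion loop from the removed settings.py files
import json
import os
import re

os.environ['CRAWLAB_SETTING_ROBOTSTXT_OBEY'] = 'false'   # e.g. Spiderfile: ROBOTSTXT_OBEY: "false"
os.environ['CRAWLAB_SETTING_DOWNLOAD_DELAY'] = '3'
os.environ['CRAWLAB_SETTING_DEFAULT_REQUEST_HEADERS'] = '{"Accept-Language": "en"}'

settings = {}
for name in [x for x in os.environ if x.startswith('CRAWLAB_SETTING_')]:
    value = os.environ[name]
    if value.lower() == 'true':          # booleans arrive as "true"/"false" strings
        value = True
    elif value.lower() == 'false':
        value = False
    elif re.search(r'^\d+$', value):     # bare integers
        value = int(value)
    elif re.search(r'^\{.*\}$', value.strip()) or re.search(r'^\[.*\]$', value.strip()):
        value = json.loads(value)        # JSON objects / arrays
    settings[name.replace('CRAWLAB_SETTING_', '')] = value

print(settings)
# Expected (assuming no other CRAWLAB_SETTING_ variables are set):
# {'ROBOTSTXT_OBEY': False, 'DOWNLOAD_DELAY': 3, 'DEFAULT_REQUEST_HEADERS': {'Accept-Language': 'en'}}
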