From a9ae673d5257b91a4d1895de75a9479fdb76ca70 Mon Sep 17 00:00:00 2001
From: hantmac
Date: Mon, 27 Apr 2020 15:59:06 +0800
Subject: [PATCH 01/11] support send log to ES

---
 .gitignore | 1 +
 backend/app/spiders/amazon_config/Spiderfile | 51 ++++++++
 .../amazon_config/config_spider/__init__.py | 0
 .../amazon_config/config_spider/items.py | 20 ++++
 .../config_spider/middlewares.py | 103 ++++++++++++++++
 .../amazon_config/config_spider/pipelines.py | 27 +++++
 .../amazon_config/config_spider/settings.py | 111 ++++++++++++++++++
 .../config_spider/spiders/__init__.py | 4 +
 .../config_spider/spiders/spider.py | 37 ++++++
 backend/app/spiders/amazon_config/md5.txt | 1 +
 backend/app/spiders/amazon_config/scrapy.cfg | 11 ++
 .../app/spiders/autohome_config/Spiderfile | 57 +++++++++
 .../autohome_config/config_spider/__init__.py | 0
 .../autohome_config/config_spider/items.py | 21 ++++
 .../config_spider/middlewares.py | 103 ++++++++++++++++
 .../config_spider/pipelines.py | 27 +++++
 .../autohome_config/config_spider/settings.py | 111 ++++++++++++++++++
 .../config_spider/spiders/__init__.py | 4 +
 .../config_spider/spiders/spider.py | 38 ++++++
 backend/app/spiders/autohome_config/md5.txt | 1 +
 .../app/spiders/autohome_config/scrapy.cfg | 11 ++
 backend/app/spiders/baidu_config/Spiderfile | 39 ++++++
 .../baidu_config/config_spider/__init__.py | 0
 .../baidu_config/config_spider/items.py | 18 +++
 .../baidu_config/config_spider/middlewares.py | 103 ++++++++++++++++
 .../baidu_config/config_spider/pipelines.py | 27 +++++
 .../baidu_config/config_spider/settings.py | 111 ++++++++++++++++++
 .../config_spider/spiders/__init__.py | 4 +
 .../config_spider/spiders/spider.py | 35 ++++++
 backend/app/spiders/baidu_config/md5.txt | 1 +
 backend/app/spiders/baidu_config/scrapy.cfg | 11 ++
 backend/app/spiders/bing_general/Spiderfile | 6 +
 .../app/spiders/bing_general/bing_spider.py | 41 +++++++
 backend/app/spiders/bing_general/md5.txt | 1 +
 backend/app/spiders/chinaz/Spiderfile | 5 +
 backend/app/spiders/chinaz/chinaz/__init__.py | 0
 backend/app/spiders/chinaz/chinaz/items.py | 21 ++++
 .../app/spiders/chinaz/chinaz/middlewares.py | 103 ++++++++++++++++
 .../app/spiders/chinaz/chinaz/pipelines.py | 7 ++
 backend/app/spiders/chinaz/chinaz/settings.py | 90 ++++++++++++++
 .../spiders/chinaz/chinaz/spiders/__init__.py | 4 +
 .../chinaz/chinaz/spiders/chinaz_spider.py | 63 ++++++++++
 backend/app/spiders/chinaz/md5.txt | 1 +
 backend/app/spiders/chinaz/scrapy.cfg | 11 ++
 backend/app/spiders/csdn_config/Spiderfile | 60 ++++++++++
 .../csdn_config/config_spider/__init__.py | 0
 .../csdn_config/config_spider/items.py | 20 ++++
 .../csdn_config/config_spider/middlewares.py | 103 ++++++++++++++++
 .../csdn_config/config_spider/pipelines.py | 27 +++++
 .../csdn_config/config_spider/settings.py | 111 ++++++++++++++++++
 .../config_spider/spiders/__init__.py | 4 +
 .../config_spider/spiders/spider.py | 41 +++++++
 backend/app/spiders/csdn_config/md5.txt | 1 +
 backend/app/spiders/csdn_config/scrapy.cfg | 11 ++
 backend/app/spiders/douban_config/Spiderfile | 57 +++++++++
 .../douban_config/config_spider/__init__.py | 0
 .../douban_config/config_spider/items.py | 21 ++++
 .../config_spider/middlewares.py | 103 ++++++++++++++++
 .../douban_config/config_spider/pipelines.py | 27 +++++
 .../douban_config/config_spider/settings.py | 111 ++++++++++++++++++
 .../config_spider/spiders/__init__.py | 4 +
 .../config_spider/spiders/spider.py | 36 ++++++
 backend/app/spiders/douban_config/md5.txt | 1 +
 backend/app/spiders/douban_config/scrapy.cfg | 11 ++
 backend/app/spiders/jd/Spiderfile | 5 +
 backend/app/spiders/jd/jd/__init__.py | 0
 backend/app/spiders/jd/jd/items.py | 15 +++
 backend/app/spiders/jd/jd/middlewares.py | 103 ++++++++++++++++
 backend/app/spiders/jd/jd/pipelines.py | 6 +
 backend/app/spiders/jd/jd/settings.py | 90 ++++++++++++++
 backend/app/spiders/jd/jd/spiders/__init__.py | 4 +
 .../app/spiders/jd/jd/spiders/jd_spider.py | 21 ++++
 backend/app/spiders/jd/md5.txt | 1 +
 backend/app/spiders/jd/scrapy.cfg | 11 ++
 backend/app/spiders/sinastock/Spiderfile | 5 +
 backend/app/spiders/sinastock/md5.txt | 1 +
 backend/app/spiders/sinastock/scrapy.cfg | 11 ++
 .../spiders/sinastock/sinastock/__init__.py | 0
 .../app/spiders/sinastock/sinastock/items.py | 21 ++++
 .../sinastock/sinastock/middlewares.py | 103 ++++++++++++++++
 .../spiders/sinastock/sinastock/pipelines.py | 6 +
 .../spiders/sinastock/sinastock/settings.py | 89 ++++++++++++++
 .../sinastock/sinastock/spiders/__init__.py | 4 +
 .../sinastock/spiders/sinastock_spider.py | 59 ++++++++++
 backend/app/spiders/v2ex_config/Spiderfile | 54 +++++++
 .../v2ex_config/config_spider/__init__.py | 0
 .../v2ex_config/config_spider/items.py | 19 +++
 .../v2ex_config/config_spider/middlewares.py | 103 ++++++++++++++++
 .../v2ex_config/config_spider/pipelines.py | 27 +++++
 .../v2ex_config/config_spider/settings.py | 111 ++++++++++++++++++
 .../config_spider/spiders/__init__.py | 4 +
 .../config_spider/spiders/spider.py | 38 ++++++
 backend/app/spiders/v2ex_config/md5.txt | 1 +
 backend/app/spiders/v2ex_config/scrapy.cfg | 11 ++
 backend/app/spiders/xueqiu/Spiderfile | 5 +
 backend/app/spiders/xueqiu/md5.txt | 1 +
 backend/app/spiders/xueqiu/scrapy.cfg | 11 ++
 backend/app/spiders/xueqiu/xueqiu/__init__.py | 0
 backend/app/spiders/xueqiu/xueqiu/items.py | 23 ++++
 .../app/spiders/xueqiu/xueqiu/middlewares.py | 103 ++++++++++++++++
 .../app/spiders/xueqiu/xueqiu/pipelines.py | 6 +
 backend/app/spiders/xueqiu/xueqiu/settings.py | 89 ++++++++++++++
 .../spiders/xueqiu/xueqiu/spiders/__init__.py | 4 +
 .../xueqiu/xueqiu/spiders/xueqiu_spider.py | 46 ++++++
 backend/app/spiders/xueqiu_config/Spiderfile | 39 ++++++
 .../xueqiu_config/config_spider/__init__.py | 0
 .../xueqiu_config/config_spider/items.py | 18 +++
 .../config_spider/middlewares.py | 103 ++++++++++++++++
 .../xueqiu_config/config_spider/pipelines.py | 27 +++++
 .../xueqiu_config/config_spider/settings.py | 111 ++++++++++++++++++
 .../config_spider/spiders/__init__.py | 4 +
 .../config_spider/spiders/spider.py | 33 ++++++
 backend/app/spiders/xueqiu_config/md5.txt | 1 +
 backend/app/spiders/xueqiu_config/scrapy.cfg | 11 ++
 .../app/spiders/zongheng_config/Spiderfile | 45 +++++++
 .../zongheng_config/config_spider/__init__.py | 0
 .../zongheng_config/config_spider/items.py | 19 +++
 .../config_spider/middlewares.py | 103 ++++++++++++++++
 .../config_spider/pipelines.py | 27 +++++
 .../zongheng_config/config_spider/settings.py | 111 ++++++++++++++++++
 .../config_spider/spiders/__init__.py | 4 +
 .../config_spider/spiders/spider.py | 34 ++++++
 backend/app/spiders/zongheng_config/md5.txt | 1 +
 .../app/spiders/zongheng_config/scrapy.cfg | 11 ++
 backend/conf/config.yml | 2 +
 backend/config/config.go | 2 +
 backend/database/es_base.go | 44 +++++++
 backend/go.mod | 15 ++-
 backend/go.sum | 47 ++++++--
 backend/model/task.go | 1 +
 backend/services/task.go | 28 ++++-
 131 files changed, 4227 insertions(+), 15 deletions(-)
 create mode 100755 backend/app/spiders/amazon_config/Spiderfile
 create mode 100755 backend/app/spiders/amazon_config/config_spider/__init__.py
 create mode 100755
backend/app/spiders/amazon_config/config_spider/items.py create mode 100755 backend/app/spiders/amazon_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/amazon_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/amazon_config/config_spider/settings.py create mode 100755 backend/app/spiders/amazon_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/amazon_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/amazon_config/md5.txt create mode 100755 backend/app/spiders/amazon_config/scrapy.cfg create mode 100755 backend/app/spiders/autohome_config/Spiderfile create mode 100755 backend/app/spiders/autohome_config/config_spider/__init__.py create mode 100755 backend/app/spiders/autohome_config/config_spider/items.py create mode 100755 backend/app/spiders/autohome_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/autohome_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/autohome_config/config_spider/settings.py create mode 100755 backend/app/spiders/autohome_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/autohome_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/autohome_config/md5.txt create mode 100755 backend/app/spiders/autohome_config/scrapy.cfg create mode 100755 backend/app/spiders/baidu_config/Spiderfile create mode 100755 backend/app/spiders/baidu_config/config_spider/__init__.py create mode 100755 backend/app/spiders/baidu_config/config_spider/items.py create mode 100755 backend/app/spiders/baidu_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/baidu_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/baidu_config/config_spider/settings.py create mode 100755 backend/app/spiders/baidu_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/baidu_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/baidu_config/md5.txt create mode 100755 backend/app/spiders/baidu_config/scrapy.cfg create mode 100755 backend/app/spiders/bing_general/Spiderfile create mode 100755 backend/app/spiders/bing_general/bing_spider.py create mode 100755 backend/app/spiders/bing_general/md5.txt create mode 100755 backend/app/spiders/chinaz/Spiderfile create mode 100755 backend/app/spiders/chinaz/chinaz/__init__.py create mode 100755 backend/app/spiders/chinaz/chinaz/items.py create mode 100755 backend/app/spiders/chinaz/chinaz/middlewares.py create mode 100755 backend/app/spiders/chinaz/chinaz/pipelines.py create mode 100755 backend/app/spiders/chinaz/chinaz/settings.py create mode 100755 backend/app/spiders/chinaz/chinaz/spiders/__init__.py create mode 100755 backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py create mode 100755 backend/app/spiders/chinaz/md5.txt create mode 100755 backend/app/spiders/chinaz/scrapy.cfg create mode 100755 backend/app/spiders/csdn_config/Spiderfile create mode 100755 backend/app/spiders/csdn_config/config_spider/__init__.py create mode 100755 backend/app/spiders/csdn_config/config_spider/items.py create mode 100755 backend/app/spiders/csdn_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/csdn_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/csdn_config/config_spider/settings.py create mode 100755 backend/app/spiders/csdn_config/config_spider/spiders/__init__.py create mode 100755 
backend/app/spiders/csdn_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/csdn_config/md5.txt create mode 100755 backend/app/spiders/csdn_config/scrapy.cfg create mode 100755 backend/app/spiders/douban_config/Spiderfile create mode 100755 backend/app/spiders/douban_config/config_spider/__init__.py create mode 100755 backend/app/spiders/douban_config/config_spider/items.py create mode 100755 backend/app/spiders/douban_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/douban_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/douban_config/config_spider/settings.py create mode 100755 backend/app/spiders/douban_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/douban_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/douban_config/md5.txt create mode 100755 backend/app/spiders/douban_config/scrapy.cfg create mode 100755 backend/app/spiders/jd/Spiderfile create mode 100755 backend/app/spiders/jd/jd/__init__.py create mode 100755 backend/app/spiders/jd/jd/items.py create mode 100755 backend/app/spiders/jd/jd/middlewares.py create mode 100755 backend/app/spiders/jd/jd/pipelines.py create mode 100755 backend/app/spiders/jd/jd/settings.py create mode 100755 backend/app/spiders/jd/jd/spiders/__init__.py create mode 100755 backend/app/spiders/jd/jd/spiders/jd_spider.py create mode 100755 backend/app/spiders/jd/md5.txt create mode 100755 backend/app/spiders/jd/scrapy.cfg create mode 100755 backend/app/spiders/sinastock/Spiderfile create mode 100755 backend/app/spiders/sinastock/md5.txt create mode 100755 backend/app/spiders/sinastock/scrapy.cfg create mode 100755 backend/app/spiders/sinastock/sinastock/__init__.py create mode 100755 backend/app/spiders/sinastock/sinastock/items.py create mode 100755 backend/app/spiders/sinastock/sinastock/middlewares.py create mode 100755 backend/app/spiders/sinastock/sinastock/pipelines.py create mode 100755 backend/app/spiders/sinastock/sinastock/settings.py create mode 100755 backend/app/spiders/sinastock/sinastock/spiders/__init__.py create mode 100755 backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py create mode 100755 backend/app/spiders/v2ex_config/Spiderfile create mode 100755 backend/app/spiders/v2ex_config/config_spider/__init__.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/items.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/settings.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/v2ex_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/v2ex_config/md5.txt create mode 100755 backend/app/spiders/v2ex_config/scrapy.cfg create mode 100755 backend/app/spiders/xueqiu/Spiderfile create mode 100755 backend/app/spiders/xueqiu/md5.txt create mode 100755 backend/app/spiders/xueqiu/scrapy.cfg create mode 100755 backend/app/spiders/xueqiu/xueqiu/__init__.py create mode 100755 backend/app/spiders/xueqiu/xueqiu/items.py create mode 100755 backend/app/spiders/xueqiu/xueqiu/middlewares.py create mode 100755 backend/app/spiders/xueqiu/xueqiu/pipelines.py create mode 100755 backend/app/spiders/xueqiu/xueqiu/settings.py create mode 100755 backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py create mode 100755 
backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py create mode 100755 backend/app/spiders/xueqiu_config/Spiderfile create mode 100755 backend/app/spiders/xueqiu_config/config_spider/__init__.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/items.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/settings.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/xueqiu_config/md5.txt create mode 100755 backend/app/spiders/xueqiu_config/scrapy.cfg create mode 100755 backend/app/spiders/zongheng_config/Spiderfile create mode 100755 backend/app/spiders/zongheng_config/config_spider/__init__.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/items.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/middlewares.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/pipelines.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/settings.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py create mode 100755 backend/app/spiders/zongheng_config/config_spider/spiders/spider.py create mode 100755 backend/app/spiders/zongheng_config/md5.txt create mode 100755 backend/app/spiders/zongheng_config/scrapy.cfg create mode 100644 backend/database/es_base.go diff --git a/.gitignore b/.gitignore index 0b6328c9..6eb0d9f1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .idea/ +.vscode/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/backend/app/spiders/amazon_config/Spiderfile b/backend/app/spiders/amazon_config/Spiderfile new file mode 100755 index 00000000..eea8a538 --- /dev/null +++ b/backend/app/spiders/amazon_config/Spiderfile @@ -0,0 +1,51 @@ +name: "amazon_config" +display_name: "亚马逊中国(可配置)" +remark: "亚马逊中国搜索手机,列表+分页" +type: "configurable" +col: "results_amazon_config" +engine: scrapy +start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2 +start_stage: list +stages: +- name: list + is_list: true + list_css: .s-result-item + list_xpath: "" + page_css: .a-last > a + page_xpath: "" + page_attr: href + fields: + - name: title + css: span.a-text-normal + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: .a-link-normal + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: price + css: "" + xpath: .//*[@class="a-price-whole"] + attr: "" + next_stage: "" + remark: "" + - name: price_fraction + css: "" + xpath: .//*[@class="a-price-fraction"] + attr: "" + next_stage: "" + remark: "" + - name: img + css: .s-image-square-aspect > img + xpath: "" + attr: src + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/amazon_config/config_spider/__init__.py b/backend/app/spiders/amazon_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/amazon_config/config_spider/items.py b/backend/app/spiders/amazon_config/config_spider/items.py new file mode 100755 index 00000000..79bf0adb 
--- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/items.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + price = scrapy.Field() + price_fraction = scrapy.Field() + img = scrapy.Field() + diff --git a/backend/app/spiders/amazon_config/config_spider/middlewares.py b/backend/app/spiders/amazon_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+ + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/amazon_config/config_spider/pipelines.py b/backend/app/spiders/amazon_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/amazon_config/config_spider/settings.py b/backend/app/spiders/amazon_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py b/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/spider.py b/backend/app/spiders/amazon_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..a7421df3 --- /dev/null +++ b/backend/app/spiders/amazon_config/config_spider/spiders/spider.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.s-result-item'): + item = Item() + item['title'] = elem.css('span.a-text-normal::text').extract_first() + item['url'] = elem.css('.a-link-normal::attr("href")').extract_first() + item['price'] = elem.xpath('string(.//*[@class="a-price-whole"])').extract_first() + item['price_fraction'] = elem.xpath('string(.//*[@class="a-price-fraction"])').extract_first() + item['img'] = elem.css('.s-image-square-aspect > img::attr("src")').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + next_url = response.css('.a-last > a::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + diff --git a/backend/app/spiders/amazon_config/md5.txt b/backend/app/spiders/amazon_config/md5.txt new file mode 100755 index 00000000..52c5423f --- /dev/null +++ b/backend/app/spiders/amazon_config/md5.txt @@ -0,0 +1 @@ +4b716dd3c15b993ccb7a9f0be1cc0de9 diff --git a/backend/app/spiders/amazon_config/scrapy.cfg b/backend/app/spiders/amazon_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/amazon_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/autohome_config/Spiderfile b/backend/app/spiders/autohome_config/Spiderfile new file mode 100755 index 00000000..e69880cb --- /dev/null +++ b/backend/app/spiders/autohome_config/Spiderfile @@ -0,0 +1,57 @@ +name: "autohome_config" +display_name: "汽车之家(可配置)" +remark: "汽车之家文章,列表+详情+分页" +type: "configurable" +col: "results_autohome_config" +engine: scrapy 
+start_url: https://www.autohome.com.cn/all/ +start_stage: list +stages: +- name: list + is_list: true + list_css: ul.article > li + list_xpath: "" + page_css: a.page-item-next + page_xpath: "" + page_attr: href + fields: + - name: title + css: li > a > h3 + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: li > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: li > a > p + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: time + css: li > a .fn-left + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: views + css: li > a .fn-right > em:first-child + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: comments + css: li > a .fn-right > em:last-child + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/autohome_config/config_spider/__init__.py b/backend/app/spiders/autohome_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/autohome_config/config_spider/items.py b/backend/app/spiders/autohome_config/config_spider/items.py new file mode 100755 index 00000000..206203d5 --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + time = scrapy.Field() + views = scrapy.Field() + comments = scrapy.Field() + diff --git a/backend/app/spiders/autohome_config/config_spider/middlewares.py b/backend/app/spiders/autohome_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. 
+ pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/autohome_config/config_spider/pipelines.py b/backend/app/spiders/autohome_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/autohome_config/config_spider/settings.py b/backend/app/spiders/autohome_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only 
settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', 
setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py b/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/spider.py b/backend/app/spiders/autohome_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..83753f5a --- /dev/null +++ b/backend/app/spiders/autohome_config/config_spider/spiders/spider.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://www.autohome.com.cn/all/', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('ul.article > li'): + item = Item() + item['title'] = elem.css('li > a > h3::text').extract_first() + item['url'] = elem.css('li > a::attr("href")').extract_first() + item['abstract'] = elem.css('li > a > p::text').extract_first() + item['time'] = elem.css('li > a .fn-left::text').extract_first() + item['views'] = elem.css('li > a .fn-right > em:first-child::text').extract_first() + item['comments'] = elem.css('li > a .fn-right > em:last-child::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + next_url = response.css('a.page-item-next::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + diff --git a/backend/app/spiders/autohome_config/md5.txt b/backend/app/spiders/autohome_config/md5.txt new file mode 100755 index 00000000..c4707adf --- /dev/null +++ b/backend/app/spiders/autohome_config/md5.txt @@ -0,0 +1 @@ +d784a11085e298eaf344eadc3a3e9411 diff --git a/backend/app/spiders/autohome_config/scrapy.cfg b/backend/app/spiders/autohome_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/autohome_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/baidu_config/Spiderfile b/backend/app/spiders/baidu_config/Spiderfile new file mode 100755 index 00000000..a29d4acb --- /dev/null +++ b/backend/app/spiders/baidu_config/Spiderfile @@ -0,0 +1,39 @@ +name: "baidu_config" +display_name: "百度搜索(可配置)" +remark: "百度搜索Crawlab,列表+分页" +type: "configurable" +col: "results_baidu_config" 
+engine: scrapy +start_url: http://www.baidu.com/s?wd=crawlab +start_stage: list +stages: +- name: list + is_list: true + list_css: ".result.c-container" + list_xpath: "" + page_css: "a.n" + page_xpath: "" + page_attr: href + fields: + - name: title + css: "" + xpath: .//h3/a + attr: "" + next_stage: "" + remark: "" + - name: url + css: "" + xpath: .//h3/a + attr: href + next_stage: "" + remark: "" + - name: abstract + css: "" + xpath: .//*[@class="c-abstract"] + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/baidu_config/config_spider/__init__.py b/backend/app/spiders/baidu_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/baidu_config/config_spider/items.py b/backend/app/spiders/baidu_config/config_spider/items.py new file mode 100755 index 00000000..9282765f --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/items.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + diff --git a/backend/app/spiders/baidu_config/config_spider/middlewares.py b/backend/app/spiders/baidu_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). 
+ for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/baidu_config/config_spider/pipelines.py b/backend/app/spiders/baidu_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/baidu_config/config_spider/settings.py b/backend/app/spiders/baidu_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py b/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/spider.py b/backend/app/spiders/baidu_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..e5fd793f --- /dev/null +++ b/backend/app/spiders/baidu_config/config_spider/spiders/spider.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='http://www.baidu.com/s?wd=crawlab', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.result.c-container'): + item = Item() + item['title'] = elem.xpath('string(.//h3/a)').extract_first() + item['url'] = elem.xpath('.//h3/a/@href').extract_first() + item['abstract'] = elem.xpath('string(.//*[@class="c-abstract"])').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + next_url = response.css('a.n::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + diff --git a/backend/app/spiders/baidu_config/md5.txt b/backend/app/spiders/baidu_config/md5.txt new file mode 100755 index 00000000..32137b76 --- /dev/null +++ b/backend/app/spiders/baidu_config/md5.txt @@ -0,0 +1 @@ +ba25f6f3567b256473d3f0ec6af783fd diff --git a/backend/app/spiders/baidu_config/scrapy.cfg b/backend/app/spiders/baidu_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/baidu_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/bing_general/Spiderfile b/backend/app/spiders/bing_general/Spiderfile new file mode 100755 index 00000000..614c135e --- /dev/null +++ b/backend/app/spiders/bing_general/Spiderfile @@ -0,0 +1,6 @@ +name: "bing_general" +display_name: "必应搜索 (通用)" +remark: "必应搜索 Crawlab,列表+分页" +col: "results_bing_general" +type: "customized" +cmd: "python bing_spider.py" \ No newline at end of file diff --git a/backend/app/spiders/bing_general/bing_spider.py b/backend/app/spiders/bing_general/bing_spider.py new file mode 100755 index 00000000..e982e4ee --- /dev/null +++ b/backend/app/spiders/bing_general/bing_spider.py @@ -0,0 +1,41 @@ +import requests +from bs4 import 
BeautifulSoup as bs +from urllib.parse import urljoin, urlparse +import re +from crawlab import save_item + +s = requests.Session() + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +def start_requests(): + for i in range(0, 9): + fr = 'PERE' if not i else 'MORE' + url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}' + request_page(url) + +def request_page(url): + print(f'requesting {url}') + r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}) + parse_list(r) + +def parse_list(response): + soup = bs(response.content.decode('utf-8')) + for el in list(soup.select('#b_results > li')): + try: + save_item({ + 'title': el.select_one('h2').text, + 'url': el.select_one('h2 a').attrs.get('href'), + 'abstract': el.select_one('.b_caption p').text, + }) + except: + pass + +if __name__ == '__main__': + start_requests() \ No newline at end of file diff --git a/backend/app/spiders/bing_general/md5.txt b/backend/app/spiders/bing_general/md5.txt new file mode 100755 index 00000000..42fb6afd --- /dev/null +++ b/backend/app/spiders/bing_general/md5.txt @@ -0,0 +1 @@ +cf295b694a20c99c4857f838aa0402a7 diff --git a/backend/app/spiders/chinaz/Spiderfile b/backend/app/spiders/chinaz/Spiderfile new file mode 100755 index 00000000..2fb940bb --- /dev/null +++ b/backend/app/spiders/chinaz/Spiderfile @@ -0,0 +1,5 @@ +name: "chinaz" +display_name: "站长之家 (Scrapy)" +col: "results_chinaz" +type: "customized" +cmd: "scrapy crawl chinaz_spider" \ No newline at end of file diff --git a/backend/app/spiders/chinaz/chinaz/__init__.py b/backend/app/spiders/chinaz/chinaz/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/chinaz/chinaz/items.py b/backend/app/spiders/chinaz/chinaz/items.py new file mode 100755 index 00000000..1fdcac1b --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class ChinazItem(scrapy.Item): + # define the fields for your item here like: + _id = scrapy.Field() + task_id = scrapy.Field() + name = scrapy.Field() + domain = scrapy.Field() + description = scrapy.Field() + rank = scrapy.Field() + main_category = scrapy.Field() + category = scrapy.Field() + location = scrapy.Field() diff --git a/backend/app/spiders/chinaz/chinaz/middlewares.py b/backend/app/spiders/chinaz/chinaz/middlewares.py new file mode 100755 index 00000000..c98995d5 --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ChinazSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+ s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ChinazDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/chinaz/chinaz/pipelines.py b/backend/app/spiders/chinaz/chinaz/pipelines.py new file mode 100755 index 00000000..b29f9eb7 --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/pipelines.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + diff --git a/backend/app/spiders/chinaz/chinaz/settings.py b/backend/app/spiders/chinaz/chinaz/settings.py new file mode 100755 index 00000000..932ec9ed --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/settings.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for chinaz project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'chinaz' + +SPIDER_MODULES = ['chinaz.spiders'] +NEWSPIDER_MODULE = 'chinaz.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'chinaz (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'chinaz.middlewares.ChinazSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'chinaz.middlewares.ChinazDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests 
Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/chinaz/chinaz/spiders/__init__.py b/backend/app/spiders/chinaz/chinaz/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py b/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py new file mode 100755 index 00000000..28ad84e7 --- /dev/null +++ b/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +import scrapy +from chinaz.items import ChinazItem + + +class ChinazSpiderSpider(scrapy.Spider): + name = 'chinaz_spider' + allowed_domains = ['chinaz.com'] + start_urls = ['http://top.chinaz.com/hangye/'] + + def parse(self, response): + for item in response.css('.listCentent > li'): + name = item.css('h3.rightTxtHead > a::text').extract_first() + href = item.css('h3.rightTxtHead > a::attr("href")').extract_first() + domain = item.css('h3.rightTxtHead > span::text').extract_first() + description = item.css('p.RtCInfo::text').extract_first() + rank = item.css('.RtCRateCent > strong::text').extract_first() + rank = int(rank) + item = ChinazItem( + _id=domain, + name=name, + domain=domain, + description=description, + rank=rank, + ) + yield scrapy.Request( + url='http://top.chinaz.com' + href, + callback=self.parse_item, + meta={ + 'item': item + } + ) + + # pagination + a_list = response.css('.ListPageWrap > a::attr("href")').extract() + url = 'http://top.chinaz.com/hangye/' + a_list[-1] + yield scrapy.Request(url=url, callback=self.parse) + + def parse_item(self, response): + item = response.meta['item'] + + # category info extraction + arr = response.css('.TopMainTag-show .SimSun') + res1 = arr[0].css('a::text').extract() + main_category = res1[0] + if len(res1) == 1: + category = '其他' + else: + category = res1[1] + + # location info extraction + res2 = arr[1].css('a::text').extract() + if len(res2) > 0: + location = res2[0] + else: + location = '其他' + + # assign values to item + item['main_category'] = main_category + item['category'] = category + item['location'] = location + + yield item diff --git a/backend/app/spiders/chinaz/md5.txt b/backend/app/spiders/chinaz/md5.txt new file mode 100755 index 00000000..f5e15fb9 --- /dev/null +++ b/backend/app/spiders/chinaz/md5.txt @@ -0,0 +1 @@ +1976593e49bf0238602ce35d051bd137 diff --git a/backend/app/spiders/chinaz/scrapy.cfg b/backend/app/spiders/chinaz/scrapy.cfg new file mode 100755 index 00000000..d3b44a1a --- /dev/null +++ b/backend/app/spiders/chinaz/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = chinaz.settings 
+ +[deploy] +#url = http://localhost:6800/ +project = chinaz diff --git a/backend/app/spiders/csdn_config/Spiderfile b/backend/app/spiders/csdn_config/Spiderfile new file mode 100755 index 00000000..67f4f8c5 --- /dev/null +++ b/backend/app/spiders/csdn_config/Spiderfile @@ -0,0 +1,60 @@ +name: "csdn_config" +display_name: "CSDN(可配置)" +remark: "CSDN Crawlab 文章,列表+详情+分页" +type: "configurable" +col: "results_csdn_config" +engine: scrapy +start_url: https://so.csdn.net/so/search/s.do?q=crawlab +start_stage: list +stages: +- name: list + is_list: true + list_css: .search-list-con > .search-list + list_xpath: "" + page_css: a.btn-next + page_xpath: "" + page_attr: href + fields: + - name: url + css: "" + xpath: .//*[@class="limit_width"]/a + attr: href + next_stage: detail + remark: "" +- name: detail + is_list: false + list_css: "" + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: content + css: "" + xpath: .//div[@id="content_views"] + attr: "" + next_stage: "" + remark: "" + - name: views + css: .read-count + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: title + css: .title-article + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: author + css: .follow-nickName + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + AUTOTHROTTLE_ENABLED: "false" + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/csdn_config/config_spider/__init__.py b/backend/app/spiders/csdn_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/csdn_config/config_spider/items.py b/backend/app/spiders/csdn_config/config_spider/items.py new file mode 100755 index 00000000..3c8e5e54 --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/items.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + url = scrapy.Field() + content = scrapy.Field() + views = scrapy.Field() + title = scrapy.Field() + author = scrapy.Field() + diff --git a/backend/app/spiders/csdn_config/config_spider/middlewares.py b/backend/app/spiders/csdn_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. 
+ return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/csdn_config/config_spider/pipelines.py b/backend/app/spiders/csdn_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/csdn_config/config_spider/settings.py b/backend/app/spiders/csdn_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py b/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/spider.py b/backend/app/spiders/csdn_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..9ecc4aae --- /dev/null +++ b/backend/app/spiders/csdn_config/config_spider/spiders/spider.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://so.csdn.net/so/search/s.do?q=crawlab', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.search-list-con > .search-list'): + item = Item() + item['url'] = elem.xpath('.//*[@class="limit_width"]/a/@href').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) + next_url = response.css('a.btn-next::attr("href")').extract_first() + yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) + + def parse_detail(self, response): + item = Item() if response.meta.get('item') is None else response.meta.get('item') + item['content'] = response.xpath('string(.//div[@id="content_views"])').extract_first() + item['views'] = response.css('.read-count::text').extract_first() + item['title'] = response.css('.title-article::text').extract_first() + item['author'] = response.css('.follow-nickName::text').extract_first() + yield item + + diff --git a/backend/app/spiders/csdn_config/md5.txt b/backend/app/spiders/csdn_config/md5.txt new file mode 100755 index 00000000..e169c42a --- /dev/null +++ b/backend/app/spiders/csdn_config/md5.txt @@ -0,0 +1 @@ +b6889c74e006a5e619b525d84db62ffd diff --git a/backend/app/spiders/csdn_config/scrapy.cfg b/backend/app/spiders/csdn_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/csdn_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/douban_config/Spiderfile b/backend/app/spiders/douban_config/Spiderfile new file mode 100755 index 00000000..84f0647a --- /dev/null +++ b/backend/app/spiders/douban_config/Spiderfile @@ -0,0 +1,57 @@ +name: "douban_config" 
+display_name: "豆瓣读书(可配置)" +remark: "豆瓣读书新书推荐,列表" +type: "configurable" +col: "results_douban_config" +engine: scrapy +start_url: https://book.douban.com/latest +start_stage: list +stages: +- name: list + is_list: true + list_css: ul.cover-col-4 > li + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: title + css: h2 > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: h2 > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: img + css: a.cover img + xpath: "" + attr: src + next_stage: "" + remark: "" + - name: rating + css: p.rating > .color-lightgray + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: abstract + css: p:last-child + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: info + css: .color-gray + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/douban_config/config_spider/__init__.py b/backend/app/spiders/douban_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/douban_config/config_spider/items.py b/backend/app/spiders/douban_config/config_spider/items.py new file mode 100755 index 00000000..d6959b8d --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + img = scrapy.Field() + rating = scrapy.Field() + abstract = scrapy.Field() + info = scrapy.Field() + diff --git a/backend/app/spiders/douban_config/config_spider/middlewares.py b/backend/app/spiders/douban_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. 
+ pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/douban_config/config_spider/pipelines.py b/backend/app/spiders/douban_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/douban_config/config_spider/settings.py b/backend/app/spiders/douban_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings 
considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', 
setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/douban_config/config_spider/spiders/__init__.py b/backend/app/spiders/douban_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/douban_config/config_spider/spiders/spider.py b/backend/app/spiders/douban_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..61bb648d --- /dev/null +++ b/backend/app/spiders/douban_config/config_spider/spiders/spider.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://book.douban.com/latest', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('ul.cover-col-4 > li'): + item = Item() + item['title'] = elem.css('h2 > a::text').extract_first() + item['url'] = elem.css('h2 > a::attr("href")').extract_first() + item['img'] = elem.css('a.cover img::attr("src")').extract_first() + item['rating'] = elem.css('p.rating > .color-lightgray::text').extract_first() + item['abstract'] = elem.css('p:last-child::text').extract_first() + item['info'] = elem.css('.color-gray::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + + diff --git a/backend/app/spiders/douban_config/md5.txt b/backend/app/spiders/douban_config/md5.txt new file mode 100755 index 00000000..374e3804 --- /dev/null +++ b/backend/app/spiders/douban_config/md5.txt @@ -0,0 +1 @@ +4d59a6c83b0e125d5321beae86bb93ce diff --git a/backend/app/spiders/douban_config/scrapy.cfg b/backend/app/spiders/douban_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/douban_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/jd/Spiderfile b/backend/app/spiders/jd/Spiderfile new file mode 100755 index 00000000..d090472b --- /dev/null +++ b/backend/app/spiders/jd/Spiderfile @@ -0,0 +1,5 @@ +name: "jd" +display_name: "京东 (Scrapy)" +col: "results_jd" +type: "customized" +cmd: "scrapy crawl jd_spider" \ No newline at end of file diff --git a/backend/app/spiders/jd/jd/__init__.py b/backend/app/spiders/jd/jd/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/jd/jd/items.py b/backend/app/spiders/jd/jd/items.py new file mode 100755 index 00000000..b2c5e647 
--- /dev/null +++ b/backend/app/spiders/jd/jd/items.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class JdItem(scrapy.Item): + # define the fields for your item here like: + name = scrapy.Field() + price = scrapy.Field() + url = scrapy.Field() diff --git a/backend/app/spiders/jd/jd/middlewares.py b/backend/app/spiders/jd/jd/middlewares.py new file mode 100755 index 00000000..6fceded5 --- /dev/null +++ b/backend/app/spiders/jd/jd/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class JdSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class JdDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+ + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/jd/jd/pipelines.py b/backend/app/spiders/jd/jd/pipelines.py new file mode 100755 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/app/spiders/jd/jd/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/jd/jd/settings.py b/backend/app/spiders/jd/jd/settings.py new file mode 100755 index 00000000..ef89ed0c --- /dev/null +++ b/backend/app/spiders/jd/jd/settings.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for jd project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'jd' + +SPIDER_MODULES = ['jd.spiders'] +NEWSPIDER_MODULE = 'jd.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'jd (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'jd.middlewares.JdSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'jd.middlewares.JdDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html 
+#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/jd/jd/spiders/__init__.py b/backend/app/spiders/jd/jd/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/jd/jd/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/jd/jd/spiders/jd_spider.py b/backend/app/spiders/jd/jd/spiders/jd_spider.py new file mode 100755 index 00000000..4ec94fa9 --- /dev/null +++ b/backend/app/spiders/jd/jd/spiders/jd_spider.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +import scrapy + +from jd.items import JdItem + + +class JdSpiderSpider(scrapy.Spider): + name = 'jd_spider' + allowed_domains = ['jd.com'] + + def start_requests(self): + for i in range(1, 50): + yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}') + + def parse(self, response): + for el in response.css('.gl-item'): + yield JdItem( + url=el.css('.p-name > a::attr("href")').extract_first(), + name=el.css('.p-name > a::attr("title")').extract_first(), + price=float(el.css('.p-price i::text').extract_first()), + ) diff --git a/backend/app/spiders/jd/md5.txt b/backend/app/spiders/jd/md5.txt new file mode 100755 index 00000000..dcd53f51 --- /dev/null +++ b/backend/app/spiders/jd/md5.txt @@ -0,0 +1 @@ +621486d31459514eb27a082d159d9b8c diff --git a/backend/app/spiders/jd/scrapy.cfg b/backend/app/spiders/jd/scrapy.cfg new file mode 100755 index 00000000..87cf0280 --- /dev/null +++ b/backend/app/spiders/jd/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = jd.settings + +[deploy] +#url = http://localhost:6800/ +project = jd diff --git a/backend/app/spiders/sinastock/Spiderfile b/backend/app/spiders/sinastock/Spiderfile new file mode 100755 index 00000000..b110cb48 --- /dev/null +++ b/backend/app/spiders/sinastock/Spiderfile @@ -0,0 +1,5 @@ +name: "sinastock" +display_name: "新浪股票 (Scrapy)" +type: "customized" +col: "results_sinastock" +cmd: "scrapy crawl sinastock_spider" \ No newline at end of file diff --git a/backend/app/spiders/sinastock/md5.txt b/backend/app/spiders/sinastock/md5.txt new file mode 100755 index 00000000..1e5d8ab9 --- /dev/null +++ b/backend/app/spiders/sinastock/md5.txt @@ -0,0 +1 @@ +80bc091fa45ef4a85c9f1a66c81a4ed7 diff --git a/backend/app/spiders/sinastock/scrapy.cfg b/backend/app/spiders/sinastock/scrapy.cfg new file mode 100755 index 00000000..4969ad96 --- /dev/null +++ b/backend/app/spiders/sinastock/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: 
scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = sinastock.settings + +[deploy] +#url = http://localhost:6800/ +project = sinastock diff --git a/backend/app/spiders/sinastock/sinastock/__init__.py b/backend/app/spiders/sinastock/sinastock/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/sinastock/sinastock/items.py b/backend/app/spiders/sinastock/sinastock/items.py new file mode 100755 index 00000000..6e3e5d8e --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/items.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class NewsItem(scrapy.Item): + # define the fields for your item here like: + _id = scrapy.Field() + title = scrapy.Field() + ts_str = scrapy.Field() + ts = scrapy.Field() + url = scrapy.Field() + text = scrapy.Field() + task_id = scrapy.Field() + source = scrapy.Field() + stocks = scrapy.Field() diff --git a/backend/app/spiders/sinastock/sinastock/middlewares.py b/backend/app/spiders/sinastock/sinastock/middlewares.py new file mode 100755 index 00000000..912b5e57 --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class SinastockSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class SinastockDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+ s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/sinastock/sinastock/pipelines.py b/backend/app/spiders/sinastock/sinastock/pipelines.py new file mode 100755 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/sinastock/sinastock/settings.py b/backend/app/spiders/sinastock/sinastock/settings.py new file mode 100755 index 00000000..3e01d3ca --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/settings.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for sinastock project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'sinastock' + +SPIDER_MODULES = ['sinastock.spiders'] +NEWSPIDER_MODULE = 'sinastock.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +# USER_AGENT = 'sinastock (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'sinastock.middlewares.SinastockSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'sinastock.middlewares.SinastockDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/sinastock/sinastock/spiders/__init__.py b/backend/app/spiders/sinastock/sinastock/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
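Editor's note: the sinastock project leaves its local pipelines.py empty and instead registers `crawlab.pipelines.CrawlabMongoPipeline` from the Crawlab SDK in ITEM_PIPELINES. The sketch below is not the SDK's implementation; it only illustrates the same contract that the ConfigSpiderPipeline classes elsewhere in this patch follow: read the connection details from the CRAWLAB_MONGO_* environment variables, stamp every item with CRAWLAB_TASK_ID, and write it to the CRAWLAB_COLLECTION collection. (Those pipelines call the legacy col.save(), which PyMongo 3.x deprecates and 4.x removes; this sketch uses insert_one instead.)

```python
# Sketch only -- NOT the SDK's actual CrawlabMongoPipeline. It mirrors the
# behaviour of the ConfigSpiderPipeline classes shipped in this patch.
import os

from pymongo import MongoClient


class MongoItemPipeline(object):
    def open_spider(self, spider):
        self.client = MongoClient(
            host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
            port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017),
            username=os.environ.get('CRAWLAB_MONGO_USERNAME'),
            password=os.environ.get('CRAWLAB_MONGO_PASSWORD'),
            authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin',
        )
        db = self.client[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
        self.col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
        self.task_id = os.environ.get('CRAWLAB_TASK_ID')

    def process_item(self, item, spider):
        # Tag the row with the Crawlab task id, as the patch's pipelines do.
        item['task_id'] = self.task_id
        # insert_one is the supported replacement for the deprecated save().
        self.col.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
```

With insert_one every run produces fresh documents; if re-running a task should overwrite earlier rows, replace_one with upsert=True on a natural key (for example the item URL) would be the alternative.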
diff --git a/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py b/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py new file mode 100755 index 00000000..54daf763 --- /dev/null +++ b/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +import os +import re +from datetime import datetime + +import scrapy +from pymongo import MongoClient + +from sinastock.items import NewsItem + +class SinastockSpiderSpider(scrapy.Spider): + name = 'sinastock_spider' + allowed_domains = ['finance.sina.com.cn'] + mongo = MongoClient( + host=os.environ.get('MONGO_HOST') or 'localhost', + port=int(os.environ.get('MONGO_PORT') or 27017) + ) + db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test'] + col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news') + + def start_requests(self): + col = self.db['stocks'] + for s in col.find({}): + code, ex = s['ts_code'].split('.') + for i in range(10): + url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={ex.lower()}{code}&Page={i + 1}' + yield scrapy.Request( + url=url, + callback=self.parse, + meta={'ts_code': s['ts_code']} + ) + + def parse(self, response): + for a in response.css('.datelist > ul > a'): + url = a.css('a::attr("href")').extract_first() + item = NewsItem( + title=a.css('a::text').extract_first(), + url=url, + source='sina', + stocks=[response.meta['ts_code']] + ) + yield scrapy.Request( + url=url, + callback=self.parse_detail, + meta={'item': item} + ) + + def parse_detail(self, response): + item = response.meta['item'] + text = response.css('#artibody').extract_first() + pre = re.compile('>(.*?)<') + text = ''.join(pre.findall(text)) + item['text'] = text.replace('\u3000', '') + item['ts_str'] = response.css('.date::text').extract_first() + if item['text'] is None or item['ts_str'] is None: + pass + else: + item['ts'] = datetime.strptime(item['ts_str'], '%Y年%m月%d日 %H:%M') + yield item diff --git a/backend/app/spiders/v2ex_config/Spiderfile b/backend/app/spiders/v2ex_config/Spiderfile new file mode 100755 index 00000000..bb18d40a --- /dev/null +++ b/backend/app/spiders/v2ex_config/Spiderfile @@ -0,0 +1,54 @@ +name: "v2ex_config" +display_name: "V2ex(可配置)" +remark: "V2ex,列表+详情" +type: "configurable" +col: "results_v2ex_config" +engine: scrapy +start_url: https://v2ex.com/ +start_stage: list +stages: +- name: list + is_list: true + list_css: .cell.item + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: href + fields: + - name: title + css: a.topic-link + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: a.topic-link + xpath: "" + attr: href + next_stage: detail + remark: "" + - name: replies + css: .count_livid + xpath: "" + attr: "" + next_stage: "" + remark: "" +- name: detail + is_list: false + list_css: "" + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: content + css: "" + xpath: .//*[@class="markdown_body"] + attr: "" + next_stage: "" + remark: "" +settings: + AUTOTHROTTLE_ENABLED: "true" + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/v2ex_config/config_spider/__init__.py b/backend/app/spiders/v2ex_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/v2ex_config/config_spider/items.py 
b/backend/app/spiders/v2ex_config/config_spider/items.py new file mode 100755 index 00000000..d2c01a06 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/items.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + replies = scrapy.Field() + content = scrapy.Field() + diff --git a/backend/app/spiders/v2ex_config/config_spider/middlewares.py b/backend/app/spiders/v2ex_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+ + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/v2ex_config/config_spider/pipelines.py b/backend/app/spiders/v2ex_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/v2ex_config/config_spider/settings.py b/backend/app/spiders/v2ex_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py b/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py b/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..4763e040 --- /dev/null +++ b/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://v2ex.com/', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.cell.item'): + item = Item() + item['title'] = elem.css('a.topic-link::text').extract_first() + item['url'] = elem.css('a.topic-link::attr("href")').extract_first() + item['replies'] = elem.css('.count_livid::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) + + def parse_detail(self, response): + item = Item() if response.meta.get('item') is None else response.meta.get('item') + item['content'] = response.xpath('string(.//*[@class="markdown_body"])').extract_first() + yield item + + diff --git a/backend/app/spiders/v2ex_config/md5.txt b/backend/app/spiders/v2ex_config/md5.txt new file mode 100755 index 00000000..5d725b2c --- /dev/null +++ b/backend/app/spiders/v2ex_config/md5.txt @@ -0,0 +1 @@ +402c0a07873ef74b9b574bc0f6b28423 diff --git a/backend/app/spiders/v2ex_config/scrapy.cfg b/backend/app/spiders/v2ex_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/v2ex_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/xueqiu/Spiderfile b/backend/app/spiders/xueqiu/Spiderfile new file mode 100755 index 00000000..38aa5dbe --- /dev/null +++ b/backend/app/spiders/xueqiu/Spiderfile @@ -0,0 +1,5 @@ +name: "xueqiu" +display_name: "雪球网 (Scrapy)" +type: "customized" +col: "results_xueqiu" +cmd: "scrapy crawl xueqiu_spider" \ No newline at end of file diff --git a/backend/app/spiders/xueqiu/md5.txt b/backend/app/spiders/xueqiu/md5.txt new file mode 100755 index 00000000..6a9a2072 --- /dev/null +++ b/backend/app/spiders/xueqiu/md5.txt @@ -0,0 +1 @@ 
+df177994199caa691d87fc0c5031326d diff --git a/backend/app/spiders/xueqiu/scrapy.cfg b/backend/app/spiders/xueqiu/scrapy.cfg new file mode 100755 index 00000000..2c5ce3b3 --- /dev/null +++ b/backend/app/spiders/xueqiu/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = xueqiu.settings + +[deploy] +#url = http://localhost:6800/ +project = xueqiu diff --git a/backend/app/spiders/xueqiu/xueqiu/__init__.py b/backend/app/spiders/xueqiu/xueqiu/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/xueqiu/xueqiu/items.py b/backend/app/spiders/xueqiu/xueqiu/items.py new file mode 100755 index 00000000..5471594d --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/items.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class XueqiuItem(scrapy.Item): + # define the fields for your item here like: + _id = scrapy.Field() + task_id = scrapy.Field() + id = scrapy.Field() + text = scrapy.Field() + url = scrapy.Field() + target = scrapy.Field() + view_count = scrapy.Field() + mark = scrapy.Field() + created_at = scrapy.Field() + ts = scrapy.Field() + source = scrapy.Field() diff --git a/backend/app/spiders/xueqiu/xueqiu/middlewares.py b/backend/app/spiders/xueqiu/xueqiu/middlewares.py new file mode 100755 index 00000000..f60102ce --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class XueqiuSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class XueqiuDownloaderMiddleware(object): + # Not all methods need to be defined. 
If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu/xueqiu/pipelines.py b/backend/app/spiders/xueqiu/xueqiu/pipelines.py new file mode 100755 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/xueqiu/xueqiu/settings.py b/backend/app/spiders/xueqiu/xueqiu/settings.py new file mode 100755 index 00000000..1d898e2f --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/settings.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for xueqiu project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'xueqiu' + +SPIDER_MODULES = ['xueqiu.spiders'] +NEWSPIDER_MODULE = 'xueqiu.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'xueqiu.middlewares.XueqiuSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'xueqiu.middlewares.XueqiuDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'crawlab.pipelines.CrawlabMongoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py b/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py b/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py new file mode 100755 index 00000000..a746e156 --- /dev/null +++ b/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +import json +from datetime import datetime +from time import sleep + +import scrapy + +from xueqiu.items import XueqiuItem + + +class XueqiuSpiderSpider(scrapy.Spider): + name = 'xueqiu_spider' + allowed_domains = ['xueqiu.com'] + + def start_requests(self): + return [scrapy.Request( + url='https://xueqiu.com', + callback=self.parse_home + )] + + def parse_home(self, response): + yield scrapy.Request( + url='https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=20&category=6' + ) + + def parse(self, response): + data = json.loads(response.body) + next_max_id = data.get('next_max_id') + sleep(1) + for row in data.get('list'): + d = json.loads(row.get('data')) + item = XueqiuItem( + id=d['id'], + text=d['text'], + mark=d['mark'], + url=d['target'], + created_at=d['created_at'], + ts=datetime.fromtimestamp(d['created_at'] / 1e3), + view_count=d['view_count'], + source='xueqiu' + ) + yield item + + yield scrapy.Request( + url=f'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id={next_max_id}&count=20&category=6' + ) diff --git a/backend/app/spiders/xueqiu_config/Spiderfile b/backend/app/spiders/xueqiu_config/Spiderfile new file mode 100755 index 00000000..0de50e9e --- /dev/null +++ b/backend/app/spiders/xueqiu_config/Spiderfile @@ -0,0 +1,39 @@ +name: "xueqiu_config" +display_name: "雪球网(可配置)" +remark: "雪球网新闻,列表" +type: "configurable" +col: "results_xueqiu_config" +engine: scrapy +start_url: https://xueqiu.com/ +start_stage: list +stages: +- name: list + is_list: true + list_css: "" + list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")] + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: title + css: h3 > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: h3 > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: p + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/xueqiu_config/config_spider/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/xueqiu_config/config_spider/items.py b/backend/app/spiders/xueqiu_config/config_spider/items.py new file mode 100755 index 00000000..9282765f --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/items.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + diff --git a/backend/app/spiders/xueqiu_config/config_spider/middlewares.py b/backend/app/spiders/xueqiu_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# 
Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu_config/config_spider/pipelines.py b/backend/app/spiders/xueqiu_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/xueqiu_config/config_spider/settings.py b/backend/app/spiders/xueqiu_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..79d4636b --- /dev/null +++ b/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='https://xueqiu.com/', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.xpath('.//*[contains(@class, "AnonymousHome_home__timeline__item")]'): + item = Item() + item['title'] = elem.css('h3 > a::text').extract_first() + item['url'] = elem.css('h3 > a::attr("href")').extract_first() + item['abstract'] = elem.css('p::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + + diff --git a/backend/app/spiders/xueqiu_config/md5.txt b/backend/app/spiders/xueqiu_config/md5.txt new file mode 100755 index 00000000..39a6df77 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/md5.txt @@ -0,0 +1 @@ +e3da3aacb2d290cb179a79028fbfff9c diff --git a/backend/app/spiders/xueqiu_config/scrapy.cfg b/backend/app/spiders/xueqiu_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/xueqiu_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/app/spiders/zongheng_config/Spiderfile b/backend/app/spiders/zongheng_config/Spiderfile new file mode 100755 index 00000000..0163fac7 --- /dev/null +++ b/backend/app/spiders/zongheng_config/Spiderfile @@ -0,0 +1,45 @@ +name: "zongheng_config" +display_name: "纵横(可配置)" +remark: "纵横小说网,列表" +type: "configurable" +col: "results_zongheng_config" +engine: scrapy +start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1 +start_stage: list +stages: +- name: list + is_list: true + list_css: .rank_d_list + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: href + fields: + - name: title + css: .rank_d_b_name > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: .rank_d_b_name > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: body + xpath: "" + attr: "" + 
next_stage: "" + remark: "" + - name: votes + css: .rank_d_b_ticket + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/zongheng_config/config_spider/__init__.py b/backend/app/spiders/zongheng_config/config_spider/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/backend/app/spiders/zongheng_config/config_spider/items.py b/backend/app/spiders/zongheng_config/config_spider/items.py new file mode 100755 index 00000000..528c3187 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/items.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Item(scrapy.Item): + _id = scrapy.Field() + task_id = scrapy.Field() + ts = scrapy.Field() + title = scrapy.Field() + url = scrapy.Field() + abstract = scrapy.Field() + votes = scrapy.Field() + diff --git a/backend/app/spiders/zongheng_config/config_spider/middlewares.py b/backend/app/spiders/zongheng_config/config_spider/middlewares.py new file mode 100755 index 00000000..e864bd0b --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ConfigSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ConfigSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+ s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/zongheng_config/config_spider/pipelines.py b/backend/app/spiders/zongheng_config/config_spider/pipelines.py new file mode 100755 index 00000000..69af4c85 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + +import os +from pymongo import MongoClient + +mongo = MongoClient( + host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', + port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), + username=os.environ.get('CRAWLAB_MONGO_USERNAME'), + password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), + authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' +) +db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] +col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] +task_id = os.environ.get('CRAWLAB_TASK_ID') + +class ConfigSpiderPipeline(object): + def process_item(self, item, spider): + item['task_id'] = task_id + if col is not None: + col.save(item) + return item diff --git a/backend/app/spiders/zongheng_config/config_spider/settings.py b/backend/app/spiders/zongheng_config/config_spider/settings.py new file mode 100755 index 00000000..4b0965f2 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/settings.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +import os +import re +import json + +# Scrapy settings for config_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'Crawlab Configurable Spider' + +SPIDER_MODULES = ['config_spider.spiders'] +NEWSPIDER_MODULE = 'config_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Crawlab Spider' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'config_spider.pipelines.ConfigSpiderPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: + setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') + setting_value = os.environ.get(setting_env_name) + if setting_value.lower() == 'true': + setting_value = True + elif setting_value.lower() == 'false': + setting_value = False + elif re.search(r'^\d+$', setting_value) is not None: + setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = 
json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + else: + pass + locals()[setting_name] = setting_value + diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py b/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py new file mode 100755 index 00000000..ebd689ac --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py b/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py new file mode 100755 index 00000000..cf1b6a08 --- /dev/null +++ b/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +import scrapy +import re +from config_spider.items import Item +from urllib.parse import urljoin, urlparse + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + url + return urljoin(response.url, url) + +class ConfigSpider(scrapy.Spider): + name = 'config_spider' + + def start_requests(self): + yield scrapy.Request(url='http://www.zongheng.com/rank/details.html?rt=1&d=1', callback=self.parse_list) + + def parse_list(self, response): + prev_item = response.meta.get('item') + for elem in response.css('.rank_d_list'): + item = Item() + item['title'] = elem.css('.rank_d_b_name > a::text').extract_first() + item['url'] = elem.css('.rank_d_b_name > a::attr("href")').extract_first() + item['abstract'] = elem.css('body::text').extract_first() + item['votes'] = elem.css('.rank_d_b_ticket::text').extract_first() + if prev_item is not None: + for key, value in prev_item.items(): + item[key] = value + yield item + + diff --git a/backend/app/spiders/zongheng_config/md5.txt b/backend/app/spiders/zongheng_config/md5.txt new file mode 100755 index 00000000..46fd3de6 --- /dev/null +++ b/backend/app/spiders/zongheng_config/md5.txt @@ -0,0 +1 @@ +82cb98a6103fb878501df81f191703ba diff --git a/backend/app/spiders/zongheng_config/scrapy.cfg b/backend/app/spiders/zongheng_config/scrapy.cfg new file mode 100755 index 00000000..a78d91e3 --- /dev/null +++ b/backend/app/spiders/zongheng_config/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = config_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = config_spider diff --git a/backend/conf/config.yml b/backend/conf/config.yml index 17341e95..1c2c8507 100644 --- a/backend/conf/config.yml +++ b/backend/conf/config.yml @@ -46,6 +46,8 @@ setting: demoSpiders: "N" checkScrapy: "Y" autoInstall: "Y" + esClient: "" # Your ES client, for example, http://192.168.1.1:9200 or http://your-domain.com, if not use es, set empty + spiderLogIndex: "spider-log" # Index pattern for kibana, need to config on kibana notification: mail: server: '' diff --git a/backend/config/config.go b/backend/config/config.go index e4c4616c..79be808e 100644 --- a/backend/config/config.go +++ b/backend/config/config.go @@ -53,3 +53,5 @@ func InitConfig(cfg string) error { return nil } + + diff --git a/backend/database/es_base.go 
b/backend/database/es_base.go new file mode 100644 index 00000000..b255958a --- /dev/null +++ b/backend/database/es_base.go @@ -0,0 +1,44 @@ +package database + +import ( + "context" + "github.com/apex/log" + "github.com/olivere/elastic/v7" + "github.com/satori/go.uuid" + "github.com/spf13/viper" + "sync" + "time" +) + +var doOnce sync.Once +var ctx context.Context +var ESClient *elastic.Client + +func InitEsClient() { + esClientStr := viper.GetString("setting.esClient") + ctx = context.Background() + ESClient, _ = elastic.NewClient(elastic.SetURL(esClientStr), elastic.SetSniff(false)) +} + +// WriteMsgToES reads log messages from the channel and writes them into Elasticsearch +func WriteMsgToES(when time.Time, msg chan string, index string) { + doOnce.Do(InitEsClient) + vals := make(map[string]interface{}) + vals["@timestamp"] = when.Format(time.RFC3339) + for { + select { + case vals["@msg"] = <-msg: + uid := uuid.NewV4().String() + _, err := ESClient.Index().Index(index).Id(uid).BodyJson(vals).Refresh("wait_for").Do(ctx) + if err != nil { + log.Error(err.Error()) + log.Error("send msg log to es error") + return + } + case <-time.After(6 * time.Second): + return + } + } + + return +} diff --git a/backend/go.mod index d91a1a84..7503389a 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -3,7 +3,10 @@ module crawlab go 1.12 require ( + github.com/Masterminds/semver v1.4.2 // indirect + github.com/Masterminds/sprig v2.16.0+incompatible // indirect github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd + github.com/aokoli/goutils v1.0.1 // indirect github.com/apex/log v1.1.1 github.com/dgrijalva/jwt-go v3.2.0+incompatible github.com/fsnotify/fsnotify v1.4.7 @@ -12,15 +15,21 @@ require ( github.com/go-playground/locales v0.12.1 // indirect github.com/go-playground/universal-translator v0.16.0 // indirect github.com/gomodule/redigo v2.0.0+incompatible + github.com/huandu/xstrings v1.2.0 // indirect + github.com/imdario/mergo v0.3.6 // indirect github.com/imroc/req v0.2.4 + github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0 // indirect github.com/leodido/go-urn v1.1.0 // indirect github.com/matcornic/hermes v1.2.0 - github.com/matcornic/hermes/v2 v2.0.2 // indirect - github.com/pkg/errors v0.8.1 - github.com/royeo/dingrobot v1.0.0 // indirect + github.com/mattn/go-runewidth v0.0.3 // indirect + github.com/olekukonko/tablewriter v0.0.1 // indirect + github.com/olivere/elastic/v7 v7.0.14 + github.com/pkg/errors v0.9.1 github.com/satori/go.uuid v1.2.0 + github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337 github.com/spf13/viper v1.4.0 + github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect gopkg.in/go-playground/validator.v9 v9.29.1 gopkg.in/gomail.v2 v2.0.0-20150902115704-41f357289737 diff --git a/backend/go.sum index 463abbee..1a253f5d 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -8,9 +8,11 @@ github.com/Masterminds/sprig v2.16.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuN github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd h1:+CYOsXi89xOqBkj7CuEJjA2It+j+R3ngUZEydr6mtkw= github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd/go.mod h1:wngxua9XCNjvHjDiTiV26DaKDT+0c63QR6H5hjVUUxw= +github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7 h1:uSoVVbwJiQipAclBbw+8quDsfcvFjOpI5iCf4p/cqCs=
github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7/go.mod h1:6zEj6s6u/ghQa61ZWa/C2Aw3RkjiTBOix7dkqa1VLIs= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 h1:kFOfPq6dUM1hTo4JG6LR5AXSUEsOjtdm0kw0FtQtMJA= github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c= github.com/aokoli/goutils v1.0.1 h1:7fpzNGoJ3VA8qcrm++XEE1QUe0mIwNeLa02Nwq7RDkg= github.com/aokoli/goutils v1.0.1/go.mod h1:SijmP0QR8LtwsmDs8Yii5Z/S4trXFGFC2oO5g9DP+DQ= @@ -19,8 +21,10 @@ github.com/apex/log v1.1.1/go.mod h1:Ls949n1HFtXfbDcjiTTFQqkVUrte0puoIBfO3SVgwOA github.com/aphistic/golf v0.0.0-20180712155816-02c07f170c5a/go.mod h1:3NqKYiepwy8kCu4PNA+aP7WUV72eXWJeP9/r3/K9aLE= github.com/aphistic/sweet v0.2.0/go.mod h1:fWDlIh/isSE9n6EPsRmC0det+whmX6dJid3stzu0Xys= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/aws/aws-sdk-go v1.20.6/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= +github.com/aws/aws-sdk-go v1.30.7/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0= github.com/aybabtme/rgbterm v0.0.0-20170906152045-cc83f3b3ce59/go.mod h1:q/89r3U2H7sSsE2t6Kca0lfwTK8JdoNGS/yzM/4iH5I= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= @@ -41,7 +45,10 @@ github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8 github.com/emirpasic/gods v1.12.0 h1:QAUIPSaCu4G+POclxeqb3F+WPpdKqFGlw36+yOzGlrg= github.com/emirpasic/gods v1.12.0/go.mod h1:YfzfFFoVP/catgzJb4IKIqXjX78Ha8FMSDh3ymbK86o= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 h1:BHsljHzVlRcyQhjrss6TZTdY2VfCqZPbv5k3iBFa2ZQ= github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc= +github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= +github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= @@ -49,6 +56,7 @@ github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3 h1:t8FVkw33L+wilf2 github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= github.com/gin-gonic/gin v1.4.0 h1:3tMoCCfM7ppqsR0ptz/wi1impNpT7/9wQtMZ8lr1mCQ= github.com/gin-gonic/gin v1.4.0/go.mod h1:OW2EZn3DO8Ln9oIKOvM++LBO+5UPHJJDH72/q/3rZdM= +github.com/gliderlabs/ssh v0.2.2 h1:6zsha5zo/TWhRhwqCD3+EarCAgZ2yN28ipRnGPnwkI0= github.com/gliderlabs/ssh v0.2.2/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= github.com/globalsign/mgo v0.0.0-20181015135952-eeefdecb41b8 
h1:DujepqpGd1hyOd7aW59XpK7Qymp8iy83xq74fLr21is= github.com/globalsign/mgo v0.0.0-20181015135952-eeefdecb41b8/go.mod h1:xkRDCp4j0OGD1HRkm4kmhM+pmpv3AKq5SU7GMg4oO/Q= @@ -59,11 +67,13 @@ github.com/go-playground/locales v0.12.1 h1:2FITxuFt/xuCNP1Acdhv62OzaCiviiE4kotf github.com/go-playground/locales v0.12.1/go.mod h1:IUMDtCfWo/w/mtMfIE/IG2K+Ey3ygWanZIBtBW0W2TM= github.com/go-playground/universal-translator v0.16.0 h1:X++omBR/4cE2MNg91AoC3rmGrCjJ8eAeUP/K/EKx4DM= github.com/go-playground/universal-translator v0.16.0/go.mod h1:1AnU7NaIRDWWzGEKwgtJRd2xk99HeFyHw3yid4rvQIY= +github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= @@ -72,8 +82,10 @@ github.com/gomodule/redigo v2.0.0+incompatible h1:K/R+8tc58AaqLkqG2Ol3Qk+DR/TlNu github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/uuid v1.1.1 h1:Gkbcsh/GbpXz7lPftLA3P6TYMwjCLYm83jiFQZF/3gY= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= @@ -97,6 +109,7 @@ github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOl github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo= github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= +github.com/jmespath/go-jmespath v0.3.0/go.mod h1:9QtRXoHjLGCJ5IBSaohpXITPlowMeeYCZ7fLUTSywik= github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= github.com/jpillora/backoff v0.0.0-20180909062703-3050d21c67d7/go.mod h1:2iMrUgbbvHEiQClaW2NsSzMyGHqN+rDFqY705q49KG0= github.com/json-iterator/go v1.1.6 h1:MrUvLMLTMxbqFJ9kzlvat/rYZqZnW3u4wkLzWTaFwKs= @@ -120,9 +133,10 @@ github.com/leodido/go-urn v1.1.0 h1:Sm1gr51B1kKyfD2BlRcLSiEkffoG96g6TPv6eRoEiB8= 
github.com/leodido/go-urn v1.1.0/go.mod h1:+cyI34gQWZcE1eQU7NVgKkkzdXDQHr1dBMtdAPozLkw= github.com/magiconair/properties v1.8.0 h1:LLgXmsheXeRoUOBOjtwPQCWIYqM/LU1ayDtDePerRcY= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= +github.com/mailru/easyjson v0.7.1 h1:mdxE1MF9o53iCb2Ghj1VfWvh7ZOwHpnVG/xwXrV90U8= +github.com/mailru/easyjson v0.7.1/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs= github.com/matcornic/hermes v1.2.0 h1:AuqZpYcTOtTB7cahdevLfnhIpfzmpqw5Czv8vpdnFDU= github.com/matcornic/hermes v1.2.0/go.mod h1:lujJomb016Xjv8wBnWlNvUdtmvowjjfkqri5J/+1hYc= -github.com/matcornic/hermes/v2 v2.0.2/go.mod h1:iVsJWSIS4NtMNtgan22sy6lt7pImok7bATGPWCoaKNY= github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= @@ -145,14 +159,19 @@ github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRW github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/olekukonko/tablewriter v0.0.1 h1:b3iUnf1v+ppJiOfNX4yxxqfWKMQPZR5yoh8urCTFX88= github.com/olekukonko/tablewriter v0.0.1/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo= +github.com/olivere/elastic/v7 v7.0.14 h1:89dYPg6kD3WJx42ZtO4U6WDIzRy69FvQqz/yRiwekuM= +github.com/olivere/elastic/v7 v7.0.14/go.mod h1:+FgncZ8ho1QF3NlBo77XbuoTKYHhvEOfFZKIAfHnnDE= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/gomega v1.5.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= +github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/pelletier/go-buffruneio v0.2.0/go.mod h1:JkE26KsDizTr40EUHkXVtNPvgGtbSNq5BcowyYOWdKo= github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= @@ -166,9 +185,6 @@ github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7z github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/fastuuid v1.1.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= -github.com/royeo/dingrobot v1.0.0 h1:K4GrF+fOecNX0yi+oBKpfh7z0XP/8TzaIIHu1B2kKUQ= -github.com/royeo/dingrobot v1.0.0/go.mod h1:RqDM8E/hySCVwI2aUFRJAUGDcHHRnIhzNmbNG3bamQs= -github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= github.com/satori/go.uuid v1.2.0/go.mod 
h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ= @@ -205,6 +221,9 @@ github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoH github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/tj/assert v0.0.0-20171129193455-018094318fb0/go.mod h1:mZ9/Rh9oLWpLLDRpvE+3b7gP/C2YyLFYxNmcLnPTMe0= github.com/tj/go-elastic v0.0.0-20171221160941-36157cbbebc2/go.mod h1:WjeM0Oo1eNAjXGDx2yma7uG2XoyRZTq1uv3M/o7imD0= github.com/tj/go-kinesis v0.0.0-20171128231115-08b17f58cb1b/go.mod h1:/yhzCV0xPfx6jb1bBgRFjl5lytqVqZXEaeqWP8lTEao= @@ -217,24 +236,27 @@ github.com/xanzy/ssh-agent v0.2.1/go.mod h1:mLlQY/MoOhWBj+gOGMQkOeiEvkx+8pJSI+0B github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= +go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= -golang.org/x/crypto v0.0.0-20181029175232-7e6ffbd03851/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190219172222-a4c6cb3142f2/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734 h1:p/H982KKEjUnLJkM3tt/LemDnOc1GiZL5FCVlORJ5zo= golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 h1:HuIa8hRrWRSrqYzx1qI49NNxhdi2PrY7gxVSq1JjLDc= golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181029044818-c44066c5c816/go.mod 
h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= @@ -243,10 +265,13 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwL golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190724013045-ca1201d0de80 h1:Ao/3l156eZf2AW5wK8a7/smtodRU+gha3+BeqJ69lRk= golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -258,6 +283,7 @@ golang.org/x/sys v0.0.0-20190221075227-b4e8571b14e0/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e h1:D5TXcfTk7xF7hvieo4QErS3qqCB4teTffacDWr7CI+0= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= @@ -268,12 +294,18 @@ golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxb golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools 
v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190729092621-ff9f1409240a/go.mod h1:jcCCGcm9btYwXyDqrUWc6MKQKKGJCWEQ3AfLSRIbEuI= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.0/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc h1:2gGKlE2+asNV9m7xrywl36YYNnBG5ZQ0r/BOOxqPpmk= @@ -295,6 +327,7 @@ gopkg.in/russross/blackfriday.v2 v2.0.0 h1:+FlnIV8DSQnT7NZ43hcVKcdJdzZoeCmJj4Ql8 gopkg.in/russross/blackfriday.v2 v2.0.0/go.mod h1:6sSBNz/GtOm/pJTuh5UmBK2ZHfmnxGbl2NZg1UliSOI= gopkg.in/src-d/go-billy.v4 v4.3.2 h1:0SQA1pRztfTFx2miS8sA97XvooFeNOmvUenF4o0EcVg= gopkg.in/src-d/go-billy.v4 v4.3.2/go.mod h1:nDjArDMp+XMs1aFAESLRjfGSgfvoYN0hDfzEk0GjC98= +gopkg.in/src-d/go-git-fixtures.v3 v3.5.0 h1:ivZFOIltbce2Mo8IjzUHAFoq/IylO9WHhNOAJK+LsJg= gopkg.in/src-d/go-git-fixtures.v3 v3.5.0/go.mod h1:dLBcvytrw/TYZsNTWCnkNF2DSIlzWYqTe3rJR56Ac7g= gopkg.in/src-d/go-git.v4 v4.13.1 h1:SRtFyV8Kxc0UP7aCHcijOMQGPxHSmMOPrzulQWolkYE= gopkg.in/src-d/go-git.v4 v4.13.1/go.mod h1:nx5NYcxdKxq5fpltdHnPa2Exj4Sx0EclMWZQbYDu2z8= diff --git a/backend/model/task.go b/backend/model/task.go index 35e738ab..0b2ed0a9 100644 --- a/backend/model/task.go +++ b/backend/model/task.go @@ -508,3 +508,4 @@ func UpdateTaskErrorLogs(taskId string, errorRegexPattern string) error { return nil } + diff --git a/backend/services/task.go b/backend/services/task.go index a0bb9a49..16278d24 100644 --- a/backend/services/task.go +++ b/backend/services/task.go @@ -16,7 +16,7 @@ import ( "github.com/apex/log" "github.com/globalsign/mgo/bson" "github.com/imroc/req" - uuid "github.com/satori/go.uuid" + "github.com/satori/go.uuid" "github.com/spf13/viper" "net/http" "os" @@ -166,7 +166,11 @@ func SetEnv(cmd *exec.Cmd, envs []model.Env, task model.Task, spider model.Spide return cmd } -func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error { +func SetLogConfig(wg *sync.WaitGroup, cmd *exec.Cmd, t model.Task, u model.User) error { + + esChan := make(chan string, 1) + esClientStr := viper.GetString("setting.esClient") + spiderLogIndex := viper.GetString("setting.spiderLogIndex") // get stdout reader stdout, err := cmd.StdoutPipe() readerStdout := bufio.NewReader(stdout) @@ -191,7 +195,9 @@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error { isStderrFinished := false // periodically (1 sec) insert log items + wg.Add(3) go func() { + defer wg.Done() for { _ = model.AddLogItems(logs) logs = []model.LogItem{} @@ -211,6 +217,7 
@@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error { // read stdout go func() { + defer wg.Done() for { line, err := readerStdout.ReadString('\n') if err != nil { @@ -227,12 +234,18 @@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error { Ts: time.Now(), ExpireTs: time.Now().Add(time.Duration(expireDuration) * time.Second), } + esChan <- l.Message + if esClientStr != "" { + go database.WriteMsgToES(time.Now(), esChan, spiderLogIndex) + } + logs = append(logs, l) } }() // read stderr go func() { + defer wg.Done() for { line, err := readerStderr.ReadString('\n') if err != nil { @@ -249,10 +262,15 @@ func SetLogConfig(cmd *exec.Cmd, t model.Task, u model.User) error { Ts: time.Now(), ExpireTs: time.Now().Add(time.Duration(expireDuration) * time.Second), } + esChan <- l.Message + if esClientStr != "" { + go database.WriteMsgToES(time.Now(), esChan, spiderLogIndex) + } logs = append(logs, l) } }() + wg.Wait() return nil } @@ -337,6 +355,8 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider, u log.Infof("cwd: %s", cwd) log.Infof("cmd: %s", cmdStr) + wg := &sync.WaitGroup{} + // 生成执行命令 var cmd *exec.Cmd if runtime.GOOS == constants.Windows { @@ -349,9 +369,7 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider, u cmd.Dir = cwd // 日志配置 - if err := SetLogConfig(cmd, t, u); err != nil { - return err - } + go SetLogConfig(wg, cmd, t, u) // 环境变量配置 envs := s.Envs From 81c9ef7daafcf94744382304fd4d83a390eef770 Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 27 Apr 2020 16:01:24 +0800 Subject: [PATCH 02/11] support send log to ES --- backend/app/spiders/amazon_config/Spiderfile | 51 -------- .../amazon_config/config_spider/__init__.py | 0 .../amazon_config/config_spider/items.py | 20 ---- .../config_spider/middlewares.py | 103 ---------------- .../amazon_config/config_spider/pipelines.py | 27 ----- .../amazon_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 37 ------ backend/app/spiders/amazon_config/md5.txt | 1 - backend/app/spiders/amazon_config/scrapy.cfg | 11 -- .../app/spiders/autohome_config/Spiderfile | 57 --------- .../autohome_config/config_spider/__init__.py | 0 .../autohome_config/config_spider/items.py | 21 ---- .../config_spider/middlewares.py | 103 ---------------- .../config_spider/pipelines.py | 27 ----- .../autohome_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 38 ------ backend/app/spiders/autohome_config/md5.txt | 1 - .../app/spiders/autohome_config/scrapy.cfg | 11 -- backend/app/spiders/baidu_config/Spiderfile | 39 ------ .../baidu_config/config_spider/__init__.py | 0 .../baidu_config/config_spider/items.py | 18 --- .../baidu_config/config_spider/middlewares.py | 103 ---------------- .../baidu_config/config_spider/pipelines.py | 27 ----- .../baidu_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 35 ------ backend/app/spiders/baidu_config/md5.txt | 1 - backend/app/spiders/baidu_config/scrapy.cfg | 11 -- backend/app/spiders/bing_general/Spiderfile | 6 - .../app/spiders/bing_general/bing_spider.py | 41 ------- backend/app/spiders/bing_general/md5.txt | 1 - backend/app/spiders/chinaz/Spiderfile | 5 - backend/app/spiders/chinaz/chinaz/__init__.py | 0 backend/app/spiders/chinaz/chinaz/items.py | 21 ---- 
.../app/spiders/chinaz/chinaz/middlewares.py | 103 ---------------- .../app/spiders/chinaz/chinaz/pipelines.py | 7 -- backend/app/spiders/chinaz/chinaz/settings.py | 90 -------------- .../spiders/chinaz/chinaz/spiders/__init__.py | 4 - .../chinaz/chinaz/spiders/chinaz_spider.py | 63 ---------- backend/app/spiders/chinaz/md5.txt | 1 - backend/app/spiders/chinaz/scrapy.cfg | 11 -- backend/app/spiders/csdn_config/Spiderfile | 60 ---------- .../csdn_config/config_spider/__init__.py | 0 .../csdn_config/config_spider/items.py | 20 ---- .../csdn_config/config_spider/middlewares.py | 103 ---------------- .../csdn_config/config_spider/pipelines.py | 27 ----- .../csdn_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 41 ------- backend/app/spiders/csdn_config/md5.txt | 1 - backend/app/spiders/csdn_config/scrapy.cfg | 11 -- backend/app/spiders/douban_config/Spiderfile | 57 --------- .../douban_config/config_spider/__init__.py | 0 .../douban_config/config_spider/items.py | 21 ---- .../config_spider/middlewares.py | 103 ---------------- .../douban_config/config_spider/pipelines.py | 27 ----- .../douban_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 36 ------ backend/app/spiders/douban_config/md5.txt | 1 - backend/app/spiders/douban_config/scrapy.cfg | 11 -- backend/app/spiders/jd/Spiderfile | 5 - backend/app/spiders/jd/jd/__init__.py | 0 backend/app/spiders/jd/jd/items.py | 15 --- backend/app/spiders/jd/jd/middlewares.py | 103 ---------------- backend/app/spiders/jd/jd/pipelines.py | 6 - backend/app/spiders/jd/jd/settings.py | 90 -------------- backend/app/spiders/jd/jd/spiders/__init__.py | 4 - .../app/spiders/jd/jd/spiders/jd_spider.py | 21 ---- backend/app/spiders/jd/md5.txt | 1 - backend/app/spiders/jd/scrapy.cfg | 11 -- backend/app/spiders/sinastock/Spiderfile | 5 - backend/app/spiders/sinastock/md5.txt | 1 - backend/app/spiders/sinastock/scrapy.cfg | 11 -- .../spiders/sinastock/sinastock/__init__.py | 0 .../app/spiders/sinastock/sinastock/items.py | 21 ---- .../sinastock/sinastock/middlewares.py | 103 ---------------- .../spiders/sinastock/sinastock/pipelines.py | 6 - .../spiders/sinastock/sinastock/settings.py | 89 -------------- .../sinastock/sinastock/spiders/__init__.py | 4 - .../sinastock/spiders/sinastock_spider.py | 59 ---------- backend/app/spiders/v2ex_config/Spiderfile | 54 --------- .../v2ex_config/config_spider/__init__.py | 0 .../v2ex_config/config_spider/items.py | 19 --- .../v2ex_config/config_spider/middlewares.py | 103 ---------------- .../v2ex_config/config_spider/pipelines.py | 27 ----- .../v2ex_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 38 ------ backend/app/spiders/v2ex_config/md5.txt | 1 - backend/app/spiders/v2ex_config/scrapy.cfg | 11 -- backend/app/spiders/xueqiu/Spiderfile | 5 - backend/app/spiders/xueqiu/md5.txt | 1 - backend/app/spiders/xueqiu/scrapy.cfg | 11 -- backend/app/spiders/xueqiu/xueqiu/__init__.py | 0 backend/app/spiders/xueqiu/xueqiu/items.py | 23 ---- .../app/spiders/xueqiu/xueqiu/middlewares.py | 103 ---------------- .../app/spiders/xueqiu/xueqiu/pipelines.py | 6 - backend/app/spiders/xueqiu/xueqiu/settings.py | 89 -------------- .../spiders/xueqiu/xueqiu/spiders/__init__.py | 4 - .../xueqiu/xueqiu/spiders/xueqiu_spider.py | 46 -------- backend/app/spiders/xueqiu_config/Spiderfile 
| 39 ------ .../xueqiu_config/config_spider/__init__.py | 0 .../xueqiu_config/config_spider/items.py | 18 --- .../config_spider/middlewares.py | 103 ---------------- .../xueqiu_config/config_spider/pipelines.py | 27 ----- .../xueqiu_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 33 ------ backend/app/spiders/xueqiu_config/md5.txt | 1 - backend/app/spiders/xueqiu_config/scrapy.cfg | 11 -- .../app/spiders/zongheng_config/Spiderfile | 45 ------- .../zongheng_config/config_spider/__init__.py | 0 .../zongheng_config/config_spider/items.py | 19 --- .../config_spider/middlewares.py | 103 ---------------- .../config_spider/pipelines.py | 27 ----- .../zongheng_config/config_spider/settings.py | 111 ------------------ .../config_spider/spiders/__init__.py | 4 - .../config_spider/spiders/spider.py | 34 ------ backend/app/spiders/zongheng_config/md5.txt | 1 - .../app/spiders/zongheng_config/scrapy.cfg | 11 -- 123 files changed, 4102 deletions(-) delete mode 100755 backend/app/spiders/amazon_config/Spiderfile delete mode 100755 backend/app/spiders/amazon_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/items.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/settings.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/amazon_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/amazon_config/md5.txt delete mode 100755 backend/app/spiders/amazon_config/scrapy.cfg delete mode 100755 backend/app/spiders/autohome_config/Spiderfile delete mode 100755 backend/app/spiders/autohome_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/items.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/settings.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/autohome_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/autohome_config/md5.txt delete mode 100755 backend/app/spiders/autohome_config/scrapy.cfg delete mode 100755 backend/app/spiders/baidu_config/Spiderfile delete mode 100755 backend/app/spiders/baidu_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/items.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/settings.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/baidu_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/baidu_config/md5.txt delete mode 100755 backend/app/spiders/baidu_config/scrapy.cfg delete mode 100755 backend/app/spiders/bing_general/Spiderfile delete mode 100755 backend/app/spiders/bing_general/bing_spider.py delete mode 100755 backend/app/spiders/bing_general/md5.txt delete mode 100755 
backend/app/spiders/chinaz/Spiderfile delete mode 100755 backend/app/spiders/chinaz/chinaz/__init__.py delete mode 100755 backend/app/spiders/chinaz/chinaz/items.py delete mode 100755 backend/app/spiders/chinaz/chinaz/middlewares.py delete mode 100755 backend/app/spiders/chinaz/chinaz/pipelines.py delete mode 100755 backend/app/spiders/chinaz/chinaz/settings.py delete mode 100755 backend/app/spiders/chinaz/chinaz/spiders/__init__.py delete mode 100755 backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py delete mode 100755 backend/app/spiders/chinaz/md5.txt delete mode 100755 backend/app/spiders/chinaz/scrapy.cfg delete mode 100755 backend/app/spiders/csdn_config/Spiderfile delete mode 100755 backend/app/spiders/csdn_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/items.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/settings.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/csdn_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/csdn_config/md5.txt delete mode 100755 backend/app/spiders/csdn_config/scrapy.cfg delete mode 100755 backend/app/spiders/douban_config/Spiderfile delete mode 100755 backend/app/spiders/douban_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/douban_config/config_spider/items.py delete mode 100755 backend/app/spiders/douban_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/douban_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/douban_config/config_spider/settings.py delete mode 100755 backend/app/spiders/douban_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/douban_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/douban_config/md5.txt delete mode 100755 backend/app/spiders/douban_config/scrapy.cfg delete mode 100755 backend/app/spiders/jd/Spiderfile delete mode 100755 backend/app/spiders/jd/jd/__init__.py delete mode 100755 backend/app/spiders/jd/jd/items.py delete mode 100755 backend/app/spiders/jd/jd/middlewares.py delete mode 100755 backend/app/spiders/jd/jd/pipelines.py delete mode 100755 backend/app/spiders/jd/jd/settings.py delete mode 100755 backend/app/spiders/jd/jd/spiders/__init__.py delete mode 100755 backend/app/spiders/jd/jd/spiders/jd_spider.py delete mode 100755 backend/app/spiders/jd/md5.txt delete mode 100755 backend/app/spiders/jd/scrapy.cfg delete mode 100755 backend/app/spiders/sinastock/Spiderfile delete mode 100755 backend/app/spiders/sinastock/md5.txt delete mode 100755 backend/app/spiders/sinastock/scrapy.cfg delete mode 100755 backend/app/spiders/sinastock/sinastock/__init__.py delete mode 100755 backend/app/spiders/sinastock/sinastock/items.py delete mode 100755 backend/app/spiders/sinastock/sinastock/middlewares.py delete mode 100755 backend/app/spiders/sinastock/sinastock/pipelines.py delete mode 100755 backend/app/spiders/sinastock/sinastock/settings.py delete mode 100755 backend/app/spiders/sinastock/sinastock/spiders/__init__.py delete mode 100755 backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py delete mode 100755 backend/app/spiders/v2ex_config/Spiderfile delete mode 100755 backend/app/spiders/v2ex_config/config_spider/__init__.py delete mode 100755 
backend/app/spiders/v2ex_config/config_spider/items.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/settings.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/v2ex_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/v2ex_config/md5.txt delete mode 100755 backend/app/spiders/v2ex_config/scrapy.cfg delete mode 100755 backend/app/spiders/xueqiu/Spiderfile delete mode 100755 backend/app/spiders/xueqiu/md5.txt delete mode 100755 backend/app/spiders/xueqiu/scrapy.cfg delete mode 100755 backend/app/spiders/xueqiu/xueqiu/__init__.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/items.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/middlewares.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/pipelines.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/settings.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py delete mode 100755 backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py delete mode 100755 backend/app/spiders/xueqiu_config/Spiderfile delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/items.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/settings.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/xueqiu_config/md5.txt delete mode 100755 backend/app/spiders/xueqiu_config/scrapy.cfg delete mode 100755 backend/app/spiders/zongheng_config/Spiderfile delete mode 100755 backend/app/spiders/zongheng_config/config_spider/__init__.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/items.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/middlewares.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/pipelines.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/settings.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py delete mode 100755 backend/app/spiders/zongheng_config/config_spider/spiders/spider.py delete mode 100755 backend/app/spiders/zongheng_config/md5.txt delete mode 100755 backend/app/spiders/zongheng_config/scrapy.cfg diff --git a/backend/app/spiders/amazon_config/Spiderfile b/backend/app/spiders/amazon_config/Spiderfile deleted file mode 100755 index eea8a538..00000000 --- a/backend/app/spiders/amazon_config/Spiderfile +++ /dev/null @@ -1,51 +0,0 @@ -name: "amazon_config" -display_name: "亚马逊中国(可配置)" -remark: "亚马逊中国搜索手机,列表+分页" -type: "configurable" -col: "results_amazon_config" -engine: scrapy -start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2 -start_stage: list -stages: -- name: list - is_list: true - list_css: .s-result-item - list_xpath: "" - page_css: .a-last > a - page_xpath: "" - page_attr: href - fields: - - name: title - css: span.a-text-normal - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - 
css: .a-link-normal - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: price - css: "" - xpath: .//*[@class="a-price-whole"] - attr: "" - next_stage: "" - remark: "" - - name: price_fraction - css: "" - xpath: .//*[@class="a-price-fraction"] - attr: "" - next_stage: "" - remark: "" - - name: img - css: .s-image-square-aspect > img - xpath: "" - attr: src - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/amazon_config/config_spider/__init__.py b/backend/app/spiders/amazon_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/amazon_config/config_spider/items.py b/backend/app/spiders/amazon_config/config_spider/items.py deleted file mode 100755 index 79bf0adb..00000000 --- a/backend/app/spiders/amazon_config/config_spider/items.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - price = scrapy.Field() - price_fraction = scrapy.Field() - img = scrapy.Field() - diff --git a/backend/app/spiders/amazon_config/config_spider/middlewares.py b/backend/app/spiders/amazon_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/amazon_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). 
- for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/amazon_config/config_spider/pipelines.py b/backend/app/spiders/amazon_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/amazon_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/amazon_config/config_spider/settings.py b/backend/app/spiders/amazon_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/amazon_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py b/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/amazon_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/amazon_config/config_spider/spiders/spider.py b/backend/app/spiders/amazon_config/config_spider/spiders/spider.py deleted file mode 100755 index a7421df3..00000000 --- a/backend/app/spiders/amazon_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.s-result-item'): - item = Item() - item['title'] = elem.css('span.a-text-normal::text').extract_first() - item['url'] = elem.css('.a-link-normal::attr("href")').extract_first() - item['price'] = elem.xpath('string(.//*[@class="a-price-whole"])').extract_first() - item['price_fraction'] = elem.xpath('string(.//*[@class="a-price-fraction"])').extract_first() - item['img'] = elem.css('.s-image-square-aspect > img::attr("src")').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - next_url = response.css('.a-last > a::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - diff --git a/backend/app/spiders/amazon_config/md5.txt b/backend/app/spiders/amazon_config/md5.txt deleted file mode 100755 index 52c5423f..00000000 --- a/backend/app/spiders/amazon_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -4b716dd3c15b993ccb7a9f0be1cc0de9 diff --git a/backend/app/spiders/amazon_config/scrapy.cfg b/backend/app/spiders/amazon_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/amazon_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/autohome_config/Spiderfile b/backend/app/spiders/autohome_config/Spiderfile deleted file mode 100755 index e69880cb..00000000 --- a/backend/app/spiders/autohome_config/Spiderfile +++ /dev/null @@ -1,57 +0,0 @@ -name: "autohome_config" -display_name: "汽车之家(可配置)" -remark: "汽车之家文章,列表+详情+分页" -type: "configurable" -col: 
"results_autohome_config" -engine: scrapy -start_url: https://www.autohome.com.cn/all/ -start_stage: list -stages: -- name: list - is_list: true - list_css: ul.article > li - list_xpath: "" - page_css: a.page-item-next - page_xpath: "" - page_attr: href - fields: - - name: title - css: li > a > h3 - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: li > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: abstract - css: li > a > p - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: time - css: li > a .fn-left - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: views - css: li > a .fn-right > em:first-child - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: comments - css: li > a .fn-right > em:last-child - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/autohome_config/config_spider/__init__.py b/backend/app/spiders/autohome_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/autohome_config/config_spider/items.py b/backend/app/spiders/autohome_config/config_spider/items.py deleted file mode 100755 index 206203d5..00000000 --- a/backend/app/spiders/autohome_config/config_spider/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - time = scrapy.Field() - views = scrapy.Field() - comments = scrapy.Field() - diff --git a/backend/app/spiders/autohome_config/config_spider/middlewares.py b/backend/app/spiders/autohome_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/autohome_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. 
- pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/autohome_config/config_spider/pipelines.py b/backend/app/spiders/autohome_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/autohome_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/autohome_config/config_spider/settings.py b/backend/app/spiders/autohome_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/autohome_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains 
only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', 
setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py b/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/autohome_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/autohome_config/config_spider/spiders/spider.py b/backend/app/spiders/autohome_config/config_spider/spiders/spider.py deleted file mode 100755 index 83753f5a..00000000 --- a/backend/app/spiders/autohome_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://www.autohome.com.cn/all/', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('ul.article > li'): - item = Item() - item['title'] = elem.css('li > a > h3::text').extract_first() - item['url'] = elem.css('li > a::attr("href")').extract_first() - item['abstract'] = elem.css('li > a > p::text').extract_first() - item['time'] = elem.css('li > a .fn-left::text').extract_first() - item['views'] = elem.css('li > a .fn-right > em:first-child::text').extract_first() - item['comments'] = elem.css('li > a .fn-right > em:last-child::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - next_url = response.css('a.page-item-next::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - diff --git a/backend/app/spiders/autohome_config/md5.txt b/backend/app/spiders/autohome_config/md5.txt deleted file mode 100755 index c4707adf..00000000 --- a/backend/app/spiders/autohome_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -d784a11085e298eaf344eadc3a3e9411 diff --git a/backend/app/spiders/autohome_config/scrapy.cfg b/backend/app/spiders/autohome_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/autohome_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/baidu_config/Spiderfile b/backend/app/spiders/baidu_config/Spiderfile deleted file mode 100755 index a29d4acb..00000000 --- a/backend/app/spiders/baidu_config/Spiderfile +++ /dev/null @@ -1,39 +0,0 @@ -name: "baidu_config" -display_name: "百度搜索(可配置)" -remark: "百度搜索Crawlab,列表+分页" -type: "configurable" -col: 
"results_baidu_config" -engine: scrapy -start_url: http://www.baidu.com/s?wd=crawlab -start_stage: list -stages: -- name: list - is_list: true - list_css: ".result.c-container" - list_xpath: "" - page_css: "a.n" - page_xpath: "" - page_attr: href - fields: - - name: title - css: "" - xpath: .//h3/a - attr: "" - next_stage: "" - remark: "" - - name: url - css: "" - xpath: .//h3/a - attr: href - next_stage: "" - remark: "" - - name: abstract - css: "" - xpath: .//*[@class="c-abstract"] - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/baidu_config/config_spider/__init__.py b/backend/app/spiders/baidu_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/baidu_config/config_spider/items.py b/backend/app/spiders/baidu_config/config_spider/items.py deleted file mode 100755 index 9282765f..00000000 --- a/backend/app/spiders/baidu_config/config_spider/items.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - diff --git a/backend/app/spiders/baidu_config/config_spider/middlewares.py b/backend/app/spiders/baidu_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/baidu_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). 
- for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/baidu_config/config_spider/pipelines.py b/backend/app/spiders/baidu_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/baidu_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/baidu_config/config_spider/settings.py b/backend/app/spiders/baidu_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/baidu_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py b/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/baidu_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/baidu_config/config_spider/spiders/spider.py b/backend/app/spiders/baidu_config/config_spider/spiders/spider.py deleted file mode 100755 index e5fd793f..00000000 --- a/backend/app/spiders/baidu_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='http://www.baidu.com/s?wd=crawlab', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.result.c-container'): - item = Item() - item['title'] = elem.xpath('string(.//h3/a)').extract_first() - item['url'] = elem.xpath('.//h3/a/@href').extract_first() - item['abstract'] = elem.xpath('string(.//*[@class="c-abstract"])').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - next_url = response.css('a.n::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - diff --git a/backend/app/spiders/baidu_config/md5.txt b/backend/app/spiders/baidu_config/md5.txt deleted file mode 100755 index 32137b76..00000000 --- a/backend/app/spiders/baidu_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -ba25f6f3567b256473d3f0ec6af783fd diff --git a/backend/app/spiders/baidu_config/scrapy.cfg b/backend/app/spiders/baidu_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/baidu_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/bing_general/Spiderfile b/backend/app/spiders/bing_general/Spiderfile deleted file mode 100755 index 614c135e..00000000 --- a/backend/app/spiders/bing_general/Spiderfile +++ /dev/null @@ -1,6 +0,0 @@ -name: "bing_general" -display_name: "必应搜索 (通用)" -remark: "必应搜索 Crawlab,列表+分页" -col: "results_bing_general" -type: "customized" -cmd: "python bing_spider.py" \ No newline at end of file diff --git a/backend/app/spiders/bing_general/bing_spider.py b/backend/app/spiders/bing_general/bing_spider.py deleted file mode 100755 index e982e4ee..00000000 --- a/backend/app/spiders/bing_general/bing_spider.py +++ /dev/null @@ -1,41 +0,0 @@ -import 
requests -from bs4 import BeautifulSoup as bs -from urllib.parse import urljoin, urlparse -import re -from crawlab import save_item - -s = requests.Session() - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -def start_requests(): - for i in range(0, 9): - fr = 'PERE' if not i else 'MORE' - url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}' - request_page(url) - -def request_page(url): - print(f'requesting {url}') - r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}) - parse_list(r) - -def parse_list(response): - soup = bs(response.content.decode('utf-8')) - for el in list(soup.select('#b_results > li')): - try: - save_item({ - 'title': el.select_one('h2').text, - 'url': el.select_one('h2 a').attrs.get('href'), - 'abstract': el.select_one('.b_caption p').text, - }) - except: - pass - -if __name__ == '__main__': - start_requests() \ No newline at end of file diff --git a/backend/app/spiders/bing_general/md5.txt b/backend/app/spiders/bing_general/md5.txt deleted file mode 100755 index 42fb6afd..00000000 --- a/backend/app/spiders/bing_general/md5.txt +++ /dev/null @@ -1 +0,0 @@ -cf295b694a20c99c4857f838aa0402a7 diff --git a/backend/app/spiders/chinaz/Spiderfile b/backend/app/spiders/chinaz/Spiderfile deleted file mode 100755 index 2fb940bb..00000000 --- a/backend/app/spiders/chinaz/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "chinaz" -display_name: "站长之家 (Scrapy)" -col: "results_chinaz" -type: "customized" -cmd: "scrapy crawl chinaz_spider" \ No newline at end of file diff --git a/backend/app/spiders/chinaz/chinaz/__init__.py b/backend/app/spiders/chinaz/chinaz/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/chinaz/chinaz/items.py b/backend/app/spiders/chinaz/chinaz/items.py deleted file mode 100755 index 1fdcac1b..00000000 --- a/backend/app/spiders/chinaz/chinaz/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class ChinazItem(scrapy.Item): - # define the fields for your item here like: - _id = scrapy.Field() - task_id = scrapy.Field() - name = scrapy.Field() - domain = scrapy.Field() - description = scrapy.Field() - rank = scrapy.Field() - main_category = scrapy.Field() - category = scrapy.Field() - location = scrapy.Field() diff --git a/backend/app/spiders/chinaz/chinaz/middlewares.py b/backend/app/spiders/chinaz/chinaz/middlewares.py deleted file mode 100755 index c98995d5..00000000 --- a/backend/app/spiders/chinaz/chinaz/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ChinazSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. 
- s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ChinazDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/chinaz/chinaz/pipelines.py b/backend/app/spiders/chinaz/chinaz/pipelines.py deleted file mode 100755 index b29f9eb7..00000000 --- a/backend/app/spiders/chinaz/chinaz/pipelines.py +++ /dev/null @@ -1,7 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html - diff --git a/backend/app/spiders/chinaz/chinaz/settings.py b/backend/app/spiders/chinaz/chinaz/settings.py deleted file mode 100755 index 932ec9ed..00000000 --- a/backend/app/spiders/chinaz/chinaz/settings.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for chinaz project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'chinaz' - -SPIDER_MODULES = ['chinaz.spiders'] -NEWSPIDER_MODULE = 'chinaz.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'chinaz (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'chinaz.middlewares.ChinazSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'chinaz.middlewares.ChinazDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of 
requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/chinaz/chinaz/spiders/__init__.py b/backend/app/spiders/chinaz/chinaz/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/chinaz/chinaz/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py b/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py deleted file mode 100755 index 28ad84e7..00000000 --- a/backend/app/spiders/chinaz/chinaz/spiders/chinaz_spider.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -from chinaz.items import ChinazItem - - -class ChinazSpiderSpider(scrapy.Spider): - name = 'chinaz_spider' - allowed_domains = ['chinaz.com'] - start_urls = ['http://top.chinaz.com/hangye/'] - - def parse(self, response): - for item in response.css('.listCentent > li'): - name = item.css('h3.rightTxtHead > a::text').extract_first() - href = item.css('h3.rightTxtHead > a::attr("href")').extract_first() - domain = item.css('h3.rightTxtHead > span::text').extract_first() - description = item.css('p.RtCInfo::text').extract_first() - rank = item.css('.RtCRateCent > strong::text').extract_first() - rank = int(rank) - item = ChinazItem( - _id=domain, - name=name, - domain=domain, - description=description, - rank=rank, - ) - yield scrapy.Request( - url='http://top.chinaz.com' + href, - callback=self.parse_item, - meta={ - 'item': item - } - ) - - # pagination - a_list = response.css('.ListPageWrap > a::attr("href")').extract() - url = 'http://top.chinaz.com/hangye/' + a_list[-1] - yield scrapy.Request(url=url, callback=self.parse) - - def parse_item(self, response): - item = response.meta['item'] - - # category info extraction - arr = response.css('.TopMainTag-show .SimSun') - res1 = arr[0].css('a::text').extract() - main_category = res1[0] - if len(res1) == 1: - category = '其他' - else: - category = res1[1] - - # location info extraction - res2 = arr[1].css('a::text').extract() - if len(res2) > 0: - location = res2[0] - else: - location = '其他' - - # assign values to item - item['main_category'] = main_category - item['category'] = category - item['location'] = location - - yield item diff --git a/backend/app/spiders/chinaz/md5.txt b/backend/app/spiders/chinaz/md5.txt deleted file mode 100755 index f5e15fb9..00000000 --- a/backend/app/spiders/chinaz/md5.txt +++ /dev/null @@ -1 +0,0 @@ -1976593e49bf0238602ce35d051bd137 diff --git a/backend/app/spiders/chinaz/scrapy.cfg b/backend/app/spiders/chinaz/scrapy.cfg deleted file mode 100755 index d3b44a1a..00000000 --- a/backend/app/spiders/chinaz/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] 
-default = chinaz.settings - -[deploy] -#url = http://localhost:6800/ -project = chinaz diff --git a/backend/app/spiders/csdn_config/Spiderfile b/backend/app/spiders/csdn_config/Spiderfile deleted file mode 100755 index 67f4f8c5..00000000 --- a/backend/app/spiders/csdn_config/Spiderfile +++ /dev/null @@ -1,60 +0,0 @@ -name: "csdn_config" -display_name: "CSDN(可配置)" -remark: "CSDN Crawlab 文章,列表+详情+分页" -type: "configurable" -col: "results_csdn_config" -engine: scrapy -start_url: https://so.csdn.net/so/search/s.do?q=crawlab -start_stage: list -stages: -- name: list - is_list: true - list_css: .search-list-con > .search-list - list_xpath: "" - page_css: a.btn-next - page_xpath: "" - page_attr: href - fields: - - name: url - css: "" - xpath: .//*[@class="limit_width"]/a - attr: href - next_stage: detail - remark: "" -- name: detail - is_list: false - list_css: "" - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: content - css: "" - xpath: .//div[@id="content_views"] - attr: "" - next_stage: "" - remark: "" - - name: views - css: .read-count - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: title - css: .title-article - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: author - css: .follow-nickName - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - AUTOTHROTTLE_ENABLED: "false" - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/csdn_config/config_spider/__init__.py b/backend/app/spiders/csdn_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/csdn_config/config_spider/items.py b/backend/app/spiders/csdn_config/config_spider/items.py deleted file mode 100755 index 3c8e5e54..00000000 --- a/backend/app/spiders/csdn_config/config_spider/items.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - url = scrapy.Field() - content = scrapy.Field() - views = scrapy.Field() - title = scrapy.Field() - author = scrapy.Field() - diff --git a/backend/app/spiders/csdn_config/config_spider/middlewares.py b/backend/app/spiders/csdn_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/csdn_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. 
- return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/csdn_config/config_spider/pipelines.py b/backend/app/spiders/csdn_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/csdn_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/csdn_config/config_spider/settings.py b/backend/app/spiders/csdn_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/csdn_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py b/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/csdn_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/csdn_config/config_spider/spiders/spider.py b/backend/app/spiders/csdn_config/config_spider/spiders/spider.py deleted file mode 100755 index 9ecc4aae..00000000 --- a/backend/app/spiders/csdn_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,41 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://so.csdn.net/so/search/s.do?q=crawlab', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.search-list-con > .search-list'): - item = Item() - item['url'] = elem.xpath('.//*[@class="limit_width"]/a/@href').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) - next_url = response.css('a.btn-next::attr("href")').extract_first() - yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item}) - - def parse_detail(self, response): - item = Item() if response.meta.get('item') is None else response.meta.get('item') - item['content'] = response.xpath('string(.//div[@id="content_views"])').extract_first() - item['views'] = response.css('.read-count::text').extract_first() - item['title'] = response.css('.title-article::text').extract_first() - item['author'] = response.css('.follow-nickName::text').extract_first() - yield item - - diff --git a/backend/app/spiders/csdn_config/md5.txt b/backend/app/spiders/csdn_config/md5.txt deleted file mode 100755 index e169c42a..00000000 --- a/backend/app/spiders/csdn_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -b6889c74e006a5e619b525d84db62ffd diff --git a/backend/app/spiders/csdn_config/scrapy.cfg b/backend/app/spiders/csdn_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/csdn_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/douban_config/Spiderfile b/backend/app/spiders/douban_config/Spiderfile deleted file mode 100755 index 84f0647a..00000000 --- a/backend/app/spiders/douban_config/Spiderfile +++ /dev/null @@ -1,57 +0,0 @@ -name: 
"douban_config" -display_name: "豆瓣读书(可配置)" -remark: "豆瓣读书新书推荐,列表" -type: "configurable" -col: "results_douban_config" -engine: scrapy -start_url: https://book.douban.com/latest -start_stage: list -stages: -- name: list - is_list: true - list_css: ul.cover-col-4 > li - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: title - css: h2 > a - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: h2 > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: img - css: a.cover img - xpath: "" - attr: src - next_stage: "" - remark: "" - - name: rating - css: p.rating > .color-lightgray - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: abstract - css: p:last-child - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: info - css: .color-gray - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/douban_config/config_spider/__init__.py b/backend/app/spiders/douban_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/douban_config/config_spider/items.py b/backend/app/spiders/douban_config/config_spider/items.py deleted file mode 100755 index d6959b8d..00000000 --- a/backend/app/spiders/douban_config/config_spider/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - img = scrapy.Field() - rating = scrapy.Field() - abstract = scrapy.Field() - info = scrapy.Field() - diff --git a/backend/app/spiders/douban_config/config_spider/middlewares.py b/backend/app/spiders/douban_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/douban_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. 
- - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/douban_config/config_spider/pipelines.py b/backend/app/spiders/douban_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/douban_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/douban_config/config_spider/settings.py b/backend/app/spiders/douban_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/douban_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy 
settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not 
None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/douban_config/config_spider/spiders/__init__.py b/backend/app/spiders/douban_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/douban_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/douban_config/config_spider/spiders/spider.py b/backend/app/spiders/douban_config/config_spider/spiders/spider.py deleted file mode 100755 index 61bb648d..00000000 --- a/backend/app/spiders/douban_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://book.douban.com/latest', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('ul.cover-col-4 > li'): - item = Item() - item['title'] = elem.css('h2 > a::text').extract_first() - item['url'] = elem.css('h2 > a::attr("href")').extract_first() - item['img'] = elem.css('a.cover img::attr("src")').extract_first() - item['rating'] = elem.css('p.rating > .color-lightgray::text').extract_first() - item['abstract'] = elem.css('p:last-child::text').extract_first() - item['info'] = elem.css('.color-gray::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - - diff --git a/backend/app/spiders/douban_config/md5.txt b/backend/app/spiders/douban_config/md5.txt deleted file mode 100755 index 374e3804..00000000 --- a/backend/app/spiders/douban_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -4d59a6c83b0e125d5321beae86bb93ce diff --git a/backend/app/spiders/douban_config/scrapy.cfg b/backend/app/spiders/douban_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/douban_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/jd/Spiderfile b/backend/app/spiders/jd/Spiderfile deleted file mode 100755 index d090472b..00000000 --- a/backend/app/spiders/jd/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "jd" -display_name: "京东 (Scrapy)" -col: "results_jd" -type: "customized" -cmd: "scrapy crawl jd_spider" \ No newline at end of file diff --git a/backend/app/spiders/jd/jd/__init__.py b/backend/app/spiders/jd/jd/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git 
a/backend/app/spiders/jd/jd/items.py b/backend/app/spiders/jd/jd/items.py deleted file mode 100755 index b2c5e647..00000000 --- a/backend/app/spiders/jd/jd/items.py +++ /dev/null @@ -1,15 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class JdItem(scrapy.Item): - # define the fields for your item here like: - name = scrapy.Field() - price = scrapy.Field() - url = scrapy.Field() diff --git a/backend/app/spiders/jd/jd/middlewares.py b/backend/app/spiders/jd/jd/middlewares.py deleted file mode 100755 index 6fceded5..00000000 --- a/backend/app/spiders/jd/jd/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class JdSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class JdDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. 
- - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/jd/jd/pipelines.py b/backend/app/spiders/jd/jd/pipelines.py deleted file mode 100755 index 5a7d7cbf..00000000 --- a/backend/app/spiders/jd/jd/pipelines.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/jd/jd/settings.py b/backend/app/spiders/jd/jd/settings.py deleted file mode 100755 index ef89ed0c..00000000 --- a/backend/app/spiders/jd/jd/settings.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for jd project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'jd' - -SPIDER_MODULES = ['jd.spiders'] -NEWSPIDER_MODULE = 'jd.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'jd (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'jd.middlewares.JdSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'jd.middlewares.JdDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See 
https://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/jd/jd/spiders/__init__.py b/backend/app/spiders/jd/jd/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/jd/jd/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/jd/jd/spiders/jd_spider.py b/backend/app/spiders/jd/jd/spiders/jd_spider.py deleted file mode 100755 index 4ec94fa9..00000000 --- a/backend/app/spiders/jd/jd/spiders/jd_spider.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy - -from jd.items import JdItem - - -class JdSpiderSpider(scrapy.Spider): - name = 'jd_spider' - allowed_domains = ['jd.com'] - - def start_requests(self): - for i in range(1, 50): - yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}') - - def parse(self, response): - for el in response.css('.gl-item'): - yield JdItem( - url=el.css('.p-name > a::attr("href")').extract_first(), - name=el.css('.p-name > a::attr("title")').extract_first(), - price=float(el.css('.p-price i::text').extract_first()), - ) diff --git a/backend/app/spiders/jd/md5.txt b/backend/app/spiders/jd/md5.txt deleted file mode 100755 index dcd53f51..00000000 --- a/backend/app/spiders/jd/md5.txt +++ /dev/null @@ -1 +0,0 @@ -621486d31459514eb27a082d159d9b8c diff --git a/backend/app/spiders/jd/scrapy.cfg b/backend/app/spiders/jd/scrapy.cfg deleted file mode 100755 index 87cf0280..00000000 --- a/backend/app/spiders/jd/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = jd.settings - -[deploy] -#url = http://localhost:6800/ -project = jd diff --git a/backend/app/spiders/sinastock/Spiderfile b/backend/app/spiders/sinastock/Spiderfile deleted file mode 100755 index b110cb48..00000000 --- a/backend/app/spiders/sinastock/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "sinastock" -display_name: "新浪股票 (Scrapy)" -type: "customized" -col: "results_sinastock" -cmd: "scrapy crawl sinastock_spider" \ No newline at end of file diff --git a/backend/app/spiders/sinastock/md5.txt b/backend/app/spiders/sinastock/md5.txt deleted file mode 100755 index 1e5d8ab9..00000000 --- a/backend/app/spiders/sinastock/md5.txt +++ /dev/null @@ -1 +0,0 @@ -80bc091fa45ef4a85c9f1a66c81a4ed7 diff --git a/backend/app/spiders/sinastock/scrapy.cfg b/backend/app/spiders/sinastock/scrapy.cfg deleted file mode 100755 index 4969ad96..00000000 --- 
a/backend/app/spiders/sinastock/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = sinastock.settings - -[deploy] -#url = http://localhost:6800/ -project = sinastock diff --git a/backend/app/spiders/sinastock/sinastock/__init__.py b/backend/app/spiders/sinastock/sinastock/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/sinastock/sinastock/items.py b/backend/app/spiders/sinastock/sinastock/items.py deleted file mode 100755 index 6e3e5d8e..00000000 --- a/backend/app/spiders/sinastock/sinastock/items.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class NewsItem(scrapy.Item): - # define the fields for your item here like: - _id = scrapy.Field() - title = scrapy.Field() - ts_str = scrapy.Field() - ts = scrapy.Field() - url = scrapy.Field() - text = scrapy.Field() - task_id = scrapy.Field() - source = scrapy.Field() - stocks = scrapy.Field() diff --git a/backend/app/spiders/sinastock/sinastock/middlewares.py b/backend/app/spiders/sinastock/sinastock/middlewares.py deleted file mode 100755 index 912b5e57..00000000 --- a/backend/app/spiders/sinastock/sinastock/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class SinastockSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class SinastockDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. 
- - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/sinastock/sinastock/pipelines.py b/backend/app/spiders/sinastock/sinastock/pipelines.py deleted file mode 100755 index 5a7d7cbf..00000000 --- a/backend/app/spiders/sinastock/sinastock/pipelines.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/sinastock/sinastock/settings.py b/backend/app/spiders/sinastock/sinastock/settings.py deleted file mode 100755 index 3e01d3ca..00000000 --- a/backend/app/spiders/sinastock/sinastock/settings.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for sinastock project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'sinastock' - -SPIDER_MODULES = ['sinastock.spiders'] -NEWSPIDER_MODULE = 'sinastock.spiders' - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -# USER_AGENT = 'sinastock (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -# CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -# DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -# CONCURRENT_REQUESTS_PER_DOMAIN = 16 -# CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -# COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -# TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -# DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -# } - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -# SPIDER_MIDDLEWARES = { -# 'sinastock.middlewares.SinastockSpiderMiddleware': 543, -# } - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# DOWNLOADER_MIDDLEWARES = { -# 'sinastock.middlewares.SinastockDownloaderMiddleware': 543, -# } - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -# EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -# } - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -# AUTOTHROTTLE_ENABLED = True -# The initial download delay -# AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -# AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -# AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -# HTTPCACHE_ENABLED = True -# HTTPCACHE_EXPIRATION_SECS = 0 -# HTTPCACHE_DIR = 'httpcache' -# HTTPCACHE_IGNORE_HTTP_CODES = [] -# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/sinastock/sinastock/spiders/__init__.py b/backend/app/spiders/sinastock/sinastock/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/sinastock/sinastock/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
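Editorial note, not part of the patch: the pipelines removed above (config_spider/pipelines.py for douban_config, v2ex_config, xueqiu_config, etc.) all share one Crawlab result-storage pattern — read the MongoDB connection from CRAWLAB_MONGO_* environment variables, tag every item with CRAWLAB_TASK_ID, and write it to the collection named by CRAWLAB_COLLECTION. A minimal runnable sketch of that pattern follows; it assumes pymongo is installed and swaps the deprecated Collection.save() used in the deleted files for insert_one().

# Sketch of the removed Crawlab MongoDB result pipeline (illustrative, not applied by this patch).
import os
from pymongo import MongoClient

mongo = MongoClient(
    host=os.environ.get('CRAWLAB_MONGO_HOST', 'localhost'),
    port=int(os.environ.get('CRAWLAB_MONGO_PORT', 27017)),
    username=os.environ.get('CRAWLAB_MONGO_USERNAME'),
    password=os.environ.get('CRAWLAB_MONGO_PASSWORD'),
    authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE', 'admin'),
)
db = mongo[os.environ.get('CRAWLAB_MONGO_DB', 'test')]
col = db[os.environ.get('CRAWLAB_COLLECTION', 'test')]
task_id = os.environ.get('CRAWLAB_TASK_ID')

class ResultPipeline(object):
    def process_item(self, item, spider):
        item['task_id'] = task_id      # link every result back to the Crawlab task that produced it
        col.insert_one(dict(item))     # insert_one() here; the deleted files used the deprecated col.save()
        return item

Registering such a pipeline under ITEM_PIPELINES (as the deleted settings.py files do at priority 300) is what lets Crawlab associate scraped rows with the task shown in its UI.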
diff --git a/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py b/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py deleted file mode 100755 index 54daf763..00000000 --- a/backend/app/spiders/sinastock/sinastock/spiders/sinastock_spider.py +++ /dev/null @@ -1,59 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -from datetime import datetime - -import scrapy -from pymongo import MongoClient - -from sinastock.items import NewsItem - -class SinastockSpiderSpider(scrapy.Spider): - name = 'sinastock_spider' - allowed_domains = ['finance.sina.com.cn'] - mongo = MongoClient( - host=os.environ.get('MONGO_HOST') or 'localhost', - port=int(os.environ.get('MONGO_PORT') or 27017) - ) - db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test'] - col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news') - - def start_requests(self): - col = self.db['stocks'] - for s in col.find({}): - code, ex = s['ts_code'].split('.') - for i in range(10): - url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={ex.lower()}{code}&Page={i + 1}' - yield scrapy.Request( - url=url, - callback=self.parse, - meta={'ts_code': s['ts_code']} - ) - - def parse(self, response): - for a in response.css('.datelist > ul > a'): - url = a.css('a::attr("href")').extract_first() - item = NewsItem( - title=a.css('a::text').extract_first(), - url=url, - source='sina', - stocks=[response.meta['ts_code']] - ) - yield scrapy.Request( - url=url, - callback=self.parse_detail, - meta={'item': item} - ) - - def parse_detail(self, response): - item = response.meta['item'] - text = response.css('#artibody').extract_first() - pre = re.compile('>(.*?)<') - text = ''.join(pre.findall(text)) - item['text'] = text.replace('\u3000', '') - item['ts_str'] = response.css('.date::text').extract_first() - if item['text'] is None or item['ts_str'] is None: - pass - else: - item['ts'] = datetime.strptime(item['ts_str'], '%Y年%m月%d日 %H:%M') - yield item diff --git a/backend/app/spiders/v2ex_config/Spiderfile b/backend/app/spiders/v2ex_config/Spiderfile deleted file mode 100755 index bb18d40a..00000000 --- a/backend/app/spiders/v2ex_config/Spiderfile +++ /dev/null @@ -1,54 +0,0 @@ -name: "v2ex_config" -display_name: "V2ex(可配置)" -remark: "V2ex,列表+详情" -type: "configurable" -col: "results_v2ex_config" -engine: scrapy -start_url: https://v2ex.com/ -start_stage: list -stages: -- name: list - is_list: true - list_css: .cell.item - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: href - fields: - - name: title - css: a.topic-link - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: a.topic-link - xpath: "" - attr: href - next_stage: detail - remark: "" - - name: replies - css: .count_livid - xpath: "" - attr: "" - next_stage: "" - remark: "" -- name: detail - is_list: false - list_css: "" - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: content - css: "" - xpath: .//*[@class="markdown_body"] - attr: "" - next_stage: "" - remark: "" -settings: - AUTOTHROTTLE_ENABLED: "true" - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/app/spiders/v2ex_config/config_spider/__init__.py b/backend/app/spiders/v2ex_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/v2ex_config/config_spider/items.py 
b/backend/app/spiders/v2ex_config/config_spider/items.py deleted file mode 100755 index d2c01a06..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/items.py +++ /dev/null @@ -1,19 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - replies = scrapy.Field() - content = scrapy.Field() - diff --git a/backend/app/spiders/v2ex_config/config_spider/middlewares.py b/backend/app/spiders/v2ex_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. 
- - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/v2ex_config/config_spider/pipelines.py b/backend/app/spiders/v2ex_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/v2ex_config/config_spider/settings.py b/backend/app/spiders/v2ex_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py b/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py b/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py deleted file mode 100755 index 4763e040..00000000 --- a/backend/app/spiders/v2ex_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://v2ex.com/', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.cell.item'): - item = Item() - item['title'] = elem.css('a.topic-link::text').extract_first() - item['url'] = elem.css('a.topic-link::attr("href")').extract_first() - item['replies'] = elem.css('.count_livid::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item}) - - def parse_detail(self, response): - item = Item() if response.meta.get('item') is None else response.meta.get('item') - item['content'] = response.xpath('string(.//*[@class="markdown_body"])').extract_first() - yield item - - diff --git a/backend/app/spiders/v2ex_config/md5.txt b/backend/app/spiders/v2ex_config/md5.txt deleted file mode 100755 index 5d725b2c..00000000 --- a/backend/app/spiders/v2ex_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -402c0a07873ef74b9b574bc0f6b28423 diff --git a/backend/app/spiders/v2ex_config/scrapy.cfg b/backend/app/spiders/v2ex_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/v2ex_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/xueqiu/Spiderfile b/backend/app/spiders/xueqiu/Spiderfile deleted file mode 100755 index 38aa5dbe..00000000 --- a/backend/app/spiders/xueqiu/Spiderfile +++ /dev/null @@ -1,5 +0,0 @@ -name: "xueqiu" -display_name: "雪球网 (Scrapy)" -type: "customized" -col: "results_xueqiu" -cmd: "scrapy crawl xueqiu_spider" \ No newline at end of file diff --git a/backend/app/spiders/xueqiu/md5.txt b/backend/app/spiders/xueqiu/md5.txt deleted file mode 100755 index 6a9a2072..00000000 --- a/backend/app/spiders/xueqiu/md5.txt +++ /dev/null @@ -1 +0,0 
@@ -df177994199caa691d87fc0c5031326d diff --git a/backend/app/spiders/xueqiu/scrapy.cfg b/backend/app/spiders/xueqiu/scrapy.cfg deleted file mode 100755 index 2c5ce3b3..00000000 --- a/backend/app/spiders/xueqiu/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = xueqiu.settings - -[deploy] -#url = http://localhost:6800/ -project = xueqiu diff --git a/backend/app/spiders/xueqiu/xueqiu/__init__.py b/backend/app/spiders/xueqiu/xueqiu/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/xueqiu/xueqiu/items.py b/backend/app/spiders/xueqiu/xueqiu/items.py deleted file mode 100755 index 5471594d..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/items.py +++ /dev/null @@ -1,23 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class XueqiuItem(scrapy.Item): - # define the fields for your item here like: - _id = scrapy.Field() - task_id = scrapy.Field() - id = scrapy.Field() - text = scrapy.Field() - url = scrapy.Field() - target = scrapy.Field() - view_count = scrapy.Field() - mark = scrapy.Field() - created_at = scrapy.Field() - ts = scrapy.Field() - source = scrapy.Field() diff --git a/backend/app/spiders/xueqiu/xueqiu/middlewares.py b/backend/app/spiders/xueqiu/xueqiu/middlewares.py deleted file mode 100755 index f60102ce..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class XueqiuSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class XueqiuDownloaderMiddleware(object): - # Not all methods need to be defined. 
If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu/xueqiu/pipelines.py b/backend/app/spiders/xueqiu/xueqiu/pipelines.py deleted file mode 100755 index 5a7d7cbf..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/pipelines.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/backend/app/spiders/xueqiu/xueqiu/settings.py b/backend/app/spiders/xueqiu/xueqiu/settings.py deleted file mode 100755 index 1d898e2f..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/settings.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for xueqiu project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'xueqiu' - -SPIDER_MODULES = ['xueqiu.spiders'] -NEWSPIDER_MODULE = 'xueqiu.spiders' - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -# CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -# DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -# CONCURRENT_REQUESTS_PER_DOMAIN = 16 -# CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -# COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -# TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -# DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -# } - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -# SPIDER_MIDDLEWARES = { -# 'xueqiu.middlewares.XueqiuSpiderMiddleware': 543, -# } - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# DOWNLOADER_MIDDLEWARES = { -# 'xueqiu.middlewares.XueqiuDownloaderMiddleware': 543, -# } - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -# EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -# } - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'crawlab.pipelines.CrawlabMongoPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -# AUTOTHROTTLE_ENABLED = True -# The initial download delay -# AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -# AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -# AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -# HTTPCACHE_ENABLED = True -# HTTPCACHE_EXPIRATION_SECS = 0 -# HTTPCACHE_DIR = 'httpcache' -# HTTPCACHE_IGNORE_HTTP_CODES = [] -# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py b/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
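Editorial note, not part of the patch: each removed config_spider/settings.py ends with a loop that turns CRAWLAB_SETTING_* environment variables into typed Scrapy settings — "true"/"false" become booleans, digit strings become ints, JSON-looking values are parsed, everything else stays a string — and injects them via locals(). The sketch below restates that coercion logic; it collects the overrides into a dict rather than writing into locals(), purely for illustration.

# Sketch of the CRAWLAB_SETTING_* coercion used by the deleted configurable spiders.
import json
import os
import re

def coerce(value):
    # Mirror the type rules from the removed settings.py.
    if value.lower() == 'true':
        return True
    if value.lower() == 'false':
        return False
    if re.search(r'^\d+$', value):
        return int(value)
    if re.search(r'^\{.*\}$', value.strip()) or re.search(r'^\[.*\]$', value.strip()):
        return json.loads(value)
    return value

overrides = {
    name.replace('CRAWLAB_SETTING_', ''): coerce(value)
    for name, value in os.environ.items()
    if name.startswith('CRAWLAB_SETTING_')
}
# e.g. CRAWLAB_SETTING_ROBOTSTXT_OBEY=false  ->  {'ROBOTSTXT_OBEY': False}

This is how the Spiderfile "settings:" blocks shown earlier in the patch (ROBOTSTXT_OBEY, USER_AGENT, AUTOTHROTTLE_ENABLED, ...) reach the generated Scrapy project at run time.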
diff --git a/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py b/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py deleted file mode 100755 index a746e156..00000000 --- a/backend/app/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py +++ /dev/null @@ -1,46 +0,0 @@ -# -*- coding: utf-8 -*- -import json -from datetime import datetime -from time import sleep - -import scrapy - -from xueqiu.items import XueqiuItem - - -class XueqiuSpiderSpider(scrapy.Spider): - name = 'xueqiu_spider' - allowed_domains = ['xueqiu.com'] - - def start_requests(self): - return [scrapy.Request( - url='https://xueqiu.com', - callback=self.parse_home - )] - - def parse_home(self, response): - yield scrapy.Request( - url='https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=20&category=6' - ) - - def parse(self, response): - data = json.loads(response.body) - next_max_id = data.get('next_max_id') - sleep(1) - for row in data.get('list'): - d = json.loads(row.get('data')) - item = XueqiuItem( - id=d['id'], - text=d['text'], - mark=d['mark'], - url=d['target'], - created_at=d['created_at'], - ts=datetime.fromtimestamp(d['created_at'] / 1e3), - view_count=d['view_count'], - source='xueqiu' - ) - yield item - - yield scrapy.Request( - url=f'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id={next_max_id}&count=20&category=6' - ) diff --git a/backend/app/spiders/xueqiu_config/Spiderfile b/backend/app/spiders/xueqiu_config/Spiderfile deleted file mode 100755 index 0de50e9e..00000000 --- a/backend/app/spiders/xueqiu_config/Spiderfile +++ /dev/null @@ -1,39 +0,0 @@ -name: "xueqiu_config" -display_name: "雪球网(可配置)" -remark: "雪球网新闻,列表" -type: "configurable" -col: "results_xueqiu_config" -engine: scrapy -start_url: https://xueqiu.com/ -start_stage: list -stages: -- name: list - is_list: true - list_css: "" - list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")] - page_css: "" - page_xpath: "" - page_attr: "" - fields: - - name: title - css: h3 > a - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: h3 > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: abstract - css: p - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/xueqiu_config/config_spider/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/xueqiu_config/config_spider/items.py b/backend/app/spiders/xueqiu_config/config_spider/items.py deleted file mode 100755 index 9282765f..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/items.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - diff --git a/backend/app/spiders/xueqiu_config/config_spider/middlewares.py b/backend/app/spiders/xueqiu_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- 
coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/xueqiu_config/config_spider/pipelines.py b/backend/app/spiders/xueqiu_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/xueqiu_config/config_spider/settings.py b/backend/app/spiders/xueqiu_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py b/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py deleted file mode 100755 index 79d4636b..00000000 --- a/backend/app/spiders/xueqiu_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,33 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='https://xueqiu.com/', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.xpath('.//*[contains(@class, "AnonymousHome_home__timeline__item")]'): - item = Item() - item['title'] = elem.css('h3 > a::text').extract_first() - item['url'] = elem.css('h3 > a::attr("href")').extract_first() - item['abstract'] = elem.css('p::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - - diff --git a/backend/app/spiders/xueqiu_config/md5.txt b/backend/app/spiders/xueqiu_config/md5.txt deleted file mode 100755 index 39a6df77..00000000 --- a/backend/app/spiders/xueqiu_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -e3da3aacb2d290cb179a79028fbfff9c diff --git a/backend/app/spiders/xueqiu_config/scrapy.cfg b/backend/app/spiders/xueqiu_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/xueqiu_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider diff --git a/backend/app/spiders/zongheng_config/Spiderfile b/backend/app/spiders/zongheng_config/Spiderfile deleted file mode 100755 index 0163fac7..00000000 --- a/backend/app/spiders/zongheng_config/Spiderfile +++ /dev/null @@ -1,45 +0,0 @@ -name: "zongheng_config" -display_name: "纵横(可配置)" -remark: "纵横小说网,列表" -type: "configurable" -col: "results_zongheng_config" -engine: scrapy -start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1 -start_stage: list -stages: -- name: list - is_list: true - list_css: .rank_d_list - list_xpath: "" - page_css: "" - page_xpath: "" - page_attr: href - fields: - - name: title - css: .rank_d_b_name > a - xpath: "" - attr: "" - next_stage: "" - remark: "" - - name: url - css: .rank_d_b_name > a - xpath: "" - attr: href - next_stage: "" - remark: "" - - name: abstract - css: body - xpath: 
"" - attr: "" - next_stage: "" - remark: "" - - name: votes - css: .rank_d_b_ticket - xpath: "" - attr: "" - next_stage: "" - remark: "" -settings: - ROBOTSTXT_OBEY: "false" - USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, - like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/app/spiders/zongheng_config/config_spider/__init__.py b/backend/app/spiders/zongheng_config/config_spider/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/backend/app/spiders/zongheng_config/config_spider/items.py b/backend/app/spiders/zongheng_config/config_spider/items.py deleted file mode 100755 index 528c3187..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/items.py +++ /dev/null @@ -1,19 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class Item(scrapy.Item): - _id = scrapy.Field() - task_id = scrapy.Field() - ts = scrapy.Field() - title = scrapy.Field() - url = scrapy.Field() - abstract = scrapy.Field() - votes = scrapy.Field() - diff --git a/backend/app/spiders/zongheng_config/config_spider/middlewares.py b/backend/app/spiders/zongheng_config/config_spider/middlewares.py deleted file mode 100755 index e864bd0b..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ConfigSpiderSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ConfigSpiderDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. 
- s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/backend/app/spiders/zongheng_config/config_spider/pipelines.py b/backend/app/spiders/zongheng_config/config_spider/pipelines.py deleted file mode 100755 index 69af4c85..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/pipelines.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - -import os -from pymongo import MongoClient - -mongo = MongoClient( - host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost', - port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017), - username=os.environ.get('CRAWLAB_MONGO_USERNAME'), - password=os.environ.get('CRAWLAB_MONGO_PASSWORD'), - authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin' -) -db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test'] -col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test'] -task_id = os.environ.get('CRAWLAB_TASK_ID') - -class ConfigSpiderPipeline(object): - def process_item(self, item, spider): - item['task_id'] = task_id - if col is not None: - col.save(item) - return item diff --git a/backend/app/spiders/zongheng_config/config_spider/settings.py b/backend/app/spiders/zongheng_config/config_spider/settings.py deleted file mode 100755 index 4b0965f2..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/settings.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re -import json - -# Scrapy settings for config_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'Crawlab Configurable Spider' - -SPIDER_MODULES = ['config_spider.spiders'] -NEWSPIDER_MODULE = 'config_spider.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -USER_AGENT = 'Crawlab Spider' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'config_spider.middlewares.ConfigSpiderDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'config_spider.pipelines.ConfigSpiderPipeline': 300, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]: - setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '') - setting_value = os.environ.get(setting_env_name) - if setting_value.lower() == 'true': - setting_value = True - elif setting_value.lower() == 'false': - setting_value = False - elif re.search(r'^\d+$', setting_value) is not None: - setting_value = int(setting_value) - elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: - setting_value = 
json.loads(setting_value) - elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: - setting_value = json.loads(setting_value) - else: - pass - locals()[setting_name] = setting_value - diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py b/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py deleted file mode 100755 index ebd689ac..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py b/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py deleted file mode 100755 index cf1b6a08..00000000 --- a/backend/app/spiders/zongheng_config/config_spider/spiders/spider.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -import re -from config_spider.items import Item -from urllib.parse import urljoin, urlparse - -def get_real_url(response, url): - if re.search(r'^https?', url): - return url - elif re.search(r'^\/\/', url): - u = urlparse(response.url) - return u.scheme + url - return urljoin(response.url, url) - -class ConfigSpider(scrapy.Spider): - name = 'config_spider' - - def start_requests(self): - yield scrapy.Request(url='http://www.zongheng.com/rank/details.html?rt=1&d=1', callback=self.parse_list) - - def parse_list(self, response): - prev_item = response.meta.get('item') - for elem in response.css('.rank_d_list'): - item = Item() - item['title'] = elem.css('.rank_d_b_name > a::text').extract_first() - item['url'] = elem.css('.rank_d_b_name > a::attr("href")').extract_first() - item['abstract'] = elem.css('body::text').extract_first() - item['votes'] = elem.css('.rank_d_b_ticket::text').extract_first() - if prev_item is not None: - for key, value in prev_item.items(): - item[key] = value - yield item - - diff --git a/backend/app/spiders/zongheng_config/md5.txt b/backend/app/spiders/zongheng_config/md5.txt deleted file mode 100755 index 46fd3de6..00000000 --- a/backend/app/spiders/zongheng_config/md5.txt +++ /dev/null @@ -1 +0,0 @@ -82cb98a6103fb878501df81f191703ba diff --git a/backend/app/spiders/zongheng_config/scrapy.cfg b/backend/app/spiders/zongheng_config/scrapy.cfg deleted file mode 100755 index a78d91e3..00000000 --- a/backend/app/spiders/zongheng_config/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = config_spider.settings - -[deploy] -#url = http://localhost:6800/ -project = config_spider From b109662466954936f4a4c6ff5f420666ab35de47 Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 27 Apr 2020 17:18:57 +0800 Subject: [PATCH 03/11] support crawlab runtime log to ES --- backend/conf/config.yml | 2 ++ backend/main.go | 10 ++++++ backend/middlewares/es_log.go | 57 +++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 backend/middlewares/es_log.go diff --git a/backend/conf/config.yml b/backend/conf/config.yml index 1c2c8507..686865f9 100644 --- a/backend/conf/config.yml +++ b/backend/conf/config.yml @@ -40,6 +40,8 @@ other: tmppath: "/tmp" version: 0.4.10 setting: + crawlabLogToES: "N" # Send crawlab runtime log to ES, open this option "Y", remember to set esClient + 
crawlabLogIndex: "crawlab-log" allowRegister: "N" enableTutorial: "N" runOnMaster: "Y" diff --git a/backend/main.go b/backend/main.go index 6ab022f4..7ca18335 100644 --- a/backend/main.go +++ b/backend/main.go @@ -14,6 +14,7 @@ import ( "github.com/apex/log" "github.com/gin-gonic/gin" "github.com/gin-gonic/gin/binding" + "github.com/olivere/elastic/v7" "github.com/spf13/viper" "net" "net/http" @@ -133,6 +134,15 @@ func main() { // 以下为主节点服务 if model.IsMaster() { // 中间件 + esClientStr := viper.GetString("setting.esClient") + if viper.GetString("setting.crawlabLogToES") == "Y" && esClientStr != "" { + ctx := context.Background() + esClient, err := elastic.NewClient(elastic.SetURL(esClientStr), elastic.SetSniff(false)) + if err != nil { + log.Error("Init es client Error:" + err.Error()) + } + app.Use(middlewares.EsLog(ctx, esClient)) + } app.Use(middlewares.CORSMiddleware()) anonymousGroup := app.Group("/") { diff --git a/backend/middlewares/es_log.go b/backend/middlewares/es_log.go new file mode 100644 index 00000000..c119816a --- /dev/null +++ b/backend/middlewares/es_log.go @@ -0,0 +1,57 @@ +package middlewares + +import ( + "bytes" + "context" + "fmt" + "github.com/gin-gonic/gin" + "github.com/olivere/elastic/v7" + "github.com/satori/go.uuid" + "github.com/spf13/viper" + "strconv" + "time" +) + +func EsLog(ctx context.Context, esClient *elastic.Client) gin.HandlerFunc { + + return func(c *gin.Context) { + // 开始时间 + crawlabIndex := viper.GetString("setting.crawlabLogIndex") + sig := make(chan struct{}, 1) + sig <- struct{}{} + start := time.Now() + // 处理请求 + c.Next() + // 结束时间 + end := time.Now() + //执行时间 + latency := strconv.FormatInt(int64(end.Sub(start).Milliseconds()), 10) + path := c.Request.URL.Path + + clientIP := c.ClientIP() + method := c.Request.Method + statusCode := strconv.Itoa(c.Writer.Status()) + buf := new(bytes.Buffer) + buf.ReadFrom(c.Request.Body) + b := buf.String() + accessLog := "costTime:" + latency + "ms--" + "StatusCode:" + statusCode + "--" + "Method:" + method + "--" + "ClientIp:" + clientIP + "--" + + "RequestURI:" + path + "--" + "Host:" + c.Request.Host + "--" + "UserAgent--" + c.Request.UserAgent() + "--RequestBody:" + + string(b) + WriteMsg(ctx, crawlabIndex, esClient, time.Now(), accessLog, sig) + } + +} + +// WriteMsg will write the msg and level into es +func WriteMsg(ctx context.Context, crawlabIndex string, es *elastic.Client, when time.Time, msg string, sig chan struct{}) error { + <-sig + vals := make(map[string]interface{}) + vals["@timestamp"] = when.Format(time.RFC3339) + vals["@msg"] = msg + uid := uuid.NewV4().String() + _, err := es.Index().Index(crawlabIndex).Id(uid).BodyJson(vals).Refresh("wait_for").Do(ctx) + if err != nil { + fmt.Println(err) + } + return err +} From ef82c8aab8566257041b88e805094cfc6bfea8c2 Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 27 Apr 2020 19:24:10 +0800 Subject: [PATCH 04/11] bugfix --- backend/middlewares/es_log.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/backend/middlewares/es_log.go b/backend/middlewares/es_log.go index c119816a..1dff344e 100644 --- a/backend/middlewares/es_log.go +++ b/backend/middlewares/es_log.go @@ -17,8 +17,6 @@ func EsLog(ctx context.Context, esClient *elastic.Client) gin.HandlerFunc { return func(c *gin.Context) { // 开始时间 crawlabIndex := viper.GetString("setting.crawlabLogIndex") - sig := make(chan struct{}, 1) - sig <- struct{}{} start := time.Now() // 处理请求 c.Next() @@ -37,14 +35,13 @@ func EsLog(ctx context.Context, esClient *elastic.Client) 
gin.HandlerFunc { accessLog := "costTime:" + latency + "ms--" + "StatusCode:" + statusCode + "--" + "Method:" + method + "--" + "ClientIp:" + clientIP + "--" + "RequestURI:" + path + "--" + "Host:" + c.Request.Host + "--" + "UserAgent--" + c.Request.UserAgent() + "--RequestBody:" + string(b) - WriteMsg(ctx, crawlabIndex, esClient, time.Now(), accessLog, sig) + WriteMsg(ctx, crawlabIndex, esClient, time.Now(), accessLog) } } // WriteMsg will write the msg and level into es -func WriteMsg(ctx context.Context, crawlabIndex string, es *elastic.Client, when time.Time, msg string, sig chan struct{}) error { - <-sig +func WriteMsg(ctx context.Context, crawlabIndex string, es *elastic.Client, when time.Time, msg string) error { vals := make(map[string]interface{}) vals["@timestamp"] = when.Format(time.RFC3339) vals["@msg"] = msg From 872dc6660f8c7cfc8e43a9f4cc907875bf3cc363 Mon Sep 17 00:00:00 2001 From: hantmac Date: Tue, 28 Apr 2020 20:41:42 +0800 Subject: [PATCH 05/11] support custom node name --- backend/conf/config.yml | 3 +- backend/constants/register.go | 1 + backend/model/node.go | 2 +- backend/services/register/register.go | 69 +++++++++++++++++++++++++-- 4 files changed, 70 insertions(+), 5 deletions(-) diff --git a/backend/conf/config.yml b/backend/conf/config.yml index 686865f9..72e6e4ec 100644 --- a/backend/conf/config.yml +++ b/backend/conf/config.yml @@ -23,8 +23,9 @@ server: master: "Y" secret: "crawlab" register: - # mac地址/ip地址/hostname, 如果是ip,则需要手动指定IP + # type 填 mac/ip/customName, 如果是ip,则需要手动指定IP, 如果是 customName, 需填写你的 customNodeName type: "mac" + customNodeName: "" # 自定义节点名称, default node1,只有在type = customName 时生效 ip: "" lang: # 安装语言环境, Y 为安装,N 为不安装 python: "Y" diff --git a/backend/constants/register.go b/backend/constants/register.go index ad38e7a3..4ed1e396 100644 --- a/backend/constants/register.go +++ b/backend/constants/register.go @@ -4,4 +4,5 @@ const ( RegisterTypeMac = "mac" RegisterTypeIp = "ip" RegisterTypeHostname = "hostname" + RegisterTypeCustomName = "customName" ) diff --git a/backend/model/node.go b/backend/model/node.go index 4d299f51..715be30f 100644 --- a/backend/model/node.go +++ b/backend/model/node.go @@ -80,7 +80,7 @@ func GetCurrentNode() (Node, error) { Key: key, Id: bson.NewObjectId(), Ip: ip, - Name: ip, + Name: key, Mac: mac, Hostname: hostname, IsMaster: true, diff --git a/backend/services/register/register.go b/backend/services/register/register.go index 98e70707..9eedf0e7 100644 --- a/backend/services/register/register.go +++ b/backend/services/register/register.go @@ -25,6 +25,7 @@ type Register interface { GetMac() (string, error) // 注册节点的Hostname GetHostname() (string, error) + GetCustomName() (string, error) } // ===================== mac 地址注册 ===================== @@ -50,11 +51,50 @@ func (mac *MacRegister) GetHostname() (string, error) { return getHostname() } +func (mac *MacRegister) GetCustomName() (string, error) { + return getMac() +} + // ===================== ip 地址注册 ===================== type IpRegister struct { Ip string } +func (ip *IpRegister) GetCustomName() (string, error) { + return ip.Ip, nil +} + +// ============= 自定义节点名称注册 ============== +type CustomNameRegister struct { + CustomName string +} + +func (c *CustomNameRegister) GetType() string { + return "customName" +} + +func (c *CustomNameRegister) GetIp() (string, error) { + return getIp() +} + +func (c *CustomNameRegister) GetMac() (string, error) { + return getMac() +} + +func (c *CustomNameRegister) GetKey() (string, error) { + return c.CustomName, nil +} + +func (c 
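
The two patches above add an optional Elasticsearch sink for Crawlab's own access log: when setting.crawlabLogToES is "Y" and setting.esClient holds an Elasticsearch URL, main.go builds an olivere/elastic client (sniffing disabled) and installs the EsLog middleware, which joins latency, status code, method, client IP, request URI, host, user agent and request body into one string and indexes it through WriteMsg as "@timestamp" and "@msg" fields; the follow-up bugfix simply drops the unused signalling channel. Below is a minimal sketch, not part of the patches, of wiring the same client and reading recent entries back; the localhost URL and the "crawlab-log" index name are assumptions standing in for the setting.esClient and setting.crawlabLogIndex values.

package main

import (
	"context"
	"fmt"

	"github.com/gin-gonic/gin"
	"github.com/olivere/elastic/v7"
)

func main() {
	ctx := context.Background()

	// Same client options as the main.go hunk above: a fixed URL and sniffing
	// disabled, so a single node behind Docker/NAT stays reachable.
	esClient, err := elastic.NewClient(
		elastic.SetURL("http://localhost:9200"), // assumption; normally setting.esClient
		elastic.SetSniff(false),
	)
	if err != nil {
		panic(err)
	}

	app := gin.Default()
	// In Crawlab this is app.Use(middlewares.EsLog(ctx, esClient)), guarded by
	// setting.crawlabLogToES == "Y".
	_ = app

	// Read back the ten most recent access-log documents written by WriteMsg.
	res, err := esClient.Search().
		Index("crawlab-log"). // assumption; normally setting.crawlabLogIndex
		Sort("@timestamp", false).
		Size(10).
		Do(ctx)
	if err != nil {
		panic(err)
	}
	for _, hit := range res.Hits.Hits {
		fmt.Println(string(hit.Source))
	}
}

Worth noting: WriteMsg indexes with Refresh("wait_for"), so each request blocks until its log document is searchable, which keeps a query like the one above immediately consistent at the cost of some per-request latency.
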
*CustomNameRegister) GetHostname() (string, error) { + + return getHostname() +} + +func (c *CustomNameRegister) GetCustomName() (string, error) { + return c.CustomName, nil +} + +// ============================================================ func (ip *IpRegister) GetType() string { return "ip" } @@ -98,6 +138,10 @@ func (h *HostnameRegister) GetHostname() (string, error) { return getHostname() } +func (h *HostnameRegister) GetCustomName() (string, error) { + return getHostname() +} + // ===================== 公共方法 ===================== // 获取本机的IP地址 // TODO: 考虑多个IP地址的情况 @@ -158,9 +202,14 @@ var once sync.Once func GetRegister() Register { once.Do(func() { registerType := viper.GetString("server.register.type") - if registerType == constants.RegisterTypeMac { + + switch registerType { + case constants.RegisterTypeMac: + register = &MacRegister{} - } else if registerType == constants.RegisterTypeIp { + + case constants.RegisterTypeIp: + ip := viper.GetString("server.register.ip") if ip == "" { log.Error("server.register.ip is empty") @@ -170,8 +219,22 @@ func GetRegister() Register { register = &IpRegister{ Ip: ip, } - } else if registerType == constants.RegisterTypeHostname { + + case constants.RegisterTypeHostname: + register = &HostnameRegister{} + + case constants.RegisterTypeCustomName: + + customNodeName := viper.GetString("server.register.customNodeName") + if customNodeName == "" { + log.Error("server.register.customNodeName is empty") + debug.PrintStack() + register = nil + } + register = &CustomNameRegister{ + CustomName: customNodeName, + } } log.Info("register type is :" + reflect.TypeOf(register).String()) From f59ca2b6055aaa9a82d1ea4fbb5201e5cabd9b2e Mon Sep 17 00:00:00 2001 From: hantmac Date: Fri, 1 May 2020 23:11:51 +0800 Subject: [PATCH 06/11] Add swagger docs --- backend/docs/docs.go | 2816 ++++++++++++++++++++++++++++++++++++ backend/docs/swagger.json | 2748 +++++++++++++++++++++++++++++++++++ backend/docs/swagger.yaml | 1848 +++++++++++++++++++++++ backend/go.mod | 3 + backend/go.sum | 40 + backend/main.go | 11 + backend/routes/node.go | 55 + backend/routes/project.go | 47 + backend/routes/schedule.go | 67 + backend/routes/setting.go | 16 + backend/routes/spider.go | 326 +++++ 11 files changed, 7977 insertions(+) create mode 100644 backend/docs/docs.go create mode 100644 backend/docs/swagger.json create mode 100644 backend/docs/swagger.yaml diff --git a/backend/docs/docs.go b/backend/docs/docs.go new file mode 100644 index 00000000..bd40f33a --- /dev/null +++ b/backend/docs/docs.go @@ -0,0 +1,2816 @@ +// GENERATED BY THE COMMAND ABOVE; DO NOT EDIT +// This file was generated by swaggo/swag at +// 2020-05-01 23:10:59.173446 +0800 CST m=+0.074737526 + +package docs + +import ( + "bytes" + "encoding/json" + "strings" + + "github.com/alecthomas/template" + "github.com/swaggo/swag" +) + +var doc = `{ + "schemes": {{ marshal .Schemes }}, + "swagger": "2.0", + "info": { + "description": "{{.Description}}", + "title": "{{.Title}}", + "contact": {}, + "license": {}, + "version": "{{.Version}}" + }, + "host": "{{.Host}}", + "basePath": "{{.BasePath}}", + "paths": { + "/nodes": { + "get": { + "description": "Get nodes", + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Get nodes", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + 
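
The custom node name patch above adds a fourth register type: besides mac, ip and hostname, server.register.type may now be customName, in which case server.register.customNodeName (per the translated config comment: required when type is customName, while type ip still needs the IP filled in manually) is used as the node key, and GetCurrentNode now stores that key as the node Name instead of the IP. Below is a minimal sketch, not part of the patch, of resolving the key the same way GetRegister does; the config file path and the placeholder MAC value are assumptions.

package main

import (
	"fmt"
	"os"

	"github.com/spf13/viper"
)

func main() {
	// Config path is an assumption; Crawlab reads the same keys through viper.
	viper.SetConfigFile("conf/config.yml")
	if err := viper.ReadInConfig(); err != nil {
		fmt.Fprintln(os.Stderr, "read config:", err)
		os.Exit(1)
	}

	var key string
	switch viper.GetString("server.register.type") {
	case "mac":
		key = "<mac address>" // placeholder; resolved by getMac() in register.go
	case "ip":
		key = viper.GetString("server.register.ip") // must be filled in manually
	case "hostname":
		key, _ = os.Hostname()
	case "customName":
		// New in this patch: the literal customNodeName value becomes the key.
		key = viper.GetString("server.register.customNodeName")
	default:
		key = "<unknown register type>"
	}
	fmt.Println("node key:", key)
}

With the model change above (Name: key rather than Name: ip), the resolved key is also the name stored on the node record, so a node registered with type customName is listed under exactly the configured customNodeName.
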
"description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}": { + "get": { + "description": "Get node", + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Get node", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Post node", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Post node", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "post node", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete node", + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Delete node", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/system": { + "get": { + "description": "Get system info", + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Get system info", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/tasks": { + "get": { + "description": "Get tasks on node", + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Get tasks on node", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/projects": { + "get": { + "description": "Get projects", + "produces": [ + "application/json" + ], + "tags": [ + "project" + ], + "summary": "Get projects", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "projects", + "name": "tag", + 
"in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "Put project", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "project" + ], + "summary": "Put project", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "post project", + "name": "p", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Project" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/projects/tags": { + "get": { + "description": "Get projects tags", + "produces": [ + "application/json" + ], + "tags": [ + "project" + ], + "summary": "Get project tags", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/projects/{id}": { + "post": { + "description": "Post project", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "project" + ], + "summary": "Post project", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "project id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "project item", + "name": "item", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Project" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete project", + "produces": [ + "application/json" + ], + "tags": [ + "project" + ], + "summary": "Delete project", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "project id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/schedules": { + "get": { + "description": "Get schedule list", + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "Get schedule list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "Put schedule", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + 
"schedule" + ], + "summary": "Put schedule", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "schedule item", + "name": "item", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Schedule" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/schedules/{id}": { + "get": { + "description": "Get schedule by id", + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "Get schedule by id", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Post schedule", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "Post schedule", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "schedule item", + "name": "newItem", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Schedule" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete schedule", + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "Delete schedule", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/schedules/{id}/disable": { + "post": { + "description": "disable schedule", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "disable schedule", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/schedules/{id}/enable": { + "post": { + "description": "enable schedule", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "enable schedule", + 
"parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/setting": { + "get": { + "description": "Get setting", + "produces": [ + "application/json" + ], + "tags": [ + "setting" + ], + "summary": "Get setting", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders": { + "put": { + "description": "Put spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Put spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "spider item", + "name": "spider", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Spider" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "delete spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "delete spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders-cancel": { + "post": { + "description": "cancel spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "cancel spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders-run": { + "post": { + "description": "run spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "run spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}": { + "get": { + "description": "Get spider by id", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider by id", + "parameters": [ + { + "type": "string", + 
"description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Post spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Post spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "spider item", + "name": "item", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Spider" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete spider by id", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Delete spider by id", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/copy": { + "post": { + "description": "Copy spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Copy spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/dir": { + "get": { + "description": "Get spider dir", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider dir", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "path", + "name": "path", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/file": { + "get": { + "description": "Get spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": 
true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "path", + "name": "path", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "Post spider dir", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Post spider dir", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "path", + "name": "reqBody", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.SpiderFileReqBody" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Put spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Put spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "path", + "name": "reqBody", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.SpiderFileReqBody" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Delete spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "path", + "name": "reqBody", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.SpiderFileReqBody" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/file/rename": { + "post": { + "description": "Rename spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Rename spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "path", + "name": "reqBody", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.SpiderFileReqBody" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" 
+ } + } + } + } + }, + "/spiders/{id}/file/tree": { + "get": { + "description": "Get spider dir", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider dir", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/git/reset": { + "post": { + "description": "Post spider reset git", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Post spider reset git", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/git/sync": { + "post": { + "description": "Post spider sync git", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Post spider sync git", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/publish": { + "post": { + "description": "Publish spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Publish spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/schedules": { + "get": { + "description": "Get schedules", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get schedules", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/scrapy/items": { + "get": { + "description": "Get scrapy spider items", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider items", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + 
"in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Post scrapy spider items", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Post scrapy spider items", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "req data", + "name": "reqData", + "in": "body", + "required": true, + "schema": { + "type": "entity.ScrapyItem", + "items": { + "$ref": "#/definitions/entity.ScrapyItem" + } + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/scrapy/pipelines": { + "get": { + "description": "Get scrapy spider pipelines", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider pipelines", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/scrapy/settings": { + "get": { + "description": "Get scrapy spider settings", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider settings", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Get scrapy spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "req data", + "name": "reqData", + "in": "body", + "required": true, + "schema": { + "type": "entity.ScrapySettingParam", + "items": { + "$ref": "#/definitions/entity.ScrapySettingParam" + } + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/scrapy/spider/filepath": { + "get": { + "description": "Get scrapy spider file path", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider file path", + "parameters": [ + { + 
"type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/scrapy/spiders": { + "get": { + "description": "Get scrapy spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "Put scrapy spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Put scrapy spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/stats": { + "get": { + "description": "Get spider stats", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider stats", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/tasks": { + "get": { + "description": "Get task list", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get task list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/upload": { + "post": { + "description": "Upload spider by id", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Upload spider by id", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "file", + "description": "spider file to upload", + "name": "file", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + 
], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/version": { + "get": { + "description": "Get version", + "produces": [ + "application/json" + ], + "tags": [ + "setting" + ], + "summary": "Get version", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + } + }, + "definitions": { + "entity.ConfigSpiderData": { + "type": "object", + "properties": { + "cmd": { + "description": "自定义爬虫", + "type": "string" + }, + "col": { + "type": "string" + }, + "display_name": { + "type": "string" + }, + "engine": { + "description": "可配置爬虫", + "type": "string" + }, + "name": { + "description": "通用", + "type": "string" + }, + "remark": { + "type": "string" + }, + "settings": { + "type": "object" + }, + "stages": { + "type": "array", + "items": { + "$ref": "#/definitions/entity.Stage" + } + }, + "start_stage": { + "type": "string" + }, + "start_url": { + "type": "string" + }, + "type": { + "type": "string" + } + } + }, + "entity.Field": { + "type": "object", + "properties": { + "attr": { + "type": "string" + }, + "css": { + "type": "string" + }, + "name": { + "type": "string" + }, + "next_stage": { + "type": "string" + }, + "remark": { + "type": "string" + }, + "xpath": { + "type": "string" + } + } + }, + "entity.ScrapyItem": { + "type": "object", + "properties": { + "fields": { + "type": "array", + "items": { + "type": "string" + } + }, + "name": { + "type": "string" + } + } + }, + "entity.ScrapySettingParam": { + "type": "object", + "properties": { + "key": { + "type": "string" + }, + "type": { + "type": "string" + }, + "value": { + "type": "object" + } + } + }, + "entity.Stage": { + "type": "object", + "properties": { + "fields": { + "type": "array", + "items": { + "$ref": "#/definitions/entity.Field" + } + }, + "is_list": { + "type": "boolean" + }, + "list_css": { + "type": "string" + }, + "list_xpath": { + "type": "string" + }, + "name": { + "type": "string" + }, + "page_attr": { + "type": "string" + }, + "page_css": { + "type": "string" + }, + "page_xpath": { + "type": "string" + } + } + }, + "model.Env": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "value": { + "type": "string" + } + } + }, + "model.Node": { + "type": "object", + "properties": { + "_id": { + "type": "string" + }, + "create_ts": { + "type": "string" + }, + "description": { + "type": "string" + }, + "hostname": { + "type": "string" + }, + "ip": { + "type": "string" + }, + "is_master": { + "description": "前端展示", + "type": "boolean" + }, + "key": { + "description": "用于唯一标识节点,可能是mac地址,可能是ip地址", + "type": "string" + }, + "mac": { + "type": "string" + }, + "name": { + "type": "string" + }, + "port": { + "type": "string" + }, + "status": { + "type": "string" + }, + "update_ts": { + "type": "string" + }, + "update_ts_unix": { + "type": "integer" + } + } + }, + "model.Project": { + "type": "object", + "properties": { + "_id": { + "type": "string" + }, + "create_ts": { + "type": "string" + }, + "description": { + "type": "string" + }, + "name": { + "type": "string" + }, + "spiders": { + "description": "前端展示", + "type": "array", + "items": { + "$ref": "#/definitions/model.Spider" + } 
+ }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "update_ts": { + "type": "string" + }, + "user_id": { + "type": "string" + }, + "username": { + "type": "string" + } + } + }, + "model.Schedule": { + "type": "object", + "properties": { + "_id": { + "type": "string" + }, + "create_ts": { + "type": "string" + }, + "cron": { + "type": "string" + }, + "description": { + "type": "string" + }, + "enabled": { + "type": "boolean" + }, + "entry_id": { + "type": "integer" + }, + "message": { + "type": "string" + }, + "name": { + "type": "string" + }, + "node_ids": { + "type": "array", + "items": { + "type": "string" + } + }, + "nodes": { + "type": "array", + "items": { + "$ref": "#/definitions/model.Node" + } + }, + "param": { + "type": "string" + }, + "run_type": { + "type": "string" + }, + "scrapy_log_level": { + "type": "string" + }, + "scrapy_spider": { + "type": "string" + }, + "spider_id": { + "type": "string" + }, + "spider_name": { + "description": "前端展示", + "type": "string" + }, + "status": { + "type": "string" + }, + "update_ts": { + "type": "string" + }, + "user_id": { + "type": "string" + }, + "user_name": { + "type": "string" + } + } + }, + "model.Spider": { + "type": "object", + "properties": { + "_id": { + "description": "爬虫ID", + "type": "string" + }, + "cmd": { + "description": "自定义爬虫", + "type": "string" + }, + "col": { + "description": "结果储存位置", + "type": "string" + }, + "config": { + "description": "可配置爬虫配置", + "type": "object", + "$ref": "#/definitions/entity.ConfigSpiderData" + }, + "create_ts": { + "type": "string" + }, + "dedup_field": { + "description": "去重字段", + "type": "string" + }, + "dedup_method": { + "description": "去重方式", + "type": "string" + }, + "display_name": { + "description": "爬虫显示名称", + "type": "string" + }, + "envs": { + "description": "环境变量", + "type": "array", + "items": { + "$ref": "#/definitions/model.Env" + } + }, + "file_id": { + "description": "GridFS文件ID", + "type": "string" + }, + "git_auto_sync": { + "description": "Git 是否自动同步", + "type": "boolean" + }, + "git_branch": { + "description": "Git 分支", + "type": "string" + }, + "git_has_credential": { + "description": "Git 是否加密", + "type": "boolean" + }, + "git_password": { + "description": "Git 密码", + "type": "string" + }, + "git_sync_error": { + "description": "Git 同步错误", + "type": "string" + }, + "git_sync_frequency": { + "description": "Git 同步频率", + "type": "string" + }, + "git_url": { + "description": "Git URL", + "type": "string" + }, + "git_username": { + "description": "Git 用户名", + "type": "string" + }, + "is_dedup": { + "description": "去重", + "type": "boolean" + }, + "is_git": { + "description": "Git 设置", + "type": "boolean" + }, + "is_long_task": { + "description": "长任务", + "type": "boolean" + }, + "is_public": { + "description": "是否公开", + "type": "boolean" + }, + "is_scrapy": { + "description": "Scrapy 爬虫(属于自定义爬虫)", + "type": "boolean" + }, + "is_web_hook": { + "description": "Web Hook", + "type": "boolean" + }, + "last_run_ts": { + "description": "前端展示", + "type": "string" + }, + "last_status": { + "description": "最后执行状态", + "type": "string" + }, + "latest_tasks": { + "description": "最近任务列表", + "type": "array", + "items": { + "$ref": "#/definitions/model.Task" + } + }, + "name": { + "description": "爬虫名称(唯一)", + "type": "string" + }, + "project_id": { + "description": "项目ID", + "type": "string" + }, + "remark": { + "description": "备注", + "type": "string" + }, + "site": { + "description": "爬虫网站", + "type": "string" + }, + "spider_names": { + "description": "爬虫名称列表", 
+ "type": "array", + "items": { + "type": "string" + } + }, + "src": { + "description": "源码位置", + "type": "string" + }, + "template": { + "description": "可配置爬虫", + "type": "string" + }, + "type": { + "description": "爬虫类别", + "type": "string" + }, + "update_ts": { + "type": "string" + }, + "user_id": { + "description": "时间", + "type": "string" + }, + "username": { + "description": "用户名称", + "type": "string" + }, + "web_hook_url": { + "description": "Web Hook URL", + "type": "string" + } + } + }, + "model.Task": { + "type": "object", + "properties": { + "_id": { + "type": "string" + }, + "cmd": { + "type": "string" + }, + "create_ts": { + "type": "string" + }, + "error": { + "type": "string" + }, + "error_log_count": { + "type": "integer" + }, + "finish_ts": { + "type": "string" + }, + "log_path": { + "type": "string" + }, + "node_id": { + "type": "string" + }, + "node_name": { + "type": "string" + }, + "param": { + "type": "string" + }, + "pid": { + "type": "integer" + }, + "result_count": { + "type": "integer" + }, + "run_type": { + "type": "string" + }, + "runtime_duration": { + "type": "number" + }, + "schedule_id": { + "type": "string" + }, + "spider_id": { + "type": "string" + }, + "spider_name": { + "description": "前端数据", + "type": "string" + }, + "start_ts": { + "type": "string" + }, + "status": { + "type": "string" + }, + "total_duration": { + "type": "number" + }, + "update_ts": { + "type": "string" + }, + "user_id": { + "type": "string" + }, + "username": { + "type": "string" + }, + "wait_duration": { + "type": "number" + } + } + }, + "routes.SpiderFileReqBody": { + "type": "object", + "properties": { + "content": { + "type": "string" + }, + "new_path": { + "type": "string" + }, + "path": { + "type": "string" + } + } + } + } +}` + +type swaggerInfo struct { + Version string + Host string + BasePath string + Schemes []string + Title string + Description string +} + +// SwaggerInfo holds exported Swagger Info so clients can modify it +var SwaggerInfo = swaggerInfo{ + Version: "", + Host: "", + BasePath: "", + Schemes: []string{}, + Title: "", + Description: "", +} + +type s struct{} + +func (s *s) ReadDoc() string { + sInfo := SwaggerInfo + sInfo.Description = strings.Replace(sInfo.Description, "\n", "\\n", -1) + + t, err := template.New("swagger_info").Funcs(template.FuncMap{ + "marshal": func(v interface{}) string { + a, _ := json.Marshal(v) + return string(a) + }, + }).Parse(doc) + if err != nil { + return doc + } + + var tpl bytes.Buffer + if err := t.Execute(&tpl, sInfo); err != nil { + return doc + } + + return tpl.String() +} + +func init() { + swag.Register(swag.Name, &s{}) +} diff --git a/backend/docs/swagger.json b/backend/docs/swagger.json new file mode 100644 index 00000000..8ed42bcd --- /dev/null +++ b/backend/docs/swagger.json @@ -0,0 +1,2748 @@ +{ + "swagger": "2.0", + "info": { + "contact": {}, + "license": {} + }, + "paths": { + "/nodes": { + "get": { + "description": "Get nodes", + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Get nodes", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}": { + "get": { + "description": "Get node", + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Get node", + 
"parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Post node", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Post node", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "post node", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete node", + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Delete node", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/system": { + "get": { + "description": "Get system info", + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Get system info", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/tasks": { + "get": { + "description": "Get tasks on node", + "produces": [ + "application/json" + ], + "tags": [ + "node" + ], + "summary": "Get tasks on node", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/projects": { + "get": { + "description": "Get projects", + "produces": [ + "application/json" + ], + "tags": [ + "project" + ], + "summary": "Get projects", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "projects", + "name": "tag", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { 
+ "description": "Put project", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "project" + ], + "summary": "Put project", + "parameters": [ + { + "type": "string", + "description": "With the bearer started", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "post project", + "name": "p", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Project" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/projects/tags": { + "get": { + "description": "Get projects tags", + "produces": [ + "application/json" + ], + "tags": [ + "project" + ], + "summary": "Get project tags", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/projects/{id}": { + "post": { + "description": "Post project", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "project" + ], + "summary": "Post project", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "project id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "project item", + "name": "item", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Project" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete project", + "produces": [ + "application/json" + ], + "tags": [ + "project" + ], + "summary": "Delete project", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "project id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/schedules": { + "get": { + "description": "Get schedule list", + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "Get schedule list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "Put schedule", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "Put schedule", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "schedule item", + "name": 
"item", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Schedule" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/schedules/{id}": { + "get": { + "description": "Get schedule by id", + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "Get schedule by id", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Post schedule", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "Post schedule", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "schedule item", + "name": "newItem", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Schedule" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete schedule", + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "Delete schedule", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/schedules/{id}/disable": { + "post": { + "description": "disable schedule", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "disable schedule", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/schedules/{id}/enable": { + "post": { + "description": "enable schedule", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "schedule" + ], + "summary": "enable schedule", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + 
"required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/setting": { + "get": { + "description": "Get setting", + "produces": [ + "application/json" + ], + "tags": [ + "setting" + ], + "summary": "Get setting", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders": { + "put": { + "description": "Put spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Put spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "spider item", + "name": "spider", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Spider" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "delete spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "delete spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders-cancel": { + "post": { + "description": "cancel spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "cancel spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders-run": { + "post": { + "description": "run spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "run spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}": { + "get": { + "description": "Get spider by id", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider by id", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + 
"200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Post spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Post spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "spider item", + "name": "item", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Spider" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete spider by id", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Delete spider by id", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/copy": { + "post": { + "description": "Copy spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Copy spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/dir": { + "get": { + "description": "Get spider dir", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider dir", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "path", + "name": "path", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/file": { + "get": { + "description": "Get spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "path", + "name": "path", + "in": "query", + "required": true + } + ], + 
"responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "Post spider dir", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Post spider dir", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "path", + "name": "reqBody", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.SpiderFileReqBody" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Put spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Put spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "path", + "name": "reqBody", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.SpiderFileReqBody" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Delete spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "path", + "name": "reqBody", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.SpiderFileReqBody" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/file/rename": { + "post": { + "description": "Rename spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Rename spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "path", + "name": "reqBody", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.SpiderFileReqBody" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/file/tree": { + "get": { + "description": "Get spider dir", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider dir", + "parameters": [ + { + "type": 
"string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/git/reset": { + "post": { + "description": "Post spider reset git", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Post spider reset git", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/git/sync": { + "post": { + "description": "Post spider sync git", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Post spider sync git", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/publish": { + "post": { + "description": "Publish spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Publish spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "schedule id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/schedules": { + "get": { + "description": "Get schedules", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get schedules", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/scrapy/items": { + "get": { + "description": "Get scrapy spider items", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider items", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": 
"json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Post scrapy spider items", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Post scrapy spider items", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "req data", + "name": "reqData", + "in": "body", + "required": true, + "schema": { + "type": "entity.ScrapyItem", + "items": { + "$ref": "#/definitions/entity.ScrapyItem" + } + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/scrapy/pipelines": { + "get": { + "description": "Get scrapy spider pipelines", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider pipelines", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/scrapy/settings": { + "get": { + "description": "Get scrapy spider settings", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider settings", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Get scrapy spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "req data", + "name": "reqData", + "in": "body", + "required": true, + "schema": { + "type": "entity.ScrapySettingParam", + "items": { + "$ref": "#/definitions/entity.ScrapySettingParam" + } + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/scrapy/spider/filepath": { + "get": { + "description": "Get scrapy spider file path", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider file path", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": 
true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/scrapy/spiders": { + "get": { + "description": "Get scrapy spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get scrapy spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "Put scrapy spider file", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Put scrapy spider file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/stats": { + "get": { + "description": "Get spider stats", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider stats", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/tasks": { + "get": { + "description": "Get task list", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get task list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/spiders/{id}/upload": { + "post": { + "description": "Upload spider by id", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Upload spider by id", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "file", + "description": "spider file to upload", + "name": "file", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/version": { + "get": { 
+ "description": "Get version", + "produces": [ + "application/json" + ], + "tags": [ + "setting" + ], + "summary": "Get version", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + } + }, + "definitions": { + "entity.ConfigSpiderData": { + "type": "object", + "properties": { + "cmd": { + "description": "自定义爬虫", + "type": "string" + }, + "col": { + "type": "string" + }, + "display_name": { + "type": "string" + }, + "engine": { + "description": "可配置爬虫", + "type": "string" + }, + "name": { + "description": "通用", + "type": "string" + }, + "remark": { + "type": "string" + }, + "settings": { + "type": "object" + }, + "stages": { + "type": "array", + "items": { + "$ref": "#/definitions/entity.Stage" + } + }, + "start_stage": { + "type": "string" + }, + "start_url": { + "type": "string" + }, + "type": { + "type": "string" + } + } + }, + "entity.Field": { + "type": "object", + "properties": { + "attr": { + "type": "string" + }, + "css": { + "type": "string" + }, + "name": { + "type": "string" + }, + "next_stage": { + "type": "string" + }, + "remark": { + "type": "string" + }, + "xpath": { + "type": "string" + } + } + }, + "entity.ScrapyItem": { + "type": "object", + "properties": { + "fields": { + "type": "array", + "items": { + "type": "string" + } + }, + "name": { + "type": "string" + } + } + }, + "entity.ScrapySettingParam": { + "type": "object", + "properties": { + "key": { + "type": "string" + }, + "type": { + "type": "string" + }, + "value": { + "type": "object" + } + } + }, + "entity.Stage": { + "type": "object", + "properties": { + "fields": { + "type": "array", + "items": { + "$ref": "#/definitions/entity.Field" + } + }, + "is_list": { + "type": "boolean" + }, + "list_css": { + "type": "string" + }, + "list_xpath": { + "type": "string" + }, + "name": { + "type": "string" + }, + "page_attr": { + "type": "string" + }, + "page_css": { + "type": "string" + }, + "page_xpath": { + "type": "string" + } + } + }, + "model.Env": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "value": { + "type": "string" + } + } + }, + "model.Node": { + "type": "object", + "properties": { + "_id": { + "type": "string" + }, + "create_ts": { + "type": "string" + }, + "description": { + "type": "string" + }, + "hostname": { + "type": "string" + }, + "ip": { + "type": "string" + }, + "is_master": { + "description": "前端展示", + "type": "boolean" + }, + "key": { + "description": "用于唯一标识节点,可能是mac地址,可能是ip地址", + "type": "string" + }, + "mac": { + "type": "string" + }, + "name": { + "type": "string" + }, + "port": { + "type": "string" + }, + "status": { + "type": "string" + }, + "update_ts": { + "type": "string" + }, + "update_ts_unix": { + "type": "integer" + } + } + }, + "model.Project": { + "type": "object", + "properties": { + "_id": { + "type": "string" + }, + "create_ts": { + "type": "string" + }, + "description": { + "type": "string" + }, + "name": { + "type": "string" + }, + "spiders": { + "description": "前端展示", + "type": "array", + "items": { + "$ref": "#/definitions/model.Spider" + } + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "update_ts": { + "type": "string" + }, + "user_id": { + "type": "string" + }, + "username": { + "type": "string" + } + } + }, + "model.Schedule": 
{ + "type": "object", + "properties": { + "_id": { + "type": "string" + }, + "create_ts": { + "type": "string" + }, + "cron": { + "type": "string" + }, + "description": { + "type": "string" + }, + "enabled": { + "type": "boolean" + }, + "entry_id": { + "type": "integer" + }, + "message": { + "type": "string" + }, + "name": { + "type": "string" + }, + "node_ids": { + "type": "array", + "items": { + "type": "string" + } + }, + "nodes": { + "type": "array", + "items": { + "$ref": "#/definitions/model.Node" + } + }, + "param": { + "type": "string" + }, + "run_type": { + "type": "string" + }, + "scrapy_log_level": { + "type": "string" + }, + "scrapy_spider": { + "type": "string" + }, + "spider_id": { + "type": "string" + }, + "spider_name": { + "description": "前端展示", + "type": "string" + }, + "status": { + "type": "string" + }, + "update_ts": { + "type": "string" + }, + "user_id": { + "type": "string" + }, + "user_name": { + "type": "string" + } + } + }, + "model.Spider": { + "type": "object", + "properties": { + "_id": { + "description": "爬虫ID", + "type": "string" + }, + "cmd": { + "description": "自定义爬虫", + "type": "string" + }, + "col": { + "description": "结果储存位置", + "type": "string" + }, + "config": { + "description": "可配置爬虫配置", + "type": "object", + "$ref": "#/definitions/entity.ConfigSpiderData" + }, + "create_ts": { + "type": "string" + }, + "dedup_field": { + "description": "去重字段", + "type": "string" + }, + "dedup_method": { + "description": "去重方式", + "type": "string" + }, + "display_name": { + "description": "爬虫显示名称", + "type": "string" + }, + "envs": { + "description": "环境变量", + "type": "array", + "items": { + "$ref": "#/definitions/model.Env" + } + }, + "file_id": { + "description": "GridFS文件ID", + "type": "string" + }, + "git_auto_sync": { + "description": "Git 是否自动同步", + "type": "boolean" + }, + "git_branch": { + "description": "Git 分支", + "type": "string" + }, + "git_has_credential": { + "description": "Git 是否加密", + "type": "boolean" + }, + "git_password": { + "description": "Git 密码", + "type": "string" + }, + "git_sync_error": { + "description": "Git 同步错误", + "type": "string" + }, + "git_sync_frequency": { + "description": "Git 同步频率", + "type": "string" + }, + "git_url": { + "description": "Git URL", + "type": "string" + }, + "git_username": { + "description": "Git 用户名", + "type": "string" + }, + "is_dedup": { + "description": "去重", + "type": "boolean" + }, + "is_git": { + "description": "Git 设置", + "type": "boolean" + }, + "is_long_task": { + "description": "长任务", + "type": "boolean" + }, + "is_public": { + "description": "是否公开", + "type": "boolean" + }, + "is_scrapy": { + "description": "Scrapy 爬虫(属于自定义爬虫)", + "type": "boolean" + }, + "is_web_hook": { + "description": "Web Hook", + "type": "boolean" + }, + "last_run_ts": { + "description": "前端展示", + "type": "string" + }, + "last_status": { + "description": "最后执行状态", + "type": "string" + }, + "latest_tasks": { + "description": "最近任务列表", + "type": "array", + "items": { + "$ref": "#/definitions/model.Task" + } + }, + "name": { + "description": "爬虫名称(唯一)", + "type": "string" + }, + "project_id": { + "description": "项目ID", + "type": "string" + }, + "remark": { + "description": "备注", + "type": "string" + }, + "site": { + "description": "爬虫网站", + "type": "string" + }, + "spider_names": { + "description": "爬虫名称列表", + "type": "array", + "items": { + "type": "string" + } + }, + "src": { + "description": "源码位置", + "type": "string" + }, + "template": { + "description": "可配置爬虫", + "type": "string" + }, + "type": { + "description": "爬虫类别", + 
"type": "string" + }, + "update_ts": { + "type": "string" + }, + "user_id": { + "description": "时间", + "type": "string" + }, + "username": { + "description": "用户名称", + "type": "string" + }, + "web_hook_url": { + "description": "Web Hook URL", + "type": "string" + } + } + }, + "model.Task": { + "type": "object", + "properties": { + "_id": { + "type": "string" + }, + "cmd": { + "type": "string" + }, + "create_ts": { + "type": "string" + }, + "error": { + "type": "string" + }, + "error_log_count": { + "type": "integer" + }, + "finish_ts": { + "type": "string" + }, + "log_path": { + "type": "string" + }, + "node_id": { + "type": "string" + }, + "node_name": { + "type": "string" + }, + "param": { + "type": "string" + }, + "pid": { + "type": "integer" + }, + "result_count": { + "type": "integer" + }, + "run_type": { + "type": "string" + }, + "runtime_duration": { + "type": "number" + }, + "schedule_id": { + "type": "string" + }, + "spider_id": { + "type": "string" + }, + "spider_name": { + "description": "前端数据", + "type": "string" + }, + "start_ts": { + "type": "string" + }, + "status": { + "type": "string" + }, + "total_duration": { + "type": "number" + }, + "update_ts": { + "type": "string" + }, + "user_id": { + "type": "string" + }, + "username": { + "type": "string" + }, + "wait_duration": { + "type": "number" + } + } + }, + "routes.SpiderFileReqBody": { + "type": "object", + "properties": { + "content": { + "type": "string" + }, + "new_path": { + "type": "string" + }, + "path": { + "type": "string" + } + } + } + } +} \ No newline at end of file diff --git a/backend/docs/swagger.yaml b/backend/docs/swagger.yaml new file mode 100644 index 00000000..0ced87e0 --- /dev/null +++ b/backend/docs/swagger.yaml @@ -0,0 +1,1848 @@ +definitions: + entity.ConfigSpiderData: + properties: + cmd: + description: 自定义爬虫 + type: string + col: + type: string + display_name: + type: string + engine: + description: 可配置爬虫 + type: string + name: + description: 通用 + type: string + remark: + type: string + settings: + type: object + stages: + items: + $ref: '#/definitions/entity.Stage' + type: array + start_stage: + type: string + start_url: + type: string + type: + type: string + type: object + entity.Field: + properties: + attr: + type: string + css: + type: string + name: + type: string + next_stage: + type: string + remark: + type: string + xpath: + type: string + type: object + entity.ScrapyItem: + properties: + fields: + items: + type: string + type: array + name: + type: string + type: object + entity.ScrapySettingParam: + properties: + key: + type: string + type: + type: string + value: + type: object + type: object + entity.Stage: + properties: + fields: + items: + $ref: '#/definitions/entity.Field' + type: array + is_list: + type: boolean + list_css: + type: string + list_xpath: + type: string + name: + type: string + page_attr: + type: string + page_css: + type: string + page_xpath: + type: string + type: object + model.Env: + properties: + name: + type: string + value: + type: string + type: object + model.Node: + properties: + _id: + type: string + create_ts: + type: string + description: + type: string + hostname: + type: string + ip: + type: string + is_master: + description: 前端展示 + type: boolean + key: + description: 用于唯一标识节点,可能是mac地址,可能是ip地址 + type: string + mac: + type: string + name: + type: string + port: + type: string + status: + type: string + update_ts: + type: string + update_ts_unix: + type: integer + type: object + model.Project: + properties: + _id: + type: string + create_ts: + type: 
string + description: + type: string + name: + type: string + spiders: + description: 前端展示 + items: + $ref: '#/definitions/model.Spider' + type: array + tags: + items: + type: string + type: array + update_ts: + type: string + user_id: + type: string + username: + type: string + type: object + model.Schedule: + properties: + _id: + type: string + create_ts: + type: string + cron: + type: string + description: + type: string + enabled: + type: boolean + entry_id: + type: integer + message: + type: string + name: + type: string + node_ids: + items: + type: string + type: array + nodes: + items: + $ref: '#/definitions/model.Node' + type: array + param: + type: string + run_type: + type: string + scrapy_log_level: + type: string + scrapy_spider: + type: string + spider_id: + type: string + spider_name: + description: 前端展示 + type: string + status: + type: string + update_ts: + type: string + user_id: + type: string + user_name: + type: string + type: object + model.Spider: + properties: + _id: + description: 爬虫ID + type: string + cmd: + description: 自定义爬虫 + type: string + col: + description: 结果储存位置 + type: string + config: + $ref: '#/definitions/entity.ConfigSpiderData' + description: 可配置爬虫配置 + type: object + create_ts: + type: string + dedup_field: + description: 去重字段 + type: string + dedup_method: + description: 去重方式 + type: string + display_name: + description: 爬虫显示名称 + type: string + envs: + description: 环境变量 + items: + $ref: '#/definitions/model.Env' + type: array + file_id: + description: GridFS文件ID + type: string + git_auto_sync: + description: Git 是否自动同步 + type: boolean + git_branch: + description: Git 分支 + type: string + git_has_credential: + description: Git 是否加密 + type: boolean + git_password: + description: Git 密码 + type: string + git_sync_error: + description: Git 同步错误 + type: string + git_sync_frequency: + description: Git 同步频率 + type: string + git_url: + description: Git URL + type: string + git_username: + description: Git 用户名 + type: string + is_dedup: + description: 去重 + type: boolean + is_git: + description: Git 设置 + type: boolean + is_long_task: + description: 长任务 + type: boolean + is_public: + description: 是否公开 + type: boolean + is_scrapy: + description: Scrapy 爬虫(属于自定义爬虫) + type: boolean + is_web_hook: + description: Web Hook + type: boolean + last_run_ts: + description: 前端展示 + type: string + last_status: + description: 最后执行状态 + type: string + latest_tasks: + description: 最近任务列表 + items: + $ref: '#/definitions/model.Task' + type: array + name: + description: 爬虫名称(唯一) + type: string + project_id: + description: 项目ID + type: string + remark: + description: 备注 + type: string + site: + description: 爬虫网站 + type: string + spider_names: + description: 爬虫名称列表 + items: + type: string + type: array + src: + description: 源码位置 + type: string + template: + description: 可配置爬虫 + type: string + type: + description: 爬虫类别 + type: string + update_ts: + type: string + user_id: + description: 时间 + type: string + username: + description: 用户名称 + type: string + web_hook_url: + description: Web Hook URL + type: string + type: object + model.Task: + properties: + _id: + type: string + cmd: + type: string + create_ts: + type: string + error: + type: string + error_log_count: + type: integer + finish_ts: + type: string + log_path: + type: string + node_id: + type: string + node_name: + type: string + param: + type: string + pid: + type: integer + result_count: + type: integer + run_type: + type: string + runtime_duration: + type: number + schedule_id: + type: string + spider_id: + type: string + 
spider_name: + description: 前端数据 + type: string + start_ts: + type: string + status: + type: string + total_duration: + type: number + update_ts: + type: string + user_id: + type: string + username: + type: string + wait_duration: + type: number + type: object + routes.SpiderFileReqBody: + properties: + content: + type: string + new_path: + type: string + path: + type: string + type: object +info: + contact: {} + license: {} +paths: + /nodes: + get: + description: Get nodes + parameters: + - description: With the bearer started + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get nodes + tags: + - node + /nodes/{id}: + delete: + description: Delete node + parameters: + - description: With the bearer started + in: header + name: Authorization + required: true + type: string + - description: node id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Delete node + tags: + - node + get: + description: Get node + parameters: + - description: With the bearer started + in: header + name: Authorization + required: true + type: string + - description: id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get node + tags: + - node + post: + consumes: + - application/json + description: Post node + parameters: + - description: With the bearer started + in: header + name: Authorization + required: true + type: string + - description: post node + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Post node + tags: + - node + /nodes/{id}/system: + get: + description: Get system info + parameters: + - description: With the bearer started + in: header + name: Authorization + required: true + type: string + - description: node id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get system info + tags: + - node + /nodes/{id}/tasks: + get: + description: Get tasks on node + parameters: + - description: With the bearer started + in: header + name: Authorization + required: true + type: string + - description: node id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get tasks on node + tags: + - node + /projects: + get: + description: Get projects + parameters: + - description: With the bearer started + in: header + name: Authorization + required: true + type: string + - description: projects + in: query + name: tag + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get projects + tags: + - project + put: + consumes: + - application/json + 
description: Put project + parameters: + - description: With the bearer started + in: header + name: Authorization + required: true + type: string + - description: post project + in: body + name: p + required: true + schema: + $ref: '#/definitions/model.Project' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Put project + tags: + - project + /projects/{id}: + delete: + description: Delete project + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: project id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Delete project + tags: + - project + post: + consumes: + - application/json + description: Post project + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: project id + in: path + name: id + required: true + type: string + - description: project item + in: body + name: item + required: true + schema: + $ref: '#/definitions/model.Project' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Post project + tags: + - project + /projects/tags: + get: + description: Get projects tags + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get project tags + tags: + - project + /schedules: + get: + description: Get schedule list + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get schedule list + tags: + - schedule + put: + consumes: + - application/json + description: Put schedule + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: schedule item + in: body + name: item + required: true + schema: + $ref: '#/definitions/model.Schedule' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Put schedule + tags: + - schedule + /schedules/{id}: + delete: + description: Delete schedule + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: schedule id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Delete schedule + tags: + - schedule + get: + description: Get schedule by id + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: schedule id + in: path + name: id + required: true + type: string + 
produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get schedule by id + tags: + - schedule + post: + consumes: + - application/json + description: Post schedule + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: schedule id + in: path + name: id + required: true + type: string + - description: schedule item + in: body + name: newItem + required: true + schema: + $ref: '#/definitions/model.Schedule' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Post schedule + tags: + - schedule + /schedules/{id}/disable: + post: + consumes: + - application/json + description: disable schedule + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: schedule id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: disable schedule + tags: + - schedule + /schedules/{id}/enable: + post: + consumes: + - application/json + description: enable schedule + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: schedule id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: enable schedule + tags: + - schedule + /setting: + get: + description: Get setting + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get setting + tags: + - setting + /spiders: + post: + consumes: + - application/json + description: delete spider + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: delete spider + tags: + - spider + put: + consumes: + - application/json + description: Put spider + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider item + in: body + name: spider + required: true + schema: + $ref: '#/definitions/model.Spider' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Put spider + tags: + - spider + /spiders-cancel: + post: + consumes: + - application/json + description: cancel spider + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: cancel spider + tags: + 
- spider + /spiders-run: + post: + consumes: + - application/json + description: run spider + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: run spider + tags: + - spider + /spiders/{id}: + delete: + description: Delete spider by id + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Delete spider by id + tags: + - spider + get: + description: Get spider by id + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: schedule id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get spider by id + tags: + - spider + post: + consumes: + - application/json + description: Post spider + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: schedule id + in: path + name: id + required: true + type: string + - description: spider item + in: body + name: item + required: true + schema: + $ref: '#/definitions/model.Spider' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Post spider + tags: + - spider + /spiders/{id}/copy: + post: + consumes: + - application/json + description: Copy spider + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: schedule id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Copy spider + tags: + - spider + /spiders/{id}/dir: + get: + description: Get spider dir + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + - description: path + in: query + name: path + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get spider dir + tags: + - spider + /spiders/{id}/file: + delete: + description: Delete spider file + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + - description: path + in: body + name: reqBody + required: true + schema: + $ref: '#/definitions/routes.SpiderFileReqBody' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Delete spider 
file + tags: + - spider + get: + description: Get spider file + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + - description: path + in: query + name: path + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get spider file + tags: + - spider + post: + description: Put spider file + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + - description: path + in: body + name: reqBody + required: true + schema: + $ref: '#/definitions/routes.SpiderFileReqBody' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Put spider file + tags: + - spider + put: + description: Post spider dir + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + - description: path + in: body + name: reqBody + required: true + schema: + $ref: '#/definitions/routes.SpiderFileReqBody' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Post spider dir + tags: + - spider + /spiders/{id}/file/rename: + post: + description: Rename spider file + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + - description: path + in: body + name: reqBody + required: true + schema: + $ref: '#/definitions/routes.SpiderFileReqBody' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Rename spider file + tags: + - spider + /spiders/{id}/file/tree: + get: + description: Get spider dir + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get spider dir + tags: + - spider + /spiders/{id}/git/reset: + post: + description: Post spider reset git + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Post spider reset git + tags: + - spider + /spiders/{id}/git/sync: + post: + description: Post spider sync git + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json 
+ responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Post spider sync git + tags: + - spider + /spiders/{id}/publish: + post: + consumes: + - application/json + description: Publish spider + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: schedule id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Publish spider + tags: + - spider + /spiders/{id}/schedules: + get: + description: Get schedules + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get schedules + tags: + - spider + /spiders/{id}/scrapy/items: + get: + description: Get scrapy spider items + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get scrapy spider items + tags: + - spider + post: + description: Post scrapy spider items + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + - description: req data + in: body + name: reqData + required: true + schema: + items: + $ref: '#/definitions/entity.ScrapyItem' + type: entity.ScrapyItem + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Post scrapy spider items + tags: + - spider + /spiders/{id}/scrapy/pipelines: + get: + description: Get scrapy spider pipelines + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get scrapy spider pipelines + tags: + - spider + /spiders/{id}/scrapy/settings: + get: + description: Get scrapy spider settings + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get scrapy spider settings + tags: + - spider + post: + description: Get scrapy spider file + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + - description: req data + in: body + name: reqData + required: true + schema: + 
items: + $ref: '#/definitions/entity.ScrapySettingParam' + type: entity.ScrapySettingParam + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get scrapy spider file + tags: + - spider + /spiders/{id}/scrapy/spider/filepath: + get: + description: Get scrapy spider file path + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get scrapy spider file path + tags: + - spider + /spiders/{id}/scrapy/spiders: + get: + description: Get scrapy spider file + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get scrapy spider file + tags: + - spider + put: + description: Put scrapy spider file + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Put scrapy spider file + tags: + - spider + /spiders/{id}/stats: + get: + description: Get spider stats + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get spider stats + tags: + - spider + /spiders/{id}/tasks: + get: + description: Get task list + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get task list + tags: + - spider + /spiders/{id}/upload: + post: + consumes: + - application/json + description: Upload spider by id + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider file to upload + in: formData + name: file + required: true + type: file + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Upload spider by id + tags: + - spider + /version: + get: + description: Get version + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: 
Bad Request + schema: + type: json + summary: Get version + tags: + - setting +swagger: "2.0" diff --git a/backend/go.mod b/backend/go.mod index 7503389a..6a6e65c1 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -6,6 +6,7 @@ require ( github.com/Masterminds/semver v1.4.2 // indirect github.com/Masterminds/sprig v2.16.0+incompatible // indirect github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd + github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc github.com/aokoli/goutils v1.0.1 // indirect github.com/apex/log v1.1.1 github.com/dgrijalva/jwt-go v3.2.0+incompatible @@ -30,6 +31,8 @@ require ( github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337 github.com/spf13/viper v1.4.0 github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect + github.com/swaggo/gin-swagger v1.2.0 + github.com/swaggo/swag v1.5.1 gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect gopkg.in/go-playground/validator.v9 v9.29.1 gopkg.in/gomail.v2 v2.0.0-20150902115704-41f357289737 diff --git a/backend/go.sum b/backend/go.sum index 1a253f5d..181899a8 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -6,10 +6,15 @@ github.com/Masterminds/semver v1.4.2/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF0 github.com/Masterminds/sprig v2.16.0+incompatible h1:QZbMUPxRQ50EKAq3LFMnxddMu88/EUUG3qmxwtDmPsY= github.com/Masterminds/sprig v2.16.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuNhlNS5hqE0NB0E6fgfo2Br3o= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= +github.com/PuerkitoBio/purell v1.1.0 h1:rmGxhojJlM0tuKtfdvliR84CFHljx9ag64t2xmVkjK4= +github.com/PuerkitoBio/purell v1.1.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd h1:+CYOsXi89xOqBkj7CuEJjA2It+j+R3ngUZEydr6mtkw= github.com/Unknwon/goconfig v0.0.0-20191126170842-860a72fb44fd/go.mod h1:wngxua9XCNjvHjDiTiV26DaKDT+0c63QR6H5hjVUUxw= github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7 h1:uSoVVbwJiQipAclBbw+8quDsfcvFjOpI5iCf4p/cqCs= github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7/go.mod h1:6zEj6s6u/ghQa61ZWa/C2Aw3RkjiTBOix7dkqa1VLIs= +github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc h1:cAKDfWh5VpdgMhJosfJnn5/FoN2SRZ4p7fJNX58YPaU= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 h1:kFOfPq6dUM1hTo4JG6LR5AXSUEsOjtdm0kw0FtQtMJA= @@ -52,8 +57,14 @@ github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHqu github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= +github.com/gin-contrib/gzip v0.0.1 h1:ezvKOL6jH+jlzdHNE4h9h8q8uMpDQjyl0NN0Jd7jozc= +github.com/gin-contrib/gzip v0.0.1/go.mod h1:fGBJBCdt6qCZuCAOwWuFhBB4OOq9EFqlo5dEaFhhu5w= +github.com/gin-contrib/sse v0.0.0-20170109093832-22d885f9ecc7/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= 
github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3 h1:t8FVkw33L+wilf2QiWkw0UV77qRpcH/JHPKGpKa2E8g= github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= +github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE= +github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= +github.com/gin-gonic/gin v1.3.0/go.mod h1:7cKuhb5qV2ggCFctp2fJQ+ErvciLZrIeoOSOm6mUr7Y= github.com/gin-gonic/gin v1.4.0 h1:3tMoCCfM7ppqsR0ptz/wi1impNpT7/9wQtMZ8lr1mCQ= github.com/gin-gonic/gin v1.4.0/go.mod h1:OW2EZn3DO8Ln9oIKOvM++LBO+5UPHJJDH72/q/3rZdM= github.com/gliderlabs/ssh v0.2.2 h1:6zsha5zo/TWhRhwqCD3+EarCAgZ2yN28ipRnGPnwkI0= @@ -63,6 +74,15 @@ github.com/globalsign/mgo v0.0.0-20181015135952-eeefdecb41b8/go.mod h1:xkRDCp4j0 github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-openapi/jsonpointer v0.17.0 h1:nH6xp8XdXHx8dqveo0ZuJBluCO2qGrPbDNZ0dwoRHP0= +github.com/go-openapi/jsonpointer v0.17.0/go.mod h1:cOnomiV+CVVwFLk0A/MExoFMjwdsUdVpsRhURCKh+3M= +github.com/go-openapi/jsonreference v0.17.0/go.mod h1:g4xxGn04lDIRh0GJb5QlpE3HfopLOL6uZrK/VgnsK9I= +github.com/go-openapi/jsonreference v0.19.0 h1:BqWKpV1dFd+AuiKlgtddwVIFQsuMpxfBDBHGfM2yNpk= +github.com/go-openapi/jsonreference v0.19.0/go.mod h1:g4xxGn04lDIRh0GJb5QlpE3HfopLOL6uZrK/VgnsK9I= +github.com/go-openapi/spec v0.19.0 h1:A4SZ6IWh3lnjH0rG0Z5lkxazMGBECtrZcbyYQi+64k4= +github.com/go-openapi/spec v0.19.0/go.mod h1:XkF/MOi14NmjsfZ8VtAKf8pIlbZzyoTvZsdfssdxcBI= +github.com/go-openapi/swag v0.17.0 h1:iqrgMg7Q7SvtbWLlltPrkMs0UBJI6oTSs79JFRUi880= +github.com/go-openapi/swag v0.17.0/go.mod h1:AByQ+nYG6gQg71GINrmuDXCPWdL640yX49/kXLo40Tg= github.com/go-playground/locales v0.12.1 h1:2FITxuFt/xuCNP1Acdhv62OzaCiviiE4kotfhkmOqEc= github.com/go-playground/locales v0.12.1/go.mod h1:IUMDtCfWo/w/mtMfIE/IG2K+Ey3ygWanZIBtBW0W2TM= github.com/go-playground/universal-translator v0.16.0 h1:X++omBR/4cE2MNg91AoC3rmGrCjJ8eAeUP/K/EKx4DM= @@ -112,6 +132,7 @@ github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht github.com/jmespath/go-jmespath v0.3.0/go.mod h1:9QtRXoHjLGCJ5IBSaohpXITPlowMeeYCZ7fLUTSywik= github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= github.com/jpillora/backoff v0.0.0-20180909062703-3050d21c67d7/go.mod h1:2iMrUgbbvHEiQClaW2NsSzMyGHqN+rDFqY705q49KG0= +github.com/json-iterator/go v1.1.5/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.6 h1:MrUvLMLTMxbqFJ9kzlvat/rYZqZnW3u4wkLzWTaFwKs= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= @@ -133,12 +154,14 @@ github.com/leodido/go-urn v1.1.0 h1:Sm1gr51B1kKyfD2BlRcLSiEkffoG96g6TPv6eRoEiB8= github.com/leodido/go-urn v1.1.0/go.mod h1:+cyI34gQWZcE1eQU7NVgKkkzdXDQHr1dBMtdAPozLkw= github.com/magiconair/properties v1.8.0 h1:LLgXmsheXeRoUOBOjtwPQCWIYqM/LU1ayDtDePerRcY= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= +github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.7.1 h1:mdxE1MF9o53iCb2Ghj1VfWvh7ZOwHpnVG/xwXrV90U8= 
github.com/mailru/easyjson v0.7.1/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs= github.com/matcornic/hermes v1.2.0 h1:AuqZpYcTOtTB7cahdevLfnhIpfzmpqw5Czv8vpdnFDU= github.com/matcornic/hermes v1.2.0/go.mod h1:lujJomb016Xjv8wBnWlNvUdtmvowjjfkqri5J/+1hYc= github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= +github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.7/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.8 h1:HLtExJ+uU2HOZ+wI0Tt5DtUDrx8yhUqDcp7fYERX4CE= @@ -224,6 +247,10 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/swaggo/gin-swagger v1.2.0 h1:YskZXEiv51fjOMTsXrOetAjrMDfFaXD79PEoQBOe2W0= +github.com/swaggo/gin-swagger v1.2.0/go.mod h1:qlH2+W7zXGZkczuL+r2nEBR2JTT+/lX05Nn6vPhc7OI= +github.com/swaggo/swag v1.5.1 h1:2Agm8I4K5qb00620mHq0VJ05/KT4FtmALPIcQR9lEZM= +github.com/swaggo/swag v1.5.1/go.mod h1:1Bl9F/ZBpVWh22nY0zmYyASPO1lI/zIwRDrpZU+tv8Y= github.com/tj/assert v0.0.0-20171129193455-018094318fb0/go.mod h1:mZ9/Rh9oLWpLLDRpvE+3b7gP/C2YyLFYxNmcLnPTMe0= github.com/tj/go-elastic v0.0.0-20171221160941-36157cbbebc2/go.mod h1:WjeM0Oo1eNAjXGDx2yma7uG2XoyRZTq1uv3M/o7imD0= github.com/tj/go-kinesis v0.0.0-20171128231115-08b17f58cb1b/go.mod h1:/yhzCV0xPfx6jb1bBgRFjl5lytqVqZXEaeqWP8lTEao= @@ -231,6 +258,12 @@ github.com/tj/go-spin v1.1.0/go.mod h1:Mg1mzmePZm4dva8Qz60H2lHwmJ2loum4VIrLgVnKw github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/ugorji/go v1.1.4 h1:j4s+tAvLfL3bZyefP2SEWmhBzmuIlH/eqNuPdFPgngw= github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= +github.com/ugorji/go v1.1.5-pre h1:jyJKFOSEbdOc2HODrf2qcCkYOdq7zzXqA9bhW5oV4fM= +github.com/ugorji/go v1.1.5-pre/go.mod h1:FwP/aQVg39TXzItUBMwnWp9T9gPQnXw4Poh4/oBQZ/0= +github.com/ugorji/go/codec v0.0.0-20181022190402-e5e69e061d4f/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= +github.com/ugorji/go/codec v1.1.5-pre h1:5YV9PsFAN+ndcCtTM7s60no7nY7eTG3LPtxhSwuxzCs= +github.com/ugorji/go/codec v1.1.5-pre/go.mod h1:tULtS6Gy1AE1yCENaw4Vb//HLH5njI2tfCQDUqRd8fI= +github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= github.com/xanzy/ssh-agent v0.2.1 h1:TCbipTQL2JiiCprBWx9frJ2eJlCYT00NmctrHxVAr70= github.com/xanzy/ssh-agent v0.2.1/go.mod h1:mLlQY/MoOhWBj+gOGMQkOeiEvkx+8pJSI+0Bx9h2kr4= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= @@ -254,6 +287,7 @@ golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHl golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net 
v0.0.0-20181005035420-146acd28ed58/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -261,6 +295,7 @@ golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190611141213-3f473d35a33a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190724013045-ca1201d0de80 h1:Ao/3l156eZf2AW5wK8a7/smtodRU+gha3+BeqJ69lRk= @@ -278,12 +313,14 @@ golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20181228144115-9a3f9b0469bb/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190221075227-b4e8571b14e0/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190610200419-93c9922d18ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e h1:D5TXcfTk7xF7hvieo4QErS3qqCB4teTffacDWr7CI+0= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= @@ -297,6 +334,9 @@ golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190606050223-4d9ae51c2468/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190611222205-d73e1c7e250b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190729092621-ff9f1409240a h1:mEQZbbaBjWyLNy0tmZmgEuQAR8XOQ3hL8GYi3J/NG64= 
golang.org/x/tools v0.0.0-20190729092621-ff9f1409240a/go.mod h1:jcCCGcm9btYwXyDqrUWc6MKQKKGJCWEQ3AfLSRIbEuI= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/backend/main.go b/backend/main.go index 7ca18335..0c787ce6 100644 --- a/backend/main.go +++ b/backend/main.go @@ -4,6 +4,7 @@ import ( "context" "crawlab/config" "crawlab/database" + _ "crawlab/docs" "crawlab/lib/validate_bridge" "crawlab/middlewares" "crawlab/model" @@ -16,6 +17,8 @@ import ( "github.com/gin-gonic/gin/binding" "github.com/olivere/elastic/v7" "github.com/spf13/viper" + "github.com/swaggo/gin-swagger" + "github.com/swaggo/gin-swagger/swaggerFiles" "net" "net/http" "os" @@ -25,9 +28,17 @@ import ( "time" ) +var swagHandler gin.HandlerFunc + +func init() { + swagHandler = ginSwagger.WrapHandler(swaggerFiles.Handler) +} func main() { binding.Validator = new(validate_bridge.DefaultValidator) app := gin.Default() + if swagHandler != nil { + app.GET("/swagger/*any", swagHandler) + } // 初始化配置 if err := config.InitConfig(""); err != nil { diff --git a/backend/routes/node.go b/backend/routes/node.go index 7d030773..dfa593eb 100644 --- a/backend/routes/node.go +++ b/backend/routes/node.go @@ -8,6 +8,14 @@ import ( "net/http" ) +// @Summary Get nodes +// @Description Get nodes +// @Tags node +// @Produce json +// @Param Authorization header string true "With the bearer started" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /nodes [get] func GetNodeList(c *gin.Context) { nodes, err := model.GetNodeList(nil) if err != nil { @@ -26,6 +34,15 @@ func GetNodeList(c *gin.Context) { }) } +// @Summary Get node +// @Description Get node +// @Tags node +// @Produce json +// @Param Authorization header string true "With the bearer started" +// @Param id path string true "id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /nodes/{id} [get] func GetNode(c *gin.Context) { id := c.Param("id") @@ -54,6 +71,17 @@ func Ping(c *gin.Context) { }) } + +// @Summary Post node +// @Description Post node +// @Tags node +// @Accept json +// @Produce json +// @Param Authorization header string true "With the bearer started" +// @Param id path string true "post node" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /nodes/{id} [post] func PostNode(c *gin.Context) { id := c.Param("id") @@ -81,6 +109,15 @@ func PostNode(c *gin.Context) { }) } +// @Summary Get tasks on node +// @Description Get tasks on node +// @Tags node +// @Produce json +// @Param Authorization header string true "With the bearer started" +// @Param id path string true "node id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /nodes/{id}/tasks [get] func GetNodeTaskList(c *gin.Context) { id := c.Param("id") @@ -97,6 +134,15 @@ func GetNodeTaskList(c *gin.Context) { }) } +// @Summary Get system info +// @Description Get system info +// @Tags node +// @Produce json +// @Param Authorization header string true "With the bearer started" +// @Param id path string true "node id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /nodes/{id}/system [get] func GetSystemInfo(c *gin.Context) { id := c.Param("id") @@ -109,6 +155,15 @@ func GetSystemInfo(c *gin.Context) { }) } +// @Summary Delete node +// @Description Delete node 
+// @Tags node +// @Produce json +// @Param Authorization header string true "With the bearer started" +// @Param id path string true "node id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /nodes/{id} [delete] func DeleteNode(c *gin.Context) { id := c.Param("id") node, err := model.GetNode(bson.ObjectIdHex(id)) diff --git a/backend/routes/project.go b/backend/routes/project.go index f0dd1198..86a8286d 100644 --- a/backend/routes/project.go +++ b/backend/routes/project.go @@ -10,6 +10,15 @@ import ( "net/http" ) +// @Summary Get projects +// @Description Get projects +// @Tags project +// @Produce json +// @Param Authorization header string true "With the bearer started" +// @Param tag query string true "projects" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /projects [get] func GetProjectList(c *gin.Context) { tag := c.Query("tag") @@ -70,6 +79,16 @@ func GetProjectList(c *gin.Context) { }) } +// @Summary Put project +// @Description Put project +// @Tags project +// @Accept json +// @Produce json +// @Param Authorization header string true "With the bearer started" +// @Param p body model.Project true "post project" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /projects [put] func PutProject(c *gin.Context) { // 绑定请求数据 var p model.Project @@ -92,6 +111,17 @@ func PutProject(c *gin.Context) { }) } +// @Summary Post project +// @Description Post project +// @Tags project +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "project id" +// @Param item body model.Project true "project item" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /projects/{id} [post] func PostProject(c *gin.Context) { id := c.Param("id") @@ -116,6 +146,15 @@ func PostProject(c *gin.Context) { }) } +// @Summary Delete project +// @Description Delete project +// @Tags project +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "project id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /projects/{id} [delete] func DeleteProject(c *gin.Context) { id := c.Param("id") @@ -154,6 +193,14 @@ func DeleteProject(c *gin.Context) { }) } +// @Summary Get project tags +// @Description Get projects tags +// @Tags project +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /projects/tags [get] func GetProjectTags(c *gin.Context) { type Result struct { Tag string `json:"tag" bson:"tag"` diff --git a/backend/routes/schedule.go b/backend/routes/schedule.go index 27ad7825..e1e2fef3 100644 --- a/backend/routes/schedule.go +++ b/backend/routes/schedule.go @@ -8,6 +8,14 @@ import ( "net/http" ) +// @Summary Get schedule list +// @Description Get schedule list +// @Tags schedule +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /schedules [get] func GetScheduleList(c *gin.Context) { query := bson.M{} @@ -22,6 +30,15 @@ func GetScheduleList(c *gin.Context) { HandleSuccessData(c, results) } +// @Summary Get schedule by id +// @Description Get schedule by id +// @Tags schedule +// @Produce json +// @Param Authorization header string true "Authorization token" 
+// @Param id path string true "schedule id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /schedules/{id} [get] func GetSchedule(c *gin.Context) { id := c.Param("id") @@ -34,6 +51,17 @@ func GetSchedule(c *gin.Context) { HandleSuccessData(c, result) } +// @Summary Post schedule +// @Description Post schedule +// @Tags schedule +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "schedule id" +// @Param newItem body model.Schedule true "schedule item" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /schedules/{id} [post] func PostSchedule(c *gin.Context) { id := c.Param("id") @@ -66,6 +94,16 @@ func PostSchedule(c *gin.Context) { HandleSuccess(c) } +// @Summary Put schedule +// @Description Put schedule +// @Tags schedule +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param item body model.Schedule true "schedule item" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /schedules [put] func PutSchedule(c *gin.Context) { var item model.Schedule @@ -99,6 +137,15 @@ func PutSchedule(c *gin.Context) { HandleSuccess(c) } +// @Summary Delete schedule +// @Description Delete schedule +// @Tags schedule +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "schedule id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /schedules/{id} [delete] func DeleteSchedule(c *gin.Context) { id := c.Param("id") @@ -118,6 +165,16 @@ func DeleteSchedule(c *gin.Context) { } // 停止定时任务 +// @Summary disable schedule +// @Description disable schedule +// @Tags schedule +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "schedule id" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /schedules/{id}/disable [post] func DisableSchedule(c *gin.Context) { id := c.Param("id") if err := services.Sched.Disable(bson.ObjectIdHex(id)); err != nil { @@ -128,6 +185,16 @@ func DisableSchedule(c *gin.Context) { } // 运行定时任务 +// @Summary enable schedule +// @Description enable schedule +// @Tags schedule +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "schedule id" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /schedules/{id}/enable [post] func EnableSchedule(c *gin.Context) { id := c.Param("id") if err := services.Sched.Enable(bson.ObjectIdHex(id)); err != nil { diff --git a/backend/routes/setting.go b/backend/routes/setting.go index 36bc46ca..5faea750 100644 --- a/backend/routes/setting.go +++ b/backend/routes/setting.go @@ -13,6 +13,14 @@ type SettingBody struct { EnableDemoSpiders string `json:"enable_demo_spiders"` } +// @Summary Get version +// @Description Get version +// @Tags setting +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /version [get] func GetVersion(c *gin.Context) { version := viper.GetString("version") @@ -23,6 +31,14 @@ func GetVersion(c *gin.Context) { }) } +// @Summary Get setting +// @Description Get setting +// @Tags setting +// @Produce json +// @Param Authorization header string true 
"Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /setting [get] func GetSetting(c *gin.Context) { body := SettingBody{ AllowRegister: viper.GetString("setting.allowRegister"), diff --git a/backend/routes/spider.go b/backend/routes/spider.go index b8feef6f..f229088e 100644 --- a/backend/routes/spider.go +++ b/backend/routes/spider.go @@ -28,6 +28,22 @@ import ( // ======== 爬虫管理 ======== +// @Summary Get spider list +// @Description Get spider list +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param page_num query string false "page num" +// @Param page_size query string false "page size" +// @Param keyword query string false "keyword" +// @Param project_id query string false "project_id" +// @Param type query string false "type" +// @Param sort_key query string false "sort_key" +// @Param sort_direction query string false "sort_direction" +// @Param owner_type query string false "owner_type" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /schedules [get] func GetSpiderList(c *gin.Context) { pageNum := c.Query("page_num") pageSize := c.Query("page_size") @@ -109,6 +125,15 @@ func GetSpiderList(c *gin.Context) { }) } +// @Summary Get spider by id +// @Description Get spider by id +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "schedule id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id} [get] func GetSpider(c *gin.Context) { id := c.Param("id") @@ -129,6 +154,17 @@ func GetSpider(c *gin.Context) { }) } +// @Summary Post spider +// @Description Post spider +// @Tags spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "schedule id" +// @Param item body model.Spider true "spider item" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /spiders/{id} [post] func PostSpider(c *gin.Context) { id := c.Param("id") @@ -177,6 +213,16 @@ func PostSpider(c *gin.Context) { }) } +// @Summary Publish spider +// @Description Publish spider +// @Tags spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "schedule id" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /spiders/{id}/publish [post] func PublishSpider(c *gin.Context) { id := c.Param("id") @@ -198,6 +244,16 @@ func PublishSpider(c *gin.Context) { }) } +// @Summary Put spider +// @Description Put spider +// @Tags spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param spider body model.Spider true "spider item" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /spiders [put] func PutSpider(c *gin.Context) { var spider model.Spider if err := c.ShouldBindJSON(&spider); err != nil { @@ -279,6 +335,16 @@ func PutSpider(c *gin.Context) { }) } +// @Summary Copy spider +// @Description Copy spider +// @Tags spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "schedule id" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /spiders/{id}/copy [post] func CopySpider(c *gin.Context) { type ReqBody 
struct { Name string `json:"name"` @@ -326,6 +392,20 @@ func CopySpider(c *gin.Context) { }) } +// @Summary Upload spider +// @Description Upload spider +// @Tags spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param file formData file true "spider file to upload" +// @Param name formData string true "spider name" +// @Param display_name formData string true "display name" +// @Param col formData string true "col" +// @Param cmd formData string true "cmd" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /spiders [post] func UploadSpider(c *gin.Context) { // 从body中获取文件 uploadFile, err := c.FormFile("file") @@ -467,6 +547,17 @@ func UploadSpider(c *gin.Context) { }) } +// @Summary Upload spider by id +// @Description Upload spider by id +// @Tags spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param file formData file true "spider file to upload" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /spiders/{id}/upload [post] func UploadSpiderFromId(c *gin.Context) { // TODO: 与 UploadSpider 部分逻辑重复,需要优化代码 // 爬虫ID @@ -560,6 +651,15 @@ func UploadSpiderFromId(c *gin.Context) { }) } +// @Summary Delete spider by id +// @Description Delete spider by id +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id} [delete] func DeleteSpider(c *gin.Context) { id := c.Param("id") @@ -585,6 +685,15 @@ func DeleteSpider(c *gin.Context) { }) } +// @Summary delete spider +// @Description delete spider +// @Tags spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /spiders [post] func DeleteSelectedSpider(c *gin.Context) { type ReqBody struct { SpiderIds []string `json:"spider_ids"` @@ -615,6 +724,15 @@ func DeleteSelectedSpider(c *gin.Context) { }) } +// @Summary cancel spider +// @Description cancel spider +// @Tags spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /spiders-cancel [post] func CancelSelectedSpider(c *gin.Context) { type ReqBody struct { SpiderIds []string `json:"spider_ids"` @@ -639,6 +757,15 @@ func CancelSelectedSpider(c *gin.Context) { }) } +// @Summary run spider +// @Description run spider +// @Tags spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /spiders-run [post] func RunSelectedSpider(c *gin.Context) { type TaskParam struct { SpiderId bson.ObjectId `json:"spider_id"` @@ -734,6 +861,15 @@ func RunSelectedSpider(c *gin.Context) { }) } +// @Summary Get task list +// @Description Get task list +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/tasks [get] func GetSpiderTasks(c *gin.Context) { id := c.Param("id") @@ -756,6 +892,15 @@ func 
GetSpiderTasks(c *gin.Context) { }) } +// @Summary Get spider stats +// @Description Get spider stats +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/stats [get] func GetSpiderStats(c *gin.Context) { type Overview struct { TaskCount int `json:"task_count" bson:"task_count"` @@ -876,6 +1021,15 @@ func GetSpiderStats(c *gin.Context) { }) } +// @Summary Get schedules +// @Description Get schedules +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/schedules [get] func GetSpiderSchedules(c *gin.Context) { id := c.Param("id") @@ -902,6 +1056,16 @@ func GetSpiderSchedules(c *gin.Context) { // ======== 爬虫文件管理 ======== +// @Summary Get spider dir +// @Description Get spider dir +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Param path query string true "path" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/dir [get] func GetSpiderDir(c *gin.Context) { // 爬虫ID id := c.Param("id") @@ -949,6 +1113,16 @@ type SpiderFileReqBody struct { NewPath string `json:"new_path"` } +// @Summary Get spider file +// @Description Get spider file +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Param path query string true "path" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/file [get] func GetSpiderFile(c *gin.Context) { // 爬虫ID id := c.Param("id") @@ -977,6 +1151,15 @@ func GetSpiderFile(c *gin.Context) { }) } +// @Summary Get spider dir +// @Description Get spider dir +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/file/tree [get] func GetSpiderFileTree(c *gin.Context) { // 爬虫ID id := c.Param("id") @@ -1007,6 +1190,16 @@ func GetSpiderFileTree(c *gin.Context) { }) } +// @Summary Post spider file +// @Description Post spider file +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Param reqBody body routes.SpiderFileReqBody true "path" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/file [post] func PostSpiderFile(c *gin.Context) { // 爬虫ID id := c.Param("id") @@ -1044,6 +1237,16 @@ func PostSpiderFile(c *gin.Context) { }) } +// @Summary Put spider file +// @Description Put spider file +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Param reqBody body routes.SpiderFileReqBody true "path" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/file [post] func PutSpiderFile(c *gin.Context) { spiderId := c.Param("id") var reqBody SpiderFileReqBody @@ -1084,6 +1287,16 @@ func PutSpiderFile(c *gin.Context) { }) } +// @Summary Post spider dir 
+// @Description Post spider dir +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Param reqBody body routes.SpiderFileReqBody true "path" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/file [put] func PutSpiderDir(c *gin.Context) { spiderId := c.Param("id") var reqBody SpiderFileReqBody @@ -1124,6 +1337,16 @@ func PutSpiderDir(c *gin.Context) { }) } +// @Summary Delete spider file +// @Description Delete spider file +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Param reqBody body routes.SpiderFileReqBody true "path" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/file [delete] func DeleteSpiderFile(c *gin.Context) { spiderId := c.Param("id") var reqBody SpiderFileReqBody @@ -1154,6 +1377,16 @@ func DeleteSpiderFile(c *gin.Context) { }) } +// @Summary Rename spider file +// @Description Rename spider file +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Param reqBody body routes.SpiderFileReqBody true "path" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/file/rename [post] func RenameSpiderFile(c *gin.Context) { spiderId := c.Param("id") var reqBody SpiderFileReqBody @@ -1203,6 +1436,15 @@ func RenameSpiderFile(c *gin.Context) { // ======== Scrapy 部分 ======== +// @Summary Get scrapy spider file +// @Description Get scrapy spider file +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/scrapy/spiders [get] func GetSpiderScrapySpiders(c *gin.Context) { id := c.Param("id") @@ -1230,6 +1472,15 @@ func GetSpiderScrapySpiders(c *gin.Context) { }) } +// @Summary Put scrapy spider file +// @Description Put scrapy spider file +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/scrapy/spiders [put] func PutSpiderScrapySpiders(c *gin.Context) { type ReqBody struct { Name string `json:"name"` @@ -1267,6 +1518,15 @@ func PutSpiderScrapySpiders(c *gin.Context) { }) } +// @Summary Get scrapy spider settings +// @Description Get scrapy spider settings +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/scrapy/settings [get] func GetSpiderScrapySettings(c *gin.Context) { id := c.Param("id") @@ -1294,6 +1554,16 @@ func GetSpiderScrapySettings(c *gin.Context) { }) } +// @Summary Get scrapy spider file +// @Description Get scrapy spider file +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Param reqData body []entity.ScrapySettingParam true "req data" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/scrapy/settings [post] func 
PostSpiderScrapySettings(c *gin.Context) { id := c.Param("id") @@ -1325,6 +1595,15 @@ func PostSpiderScrapySettings(c *gin.Context) { }) } +// @Summary Get scrapy spider items +// @Description Get scrapy spider items +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/scrapy/items [get] func GetSpiderScrapyItems(c *gin.Context) { id := c.Param("id") @@ -1352,6 +1631,16 @@ func GetSpiderScrapyItems(c *gin.Context) { }) } +// @Summary Post scrapy spider items +// @Description Post scrapy spider items +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Param reqData body []entity.ScrapyItem true "req data" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/scrapy/items [post] func PostSpiderScrapyItems(c *gin.Context) { id := c.Param("id") @@ -1383,6 +1672,16 @@ func PostSpiderScrapyItems(c *gin.Context) { }) } + +// @Summary Get scrapy spider pipelines +// @Description Get scrapy spider pipelines +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/scrapy/pipelines [get] func GetSpiderScrapyPipelines(c *gin.Context) { id := c.Param("id") @@ -1410,6 +1709,15 @@ func GetSpiderScrapyPipelines(c *gin.Context) { }) } +// @Summary Get scrapy spider file path +// @Description Get scrapy spider file path +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/scrapy/spider/filepath [get] func GetSpiderScrapySpiderFilepath(c *gin.Context) { id := c.Param("id") @@ -1447,6 +1755,15 @@ func GetSpiderScrapySpiderFilepath(c *gin.Context) { // ======== Git 部分 ======== +// @Summary Post spider sync git +// @Description Post spider sync git +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/git/sync [post] func PostSpiderSyncGit(c *gin.Context) { id := c.Param("id") @@ -1472,6 +1789,15 @@ func PostSpiderSyncGit(c *gin.Context) { }) } +// @Summary Post spider reset git +// @Description Post spider reset git +// @Tags spider +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /spiders/{id}/git/reset [post] func PostSpiderResetGit(c *gin.Context) { id := c.Param("id") From 83357f3100df575d97c2ae944db07c717c20b21b Mon Sep 17 00:00:00 2001 From: hantmac Date: Sun, 3 May 2020 19:50:35 +0800 Subject: [PATCH 07/11] add swagger docs for 'tasks' --- backend/docs/docs.go | 535 +++++++++++++++++++++++++++++++++++++- backend/docs/swagger.json | 533 +++++++++++++++++++++++++++++++++++++ backend/docs/swagger.yaml | 355 +++++++++++++++++++++++++ backend/routes/task.go | 111 ++++++++ 4 files changed, 1533 insertions(+), 1 deletion(-) diff --git 
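[Editor's note] For readers unfamiliar with swaggo/swag, the comment blocks added to backend/routes/spider.go above are parsed straight out of the Go source and turned into the generated docs that follow. Below is a minimal, self-contained sketch of the same annotation pattern; the PingSpider handler and its /spiders/ping route are invented for illustration and are not part of this patch, and the @Success/@Failure lines simply mirror the loose "200 json string Response" style the patch itself uses.

package routes

import (
	"net/http"

	"github.com/gin-gonic/gin"
)

// @Summary Ping spider service
// @Description Illustration-only health check showing the annotation layout used throughout this patch
// @Tags spider
// @Produce json
// @Param Authorization header string true "Authorization token"
// @Success 200 json string Response
// @Failure 400 json string Response
// @Router /spiders/ping [get]
func PingSpider(c *gin.Context) {
	// swag only reads the comment block; the handler body is ordinary Gin code.
	c.JSON(http.StatusOK, gin.H{"status": "ok"})
}

After editing annotations like this, the docs.go/swagger.json/swagger.yaml files shown in the next diffs are typically regenerated rather than hand-edited (the swag CLI's `swag init` is the usual way, run from the backend directory; the exact invocation used by this project is not shown in the patch).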
a/backend/docs/docs.go b/backend/docs/docs.go index bd40f33a..0a3c620d 100644 --- a/backend/docs/docs.go +++ b/backend/docs/docs.go @@ -1,6 +1,6 @@ // GENERATED BY THE COMMAND ABOVE; DO NOT EDIT // This file was generated by swaggo/swag at -// 2020-05-01 23:10:59.173446 +0800 CST m=+0.074737526 +// 2020-05-03 19:45:55.093832 +0800 CST m=+0.085718059 package docs @@ -2172,6 +2172,505 @@ var doc = `{ } } }, + "/task/{id}": { + "delete": { + "description": "Delete task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Delete task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks": { + "get": { + "description": "Get task list", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "req data", + "name": "data", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.TaskListRequestData" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "Put task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Put task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete tasks", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Delete tasks", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}": { + "get": { + "description": "Get task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/cancel": { + "post": { + "description": "Cancel task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Cancel task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + 
"required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/error-log": { + "delete": { + "description": "Get task error log", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task error log", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/log": { + "delete": { + "description": "Get task log", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task log", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/restart": { + "post": { + "description": "Restart task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Restart task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/results": { + "get": { + "description": "Get task list", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "req data", + "name": "data", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.TaskResultsRequestData" + } + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/results/download": { + "get": { + "description": "Get task results", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task results", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + 
"description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks_by_status": { + "delete": { + "description": "Delete task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Delete task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task status", + "name": "status", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/version": { "get": { "description": "Get version", @@ -2764,6 +3263,40 @@ var doc = `{ "type": "string" } } + }, + "routes.TaskListRequestData": { + "type": "object", + "properties": { + "nodeId": { + "type": "string" + }, + "pageNum": { + "type": "integer" + }, + "pageSize": { + "type": "integer" + }, + "scheduleId": { + "type": "string" + }, + "spiderId": { + "type": "string" + }, + "status": { + "type": "string" + } + } + }, + "routes.TaskResultsRequestData": { + "type": "object", + "properties": { + "pageNum": { + "type": "integer" + }, + "pageSize": { + "type": "integer" + } + } } } }` diff --git a/backend/docs/swagger.json b/backend/docs/swagger.json index 8ed42bcd..f82d1ba8 100644 --- a/backend/docs/swagger.json +++ b/backend/docs/swagger.json @@ -2151,6 +2151,505 @@ } } }, + "/task/{id}": { + "delete": { + "description": "Delete task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Delete task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks": { + "get": { + "description": "Get task list", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "req data", + "name": "data", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.TaskListRequestData" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "Put task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Put task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete tasks", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Delete tasks", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], 
+ "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}": { + "get": { + "description": "Get task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/cancel": { + "post": { + "description": "Cancel task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Cancel task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/error-log": { + "delete": { + "description": "Get task error log", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task error log", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/log": { + "delete": { + "description": "Get task log", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task log", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/restart": { + "post": { + "description": "Restart task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Restart task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/results": { + "get": { + "description": "Get task list", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + 
"in": "header", + "required": true + }, + { + "description": "req data", + "name": "data", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.TaskResultsRequestData" + } + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks/{id}/results/download": { + "get": { + "description": "Get task results", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Get task results", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tasks_by_status": { + "delete": { + "description": "Delete task", + "produces": [ + "application/json" + ], + "tags": [ + "task" + ], + "summary": "Delete task", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "task status", + "name": "status", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/version": { "get": { "description": "Get version", @@ -2743,6 +3242,40 @@ "type": "string" } } + }, + "routes.TaskListRequestData": { + "type": "object", + "properties": { + "nodeId": { + "type": "string" + }, + "pageNum": { + "type": "integer" + }, + "pageSize": { + "type": "integer" + }, + "scheduleId": { + "type": "string" + }, + "spiderId": { + "type": "string" + }, + "status": { + "type": "string" + } + } + }, + "routes.TaskResultsRequestData": { + "type": "object", + "properties": { + "pageNum": { + "type": "integer" + }, + "pageSize": { + "type": "integer" + } + } } } } \ No newline at end of file diff --git a/backend/docs/swagger.yaml b/backend/docs/swagger.yaml index 0ced87e0..129aa4a1 100644 --- a/backend/docs/swagger.yaml +++ b/backend/docs/swagger.yaml @@ -381,6 +381,28 @@ definitions: path: type: string type: object + routes.TaskListRequestData: + properties: + nodeId: + type: string + pageNum: + type: integer + pageSize: + type: integer + scheduleId: + type: string + spiderId: + type: string + status: + type: string + type: object + routes.TaskResultsRequestData: + properties: + pageNum: + type: integer + pageSize: + type: integer + type: object info: contact: {} license: {} @@ -1822,6 +1844,339 @@ paths: summary: Upload spider by id tags: - spider + /task/{id}: + delete: + description: Delete task + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: task id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Delete task + tags: + - task + /tasks: + delete: + description: Delete 
tasks + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Delete tasks + tags: + - task + get: + description: Get task list + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: req data + in: body + name: data + required: true + schema: + $ref: '#/definitions/routes.TaskListRequestData' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get task list + tags: + - task + put: + description: Put task + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Put task + tags: + - task + /tasks/{id}: + get: + description: Get task + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: task id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get task + tags: + - task + /tasks/{id}/cancel: + post: + description: Cancel task + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: task id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Cancel task + tags: + - task + /tasks/{id}/error-log: + delete: + description: Get task error log + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: task id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get task error log + tags: + - task + /tasks/{id}/log: + delete: + description: Get task log + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: task id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get task log + tags: + - task + /tasks/{id}/restart: + post: + description: Restart task + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: task id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Restart task + tags: + - task + /tasks/{id}/results: + get: + description: Get task list + parameters: + - description: Authorization token + in: header + name: Authorization 
+ required: true + type: string + - description: req data + in: body + name: data + required: true + schema: + $ref: '#/definitions/routes.TaskResultsRequestData' + type: object + - description: task id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get task list + tags: + - task + /tasks/{id}/results/download: + get: + description: Get task results + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: task id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get task results + tags: + - task + /tasks_by_status: + delete: + description: Delete task + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: task status + in: query + name: status + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Delete task + tags: + - task /version: get: description: Get version diff --git a/backend/routes/task.go b/backend/routes/task.go index 2484b300..6f3f2182 100644 --- a/backend/routes/task.go +++ b/backend/routes/task.go @@ -26,6 +26,15 @@ type TaskResultsRequestData struct { PageSize int `form:"page_size"` } +// @Summary Get task list +// @Description Get task list +// @Tags task +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param data body routes.TaskListRequestData true "req data" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tasks [get] func GetTaskList(c *gin.Context) { // 绑定数据 data := TaskListRequestData{} @@ -81,6 +90,15 @@ func GetTaskList(c *gin.Context) { }) } +// @Summary Get task +// @Description Get task +// @Tags task +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "task id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tasks/{id} [get] func GetTask(c *gin.Context) { id := c.Param("id") @@ -92,6 +110,14 @@ func GetTask(c *gin.Context) { HandleSuccessData(c, result) } +// @Summary Put task +// @Description Put task +// @Tags task +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tasks [put] func PutTask(c *gin.Context) { type TaskRequestBody struct { SpiderId bson.ObjectId `json:"spider_id"` @@ -177,6 +203,15 @@ func PutTask(c *gin.Context) { HandleSuccessData(c, taskIds) } +// @Summary Delete task +// @Description Delete task +// @Tags task +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param status query string true "task status" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tasks_by_status [delete] func DeleteTaskByStatus(c *gin.Context) { status := c.Query("status") @@ -196,6 +231,15 @@ func DeleteTaskByStatus(c *gin.Context) { } // 删除多个任务 + +// @Summary Delete tasks +// @Description Delete tasks +// @Tags task +// @Produce json +// @Param Authorization 
header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tasks [delete] func DeleteSelectedTask(c *gin.Context) { ids := make(map[string][]string) if err := c.ShouldBindJSON(&ids); err != nil { @@ -217,6 +261,16 @@ func DeleteSelectedTask(c *gin.Context) { } // 删除单个任务 + +// @Summary Delete task +// @Description Delete task +// @Tags task +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "task id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /task/{id} [delete] func DeleteTask(c *gin.Context) { id := c.Param("id") @@ -233,6 +287,15 @@ func DeleteTask(c *gin.Context) { HandleSuccess(c) } +// @Summary Get task log +// @Description Get task log +// @Tags task +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "task id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tasks/{id}/log [delete] func GetTaskLog(c *gin.Context) { type RequestData struct { PageNum int `form:"page_num"` @@ -258,6 +321,15 @@ func GetTaskLog(c *gin.Context) { }) } +// @Summary Get task error log +// @Description Get task error log +// @Tags task +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "task id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tasks/{id}/error-log [delete] func GetTaskErrorLog(c *gin.Context) { id := c.Param("id") u := services.GetCurrentUser(c) @@ -273,6 +345,16 @@ func GetTaskErrorLog(c *gin.Context) { }) } +// @Summary Get task list +// @Description Get task list +// @Tags task +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param data body routes.TaskResultsRequestData true "req data" +// @Param id path string true "task id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tasks/{id}/results [get] func GetTaskResults(c *gin.Context) { id := c.Param("id") @@ -305,6 +387,16 @@ func GetTaskResults(c *gin.Context) { }) } + +// @Summary Get task results +// @Description Get task results +// @Tags task +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "task id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tasks/{id}/results/download [get] func DownloadTaskResultsCsv(c *gin.Context) { id := c.Param("id") @@ -374,6 +466,16 @@ func DownloadTaskResultsCsv(c *gin.Context) { c.Data(http.StatusOK, "text/csv", bytesBuffer.Bytes()) } + +// @Summary Cancel task +// @Description Cancel task +// @Tags task +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "task id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tasks/{id}/cancel [post] func CancelTask(c *gin.Context) { id := c.Param("id") @@ -384,6 +486,15 @@ func CancelTask(c *gin.Context) { HandleSuccess(c) } +// @Summary Restart task +// @Description Restart task +// @Tags task +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "task id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tasks/{id}/restart [post] func RestartTask(c *gin.Context) { id := c.Param("id") From 
eee81c57c6c7954dcf0f6d63b761bde9cb1c9671 Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 4 May 2020 07:48:02 +0800 Subject: [PATCH 08/11] add swagger for system --- backend/docs/docs.go | 427 +++++++++++++++++++++++++++++++++++++- backend/docs/swagger.json | 425 ++++++++++++++++++++++++++++++++++++- backend/docs/swagger.yaml | 287 ++++++++++++++++++++++++- backend/routes/system.go | 77 +++++++ 4 files changed, 1206 insertions(+), 10 deletions(-) diff --git a/backend/docs/docs.go b/backend/docs/docs.go index 0a3c620d..21b2c79a 100644 --- a/backend/docs/docs.go +++ b/backend/docs/docs.go @@ -1,6 +1,6 @@ // GENERATED BY THE COMMAND ABOVE; DO NOT EDIT // This file was generated by swaggo/swag at -// 2020-05-03 19:45:55.093832 +0800 CST m=+0.085718059 +// 2020-05-04 07:44:51.372978 +0800 CST m=+0.135701027 package docs @@ -186,6 +186,279 @@ var doc = `{ } } }, + "/nodes/{id}/deps": { + "get": { + "description": "Get dep list", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Get dep list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "language", + "name": "lang", + "in": "query", + "required": true + }, + { + "type": "string", + "description": "dep name", + "name": "dep_name", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/deps/install": { + "post": { + "description": "Install dep", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Install dep", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/deps/installed": { + "get": { + "description": "Get installed dep list", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Get installed dep list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "language", + "name": "lang", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/deps/uninstall": { + "post": { + "description": "Uninstall dep", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Uninstall dep", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { 
+ "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/langs": { + "get": { + "description": "Get language list", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Get language list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/langs/install": { + "post": { + "description": "Install language", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Install language", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/nodes/{id}/system": { "get": { "description": "Get system info", @@ -490,14 +763,14 @@ var doc = `{ }, "/schedules": { "get": { - "description": "Get schedule list", + "description": "Get spider list", "produces": [ "application/json" ], "tags": [ - "schedule" + "spider" ], - "summary": "Get schedule list", + "summary": "Get spider list", "parameters": [ { "type": "string", @@ -505,6 +778,54 @@ var doc = `{ "name": "Authorization", "in": "header", "required": true + }, + { + "type": "string", + "description": "page num", + "name": "page_num", + "in": "query" + }, + { + "type": "string", + "description": "page size", + "name": "page_size", + "in": "query" + }, + { + "type": "string", + "description": "keyword", + "name": "keyword", + "in": "query" + }, + { + "type": "string", + "description": "project_id", + "name": "project_id", + "in": "query" + }, + { + "type": "string", + "description": "type", + "name": "type", + "in": "query" + }, + { + "type": "string", + "description": "sort_key", + "name": "sort_key", + "in": "query" + }, + { + "type": "string", + "description": "sort_direction", + "name": "sort_direction", + "in": "query" + }, + { + "type": "string", + "description": "owner_type", + "name": "owner_type", + "in": "query" } ], "responses": { @@ -2172,6 +2493,104 @@ var doc = `{ } } }, + "/system/deps/": { + "get": { + "description": "Get all dep list", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Get all dep list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "language", + "name": "lang", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "dep name", + "name": "dep_nane", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/system/deps/{lang}/{dep_name}/json": { + "get": { + 
"description": "Get dep json", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Get dep json", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "language", + "name": "lang", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "dep name", + "name": "dep_name", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/task/{id}": { "delete": { "description": "Delete task", diff --git a/backend/docs/swagger.json b/backend/docs/swagger.json index f82d1ba8..41a0a3ef 100644 --- a/backend/docs/swagger.json +++ b/backend/docs/swagger.json @@ -165,6 +165,279 @@ } } }, + "/nodes/{id}/deps": { + "get": { + "description": "Get dep list", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Get dep list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "language", + "name": "lang", + "in": "query", + "required": true + }, + { + "type": "string", + "description": "dep name", + "name": "dep_name", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/deps/install": { + "post": { + "description": "Install dep", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Install dep", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/deps/installed": { + "get": { + "description": "Get installed dep list", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Get installed dep list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "language", + "name": "lang", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/deps/uninstall": { + "post": { + "description": "Uninstall dep", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Uninstall dep", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", 
+ "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/langs": { + "get": { + "description": "Get language list", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Get language list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/nodes/{id}/langs/install": { + "post": { + "description": "Install language", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Install language", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "node id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/nodes/{id}/system": { "get": { "description": "Get system info", @@ -469,14 +742,14 @@ }, "/schedules": { "get": { - "description": "Get schedule list", + "description": "Get spider list", "produces": [ "application/json" ], "tags": [ - "schedule" + "spider" ], - "summary": "Get schedule list", + "summary": "Get spider list", "parameters": [ { "type": "string", @@ -484,6 +757,54 @@ "name": "Authorization", "in": "header", "required": true + }, + { + "type": "string", + "description": "page num", + "name": "page_num", + "in": "query" + }, + { + "type": "string", + "description": "page size", + "name": "page_size", + "in": "query" + }, + { + "type": "string", + "description": "keyword", + "name": "keyword", + "in": "query" + }, + { + "type": "string", + "description": "project_id", + "name": "project_id", + "in": "query" + }, + { + "type": "string", + "description": "type", + "name": "type", + "in": "query" + }, + { + "type": "string", + "description": "sort_key", + "name": "sort_key", + "in": "query" + }, + { + "type": "string", + "description": "sort_direction", + "name": "sort_direction", + "in": "query" + }, + { + "type": "string", + "description": "owner_type", + "name": "owner_type", + "in": "query" } ], "responses": { @@ -2151,6 +2472,104 @@ } } }, + "/system/deps/": { + "get": { + "description": "Get all dep list", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Get all dep list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "language", + "name": "lang", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "dep name", + "name": "dep_nane", + "in": "query", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + 
"/system/deps/{lang}/{dep_name}/json": { + "get": { + "description": "Get dep json", + "produces": [ + "application/json" + ], + "tags": [ + "system" + ], + "summary": "Get dep json", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "language", + "name": "lang", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "dep name", + "name": "dep_name", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/task/{id}": { "delete": { "description": "Delete task", diff --git a/backend/docs/swagger.yaml b/backend/docs/swagger.yaml index 129aa4a1..e14439be 100644 --- a/backend/docs/swagger.yaml +++ b/backend/docs/swagger.yaml @@ -514,6 +514,189 @@ paths: summary: Post node tags: - node + /nodes/{id}/deps: + get: + description: Get dep list + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: node id + in: path + name: id + required: true + type: string + - description: language + in: query + name: lang + required: true + type: string + - description: dep name + in: query + name: dep_name + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get dep list + tags: + - system + /nodes/{id}/deps/install: + post: + description: Install dep + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: node id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Install dep + tags: + - system + /nodes/{id}/deps/installed: + get: + description: Get installed dep list + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: node id + in: path + name: id + required: true + type: string + - description: language + in: query + name: lang + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get installed dep list + tags: + - system + /nodes/{id}/deps/uninstall: + post: + description: Uninstall dep + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: node id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Uninstall dep + tags: + - system + /nodes/{id}/langs: + get: + description: Get language list + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: node id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + 
summary: Get language list + tags: + - system + /nodes/{id}/langs/install: + post: + description: Install language + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: node id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Install language + tags: + - system /nodes/{id}/system: get: description: Get system info @@ -718,13 +901,45 @@ paths: - project /schedules: get: - description: Get schedule list + description: Get spider list parameters: - description: Authorization token in: header name: Authorization required: true type: string + - description: page num + in: query + name: page_num + type: string + - description: page size + in: query + name: page_size + type: string + - description: keyword + in: query + name: keyword + type: string + - description: project_id + in: query + name: project_id + type: string + - description: type + in: query + name: type + type: string + - description: sort_key + in: query + name: sort_key + type: string + - description: sort_direction + in: query + name: sort_direction + type: string + - description: owner_type + in: query + name: owner_type + type: string produces: - application/json responses: @@ -736,9 +951,9 @@ paths: description: Bad Request schema: type: json - summary: Get schedule list + summary: Get spider list tags: - - schedule + - spider put: consumes: - application/json @@ -1844,6 +2059,72 @@ paths: summary: Upload spider by id tags: - spider + /system/deps/: + get: + description: Get all dep list + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: language + in: path + name: lang + required: true + type: string + - description: dep name + in: query + name: dep_nane + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get all dep list + tags: + - system + /system/deps/{lang}/{dep_name}/json: + get: + description: Get dep json + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: language + in: path + name: lang + required: true + type: string + - description: dep name + in: path + name: dep_name + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get dep json + tags: + - system /task/{id}: delete: description: Delete task diff --git a/backend/routes/system.go b/backend/routes/system.go index 2caff7e4..5f3ab7c5 100644 --- a/backend/routes/system.go +++ b/backend/routes/system.go @@ -11,6 +11,15 @@ import ( "strings" ) +// @Summary Get language list +// @Description Get language list +// @Tags system +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "node id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /nodes/{id}/langs [get] func GetLangList(c *gin.Context) { nodeId := c.Param("id") c.JSON(http.StatusOK, Response{ @@ -20,6 +29,17 @@ func GetLangList(c *gin.Context) { }) } +// @Summary Get dep list +// @Description Get dep list +// @Tags 
system +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "node id" +// @Param lang query string true "language" +// @Param dep_name query string true "dep name" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /nodes/{id}/deps [get] func GetDepList(c *gin.Context) { nodeId := c.Param("id") lang := c.Query("lang") @@ -52,6 +72,16 @@ func GetDepList(c *gin.Context) { }) } +// @Summary Get installed dep list +// @Description Get installed dep list +// @Tags system +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "node id" +// @Param lang query string true "language" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /nodes/{id}/deps/installed [get] func GetInstalledDepList(c *gin.Context) { nodeId := c.Param("id") lang := c.Query("lang") @@ -79,6 +109,16 @@ func GetInstalledDepList(c *gin.Context) { }) } +// @Summary Get all dep list +// @Description Get all dep list +// @Tags system +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param lang path string true "language" +// @Param dep_nane query string true "dep name" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /system/deps/:lang [get] func GetAllDepList(c *gin.Context) { lang := c.Param("lang") depName := c.Query("dep_name") @@ -121,6 +161,15 @@ func GetAllDepList(c *gin.Context) { }) } +// @Summary Install dep +// @Description Install dep +// @Tags system +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "node id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /nodes/{id}/deps/install [Post] func InstallDep(c *gin.Context) { type ReqBody struct { Lang string `json:"lang"` @@ -153,6 +202,15 @@ func InstallDep(c *gin.Context) { }) } +// @Summary Uninstall dep +// @Description Uninstall dep +// @Tags system +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "node id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /nodes/{id}/deps/uninstall [Post] func UninstallDep(c *gin.Context) { type ReqBody struct { Lang string `json:"lang"` @@ -184,6 +242,16 @@ func UninstallDep(c *gin.Context) { }) } +// @Summary Get dep json +// @Description Get dep json +// @Tags system +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param lang path string true "language" +// @Param dep_name path string true "dep name" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /system/deps/{lang}/{dep_name}/json [get] func GetDepJson(c *gin.Context) { depName := c.Param("dep_name") lang := c.Param("lang") @@ -209,6 +277,15 @@ func GetDepJson(c *gin.Context) { }) } +// @Summary Install language +// @Description Install language +// @Tags system +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "node id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /nodes/{id}/langs/install [Post] func InstallLang(c *gin.Context) { type ReqBody struct { Lang string `json:"lang"` From 422fbf7b0d1c7678e88f882c2d544694ef95b385 Mon Sep 17 00:00:00 2001 From: zkqiang Date: Mon, 4 May 2020 15:48:32 +0800 Subject: [PATCH 09/11] 
fixed bug --- backend/model/schedule.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/backend/model/schedule.go b/backend/model/schedule.go index ee4028af..5faba03b 100644 --- a/backend/model/schedule.go +++ b/backend/model/schedule.go @@ -76,10 +76,14 @@ func GetScheduleList(filter interface{}) ([]Schedule, error) { // 获取爬虫名称 spider, err := GetSpider(schedule.SpiderId) - if err != nil && err == mgo.ErrNotFound { + if err != nil { log.Errorf("get spider by id: %s, error: %s", schedule.SpiderId.Hex(), err.Error()) schedule.Status = constants.ScheduleStatusError - schedule.Message = constants.ScheduleStatusErrorNotFoundSpider + if err == mgo.ErrNotFound { + schedule.Message = constants.ScheduleStatusErrorNotFoundSpider + } else { + schedule.Message = err.Error() + } } else { schedule.SpiderName = spider.Name } From 46ea790cabf6348ffd2e925463c0847069ce6a26 Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 4 May 2020 17:54:41 +0800 Subject: [PATCH 10/11] - Add swagger api docs for `user`, `token`, and `variable` --- backend/docs/docs.go | 665 ++++++++++++++++++++++++++++++++++--- backend/docs/swagger.json | 663 +++++++++++++++++++++++++++++++++--- backend/docs/swagger.yaml | 444 +++++++++++++++++++++++-- backend/routes/token.go | 25 ++ backend/routes/user.go | 46 +++ backend/routes/variable.go | 40 +++ 6 files changed, 1745 insertions(+), 138 deletions(-) diff --git a/backend/docs/docs.go b/backend/docs/docs.go index 21b2c79a..f2bcfeb2 100644 --- a/backend/docs/docs.go +++ b/backend/docs/docs.go @@ -1,6 +1,6 @@ // GENERATED BY THE COMMAND ABOVE; DO NOT EDIT // This file was generated by swaggo/swag at -// 2020-05-04 07:44:51.372978 +0800 CST m=+0.135701027 +// 2020-05-04 17:53:13.810815 +0800 CST m=+0.105728870 package docs @@ -763,14 +763,14 @@ var doc = `{ }, "/schedules": { "get": { - "description": "Get spider list", + "description": "Get schedule list", "produces": [ "application/json" ], "tags": [ - "spider" + "schedule" ], - "summary": "Get spider list", + "summary": "Get schedule list", "parameters": [ { "type": "string", @@ -778,54 +778,6 @@ var doc = `{ "name": "Authorization", "in": "header", "required": true - }, - { - "type": "string", - "description": "page num", - "name": "page_num", - "in": "query" - }, - { - "type": "string", - "description": "page size", - "name": "page_size", - "in": "query" - }, - { - "type": "string", - "description": "keyword", - "name": "keyword", - "in": "query" - }, - { - "type": "string", - "description": "project_id", - "name": "project_id", - "in": "query" - }, - { - "type": "string", - "description": "type", - "name": "type", - "in": "query" - }, - { - "type": "string", - "description": "sort_key", - "name": "sort_key", - "in": "query" - }, - { - "type": "string", - "description": "sort_direction", - "name": "sort_direction", - "in": "query" - }, - { - "type": "string", - "description": "owner_type", - "name": "owner_type", - "in": "query" } ], "responses": { @@ -3090,6 +3042,508 @@ var doc = `{ } } }, + "/tokens": { + "get": { + "description": "token", + "produces": [ + "application/json" + ], + "tags": [ + "token" + ], + "summary": "Get token", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "token", + 
"produces": [ + "application/json" + ], + "tags": [ + "token" + ], + "summary": "Put token", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tokens/{id}": { + "delete": { + "description": "Delete token", + "produces": [ + "application/json" + ], + "tags": [ + "token" + ], + "summary": "Delete token", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "token id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/users": { + "get": { + "description": "Get user list", + "produces": [ + "application/json" + ], + "tags": [ + "token" + ], + "summary": "Get user list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "data body", + "name": "data", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.UserListRequestData" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "Put user", + "produces": [ + "application/json" + ], + "tags": [ + "user" + ], + "summary": "Put user", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "reqData body", + "name": "reqData", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.UserRequestData" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/users/{id}": { + "get": { + "description": "user", + "produces": [ + "application/json" + ], + "tags": [ + "user" + ], + "summary": "Get user", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "user id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Post user", + "produces": [ + "application/json" + ], + "tags": [ + "user" + ], + "summary": "Post user", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "user body", + "name": "item", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.User" + } + }, + { + "type": "string", + "description": "user id", + "name": "id", + "in": "path", + "required": true + } + ], + 
"responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete user", + "produces": [ + "application/json" + ], + "tags": [ + "user" + ], + "summary": "Delete user", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "user id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/variable": { + "put": { + "description": "Put variable", + "produces": [ + "application/json" + ], + "tags": [ + "variable" + ], + "summary": "Put variable", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "reqData body", + "name": "variable", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Variable" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/variable/{id}": { + "post": { + "description": "Post variable", + "produces": [ + "application/json" + ], + "tags": [ + "variable" + ], + "summary": "Post variable", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "reqData body", + "name": "variable", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Variable" + } + }, + { + "type": "string", + "description": "variable id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete variable", + "produces": [ + "application/json" + ], + "tags": [ + "variable" + ], + "summary": "Delete variable", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "variable id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/variables": { + "get": { + "description": "Get variable list", + "produces": [ + "application/json" + ], + "tags": [ + "variable" + ], + "summary": "Get variable list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/version": { "get": { "description": "Get version", @@ -3669,6 +4123,85 @@ var doc = `{ } } }, + "model.User": { + "type": "object", + "properties": { + "_id": { + "type": "string" 
+ }, + "create_ts": { + "type": "string" + }, + "email": { + "type": "string" + }, + "password": { + "type": "string" + }, + "role": { + "type": "string" + }, + "setting": { + "type": "object", + "$ref": "#/definitions/model.UserSetting" + }, + "update_ts": { + "type": "string" + }, + "user_id": { + "type": "string" + }, + "username": { + "type": "string" + } + } + }, + "model.UserSetting": { + "type": "object", + "properties": { + "ding_talk_robot_webhook": { + "type": "string" + }, + "enabled_notifications": { + "type": "array", + "items": { + "type": "string" + } + }, + "error_regex_pattern": { + "type": "string" + }, + "log_expire_duration": { + "type": "integer" + }, + "max_error_log": { + "type": "integer" + }, + "notification_trigger": { + "type": "string" + }, + "wechat_robot_webhook": { + "type": "string" + } + } + }, + "model.Variable": { + "type": "object", + "properties": { + "_id": { + "type": "string" + }, + "key": { + "type": "string" + }, + "remark": { + "type": "string" + }, + "value": { + "type": "string" + } + } + }, "routes.SpiderFileReqBody": { "type": "object", "properties": { @@ -3716,6 +4249,34 @@ var doc = `{ "type": "integer" } } + }, + "routes.UserListRequestData": { + "type": "object", + "properties": { + "pageNum": { + "type": "integer" + }, + "pageSize": { + "type": "integer" + } + } + }, + "routes.UserRequestData": { + "type": "object", + "properties": { + "email": { + "type": "string" + }, + "password": { + "type": "string" + }, + "role": { + "type": "string" + }, + "username": { + "type": "string" + } + } } } }` diff --git a/backend/docs/swagger.json b/backend/docs/swagger.json index 41a0a3ef..bdfc7a72 100644 --- a/backend/docs/swagger.json +++ b/backend/docs/swagger.json @@ -742,14 +742,14 @@ }, "/schedules": { "get": { - "description": "Get spider list", + "description": "Get schedule list", "produces": [ "application/json" ], "tags": [ - "spider" + "schedule" ], - "summary": "Get spider list", + "summary": "Get schedule list", "parameters": [ { "type": "string", @@ -757,54 +757,6 @@ "name": "Authorization", "in": "header", "required": true - }, - { - "type": "string", - "description": "page num", - "name": "page_num", - "in": "query" - }, - { - "type": "string", - "description": "page size", - "name": "page_size", - "in": "query" - }, - { - "type": "string", - "description": "keyword", - "name": "keyword", - "in": "query" - }, - { - "type": "string", - "description": "project_id", - "name": "project_id", - "in": "query" - }, - { - "type": "string", - "description": "type", - "name": "type", - "in": "query" - }, - { - "type": "string", - "description": "sort_key", - "name": "sort_key", - "in": "query" - }, - { - "type": "string", - "description": "sort_direction", - "name": "sort_direction", - "in": "query" - }, - { - "type": "string", - "description": "owner_type", - "name": "owner_type", - "in": "query" } ], "responses": { @@ -3069,6 +3021,508 @@ } } }, + "/tokens": { + "get": { + "description": "token", + "produces": [ + "application/json" + ], + "tags": [ + "token" + ], + "summary": "Get token", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "token", + "produces": [ + "application/json" + ], + "tags": [ + "token" + ], + "summary": "Put 
token", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/tokens/{id}": { + "delete": { + "description": "Delete token", + "produces": [ + "application/json" + ], + "tags": [ + "token" + ], + "summary": "Delete token", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "token id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/users": { + "get": { + "description": "Get user list", + "produces": [ + "application/json" + ], + "tags": [ + "token" + ], + "summary": "Get user list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "data body", + "name": "data", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.UserListRequestData" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "put": { + "description": "Put user", + "produces": [ + "application/json" + ], + "tags": [ + "user" + ], + "summary": "Put user", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "reqData body", + "name": "reqData", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/routes.UserRequestData" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/users/{id}": { + "get": { + "description": "user", + "produces": [ + "application/json" + ], + "tags": [ + "user" + ], + "summary": "Get user", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "user id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Post user", + "produces": [ + "application/json" + ], + "tags": [ + "user" + ], + "summary": "Post user", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "user body", + "name": "item", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.User" + } + }, + { + "type": "string", + "description": "user id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, 
+ "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete user", + "produces": [ + "application/json" + ], + "tags": [ + "user" + ], + "summary": "Delete user", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "user id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/variable": { + "put": { + "description": "Put variable", + "produces": [ + "application/json" + ], + "tags": [ + "variable" + ], + "summary": "Put variable", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "reqData body", + "name": "variable", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Variable" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/variable/{id}": { + "post": { + "description": "Post variable", + "produces": [ + "application/json" + ], + "tags": [ + "variable" + ], + "summary": "Post variable", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "reqData body", + "name": "variable", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Variable" + } + }, + { + "type": "string", + "description": "variable id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + }, + "delete": { + "description": "Delete variable", + "produces": [ + "application/json" + ], + "tags": [ + "variable" + ], + "summary": "Delete variable", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "variable id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/variables": { + "get": { + "description": "Get variable list", + "produces": [ + "application/json" + ], + "tags": [ + "variable" + ], + "summary": "Get variable list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/version": { "get": { "description": "Get version", @@ -3648,6 +4102,85 @@ } } }, + "model.User": { + "type": "object", + "properties": { + "_id": { + "type": "string" + }, + "create_ts": { + "type": "string" + }, + "email": { + "type": "string" + }, + "password": { + 
"type": "string" + }, + "role": { + "type": "string" + }, + "setting": { + "type": "object", + "$ref": "#/definitions/model.UserSetting" + }, + "update_ts": { + "type": "string" + }, + "user_id": { + "type": "string" + }, + "username": { + "type": "string" + } + } + }, + "model.UserSetting": { + "type": "object", + "properties": { + "ding_talk_robot_webhook": { + "type": "string" + }, + "enabled_notifications": { + "type": "array", + "items": { + "type": "string" + } + }, + "error_regex_pattern": { + "type": "string" + }, + "log_expire_duration": { + "type": "integer" + }, + "max_error_log": { + "type": "integer" + }, + "notification_trigger": { + "type": "string" + }, + "wechat_robot_webhook": { + "type": "string" + } + } + }, + "model.Variable": { + "type": "object", + "properties": { + "_id": { + "type": "string" + }, + "key": { + "type": "string" + }, + "remark": { + "type": "string" + }, + "value": { + "type": "string" + } + } + }, "routes.SpiderFileReqBody": { "type": "object", "properties": { @@ -3695,6 +4228,34 @@ "type": "integer" } } + }, + "routes.UserListRequestData": { + "type": "object", + "properties": { + "pageNum": { + "type": "integer" + }, + "pageSize": { + "type": "integer" + } + } + }, + "routes.UserRequestData": { + "type": "object", + "properties": { + "email": { + "type": "string" + }, + "password": { + "type": "string" + }, + "role": { + "type": "string" + }, + "username": { + "type": "string" + } + } } } } \ No newline at end of file diff --git a/backend/docs/swagger.yaml b/backend/docs/swagger.yaml index e14439be..16d6aa18 100644 --- a/backend/docs/swagger.yaml +++ b/backend/docs/swagger.yaml @@ -372,6 +372,58 @@ definitions: wait_duration: type: number type: object + model.User: + properties: + _id: + type: string + create_ts: + type: string + email: + type: string + password: + type: string + role: + type: string + setting: + $ref: '#/definitions/model.UserSetting' + type: object + update_ts: + type: string + user_id: + type: string + username: + type: string + type: object + model.UserSetting: + properties: + ding_talk_robot_webhook: + type: string + enabled_notifications: + items: + type: string + type: array + error_regex_pattern: + type: string + log_expire_duration: + type: integer + max_error_log: + type: integer + notification_trigger: + type: string + wechat_robot_webhook: + type: string + type: object + model.Variable: + properties: + _id: + type: string + key: + type: string + remark: + type: string + value: + type: string + type: object routes.SpiderFileReqBody: properties: content: @@ -403,6 +455,24 @@ definitions: pageSize: type: integer type: object + routes.UserListRequestData: + properties: + pageNum: + type: integer + pageSize: + type: integer + type: object + routes.UserRequestData: + properties: + email: + type: string + password: + type: string + role: + type: string + username: + type: string + type: object info: contact: {} license: {} @@ -901,45 +971,13 @@ paths: - project /schedules: get: - description: Get spider list + description: Get schedule list parameters: - description: Authorization token in: header name: Authorization required: true type: string - - description: page num - in: query - name: page_num - type: string - - description: page size - in: query - name: page_size - type: string - - description: keyword - in: query - name: keyword - type: string - - description: project_id - in: query - name: project_id - type: string - - description: type - in: query - name: type - type: string - - description: sort_key - in: query - 
name: sort_key - type: string - - description: sort_direction - in: query - name: sort_direction - type: string - - description: owner_type - in: query - name: owner_type - type: string produces: - application/json responses: @@ -951,9 +989,9 @@ paths: description: Bad Request schema: type: json - summary: Get spider list + summary: Get schedule list tags: - - spider + - schedule put: consumes: - application/json @@ -2458,6 +2496,342 @@ paths: summary: Delete task tags: - task + /tokens: + get: + description: token + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get token + tags: + - token + put: + description: token + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Put token + tags: + - token + /tokens/{id}: + delete: + description: Delete token + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: token id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Delete token + tags: + - token + /users: + get: + description: Get user list + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: data body + in: body + name: data + required: true + schema: + $ref: '#/definitions/routes.UserListRequestData' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get user list + tags: + - token + put: + description: Put user + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: reqData body + in: body + name: reqData + required: true + schema: + $ref: '#/definitions/routes.UserRequestData' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Put user + tags: + - user + /users/{id}: + delete: + description: Delete user + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: user id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Delete user + tags: + - user + get: + description: user + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: user id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get user + tags: + - user + post: + description: Post user + parameters: + - description: Authorization 
token + in: header + name: Authorization + required: true + type: string + - description: user body + in: body + name: item + required: true + schema: + $ref: '#/definitions/model.User' + type: object + - description: user id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Post user + tags: + - user + /variable: + put: + description: Put variable + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: reqData body + in: body + name: variable + required: true + schema: + $ref: '#/definitions/model.Variable' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Put variable + tags: + - variable + /variable/{id}: + delete: + description: Delete variable + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: variable id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Delete variable + tags: + - variable + post: + description: Post variable + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: reqData body + in: body + name: variable + required: true + schema: + $ref: '#/definitions/model.Variable' + type: object + - description: variable id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Post variable + tags: + - variable + /variables: + get: + description: Get variable list + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get variable list + tags: + - variable /version: get: description: Get version diff --git a/backend/routes/token.go b/backend/routes/token.go index 57ad5990..8ad25aed 100644 --- a/backend/routes/token.go +++ b/backend/routes/token.go @@ -9,6 +9,14 @@ import ( "time" ) +// @Summary Get token +// @Description token +// @Tags token +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tokens [get] func GetTokens(c *gin.Context) { u := services.GetCurrentUser(c) @@ -25,6 +33,14 @@ func GetTokens(c *gin.Context) { }) } +// @Summary Put token +// @Description token +// @Tags token +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tokens [put] func PutToken(c *gin.Context) { u := services.GetCurrentUser(c) @@ -53,6 +69,15 @@ func PutToken(c *gin.Context) { }) } +// @Summary Delete token +// @Description Delete token +// @Tags token +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true 
"token id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /tokens/{id} [delete] func DeleteToken(c *gin.Context) { id := c.Param("id") diff --git a/backend/routes/user.go b/backend/routes/user.go index 56a8cb2c..a195cd75 100644 --- a/backend/routes/user.go +++ b/backend/routes/user.go @@ -25,6 +25,15 @@ type UserRequestData struct { Email string `json:"email"` } +// @Summary Get user +// @Description user +// @Tags user +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "user id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /users/{id} [get] func GetUser(c *gin.Context) { id := c.Param("id") @@ -41,6 +50,15 @@ func GetUser(c *gin.Context) { }) } +// @Summary Get user list +// @Description Get user list +// @Tags token +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param data body routes.UserListRequestData true "data body" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /users [get] func GetUserList(c *gin.Context) { // 绑定数据 data := UserListRequestData{} @@ -82,6 +100,15 @@ func GetUserList(c *gin.Context) { }) } +// @Summary Put user +// @Description Put user +// @Tags user +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param reqData body routes.UserRequestData true "reqData body" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /users [put] func PutUser(c *gin.Context) { // 绑定请求数据 var reqData UserRequestData @@ -115,6 +142,16 @@ func PutUser(c *gin.Context) { }) } +// @Summary Post user +// @Description Post user +// @Tags user +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param item body model.User true "user body" +// @Param id path string true "user id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /users/{id} [post] func PostUser(c *gin.Context) { id := c.Param("id") @@ -143,6 +180,15 @@ func PostUser(c *gin.Context) { }) } +// @Summary Delete user +// @Description Delete user +// @Tags user +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "user id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /users/{id} [delete] func DeleteUser(c *gin.Context) { id := c.Param("id") diff --git a/backend/routes/variable.go b/backend/routes/variable.go index c35c16ab..c55652ca 100644 --- a/backend/routes/variable.go +++ b/backend/routes/variable.go @@ -8,6 +8,16 @@ import ( ) // 新增 + +// @Summary Put variable +// @Description Put variable +// @Tags variable +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param variable body model.Variable true "reqData body" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /variable [put] func PutVariable(c *gin.Context) { var variable model.Variable if err := c.ShouldBindJSON(&variable); err != nil { @@ -22,6 +32,17 @@ func PutVariable(c *gin.Context) { } // 修改 + +// @Summary Post variable +// @Description Post variable +// @Tags variable +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param variable body model.Variable true "reqData body" +// @Param id path string true "variable id" +// @Success 200 json string Response +// @Failure 400 json string 
Response +// @Router /variable/{id} [post] func PostVariable(c *gin.Context) { var id = c.Param("id") var variable model.Variable @@ -38,6 +59,16 @@ func PostVariable(c *gin.Context) { } // 删除 + +// @Summary Delete variable +// @Description Delete variable +// @Tags variable +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "variable id" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /variable/{id} [delete] func DeleteVariable(c *gin.Context) { var idStr = c.Param("id") var id = bson.ObjectIdHex(idStr) @@ -56,6 +87,15 @@ func DeleteVariable(c *gin.Context) { } // 列表 + +// @Summary Get variable list +// @Description Get variable list +// @Tags variable +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /variables [get] func GetVariableList(c *gin.Context) { list := model.GetVariableList() HandleSuccessData(c, list) From 1537aa46dbefa748796058d8e1590252121d619a Mon Sep 17 00:00:00 2001 From: hantmac Date: Tue, 5 May 2020 11:10:56 +0800 Subject: [PATCH 11/11] add swagger for 'config spider','docs', 'file', 'version' --- backend/docs/docs.go | 489 +++++++++++++++++++++++++++++++- backend/docs/swagger.json | 487 ++++++++++++++++++++++++++++++- backend/docs/swagger.yaml | 327 ++++++++++++++++++++- backend/routes/config_spider.go | 65 +++++ backend/routes/doc.go | 8 + backend/routes/file.go | 8 + backend/routes/stats.go | 8 + backend/routes/version.go | 8 + 8 files changed, 1387 insertions(+), 13 deletions(-) diff --git a/backend/docs/docs.go b/backend/docs/docs.go index f2bcfeb2..feaa3214 100644 --- a/backend/docs/docs.go +++ b/backend/docs/docs.go @@ -1,6 +1,6 @@ // GENERATED BY THE COMMAND ABOVE; DO NOT EDIT // This file was generated by swaggo/swag at -// 2020-05-04 17:53:13.810815 +0800 CST m=+0.105728870 +// 2020-05-05 11:09:10.499886 +0800 CST m=+0.084916029 package docs @@ -26,6 +26,367 @@ var doc = `{ "host": "{{.Host}}", "basePath": "{{.BasePath}}", "paths": { + "/config_spiders": { + "put": { + "description": "Put config spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Put config spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "spider item", + "name": "spider", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Spider" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/config_spiders/{id}/config": { + "get": { + "description": "Get config spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Get config spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } 
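The config spider endpoints in this hunk manage Spiderfile-based configurable spiders; the spiderfile route accepts the file content as a JSON body (the handler added later in this patch binds a {"content": ...} field). A minimal client-side sketch of pushing a Spiderfile to POST /config_spiders/{id}/spiderfile follows; the base URL, the spider id and the YAML payload are placeholders, not values the patch defines.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Placeholder Spiderfile content; the real schema is defined by the
	// project's Spiderfile templates, not by this sketch.
	spiderfile := "name: example_config\nstart_url: https://example.com\n"

	payload, err := json.Marshal(map[string]string{"content": spiderfile})
	if err != nil {
		panic(err)
	}

	base := "http://localhost:8000"  // assumed backend address
	id := "5eb0aaaaaaaaaaaaaaaaaaaa" // placeholder spider id

	req, err := http.NewRequest(http.MethodPost, base+"/config_spiders/"+id+"/spiderfile", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "<jwt token>")
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.StatusCode) // 200 on success, 500 on error per the spec in this hunk
}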
+ } + } + }, + "post": { + "description": "Post config spider config", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Post config spider config", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "spider item", + "name": "spider", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Spider" + } + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/config_spiders/{id}/spiderfile": { + "post": { + "description": "Post config spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Post config spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/config_spiders/{id}/upload": { + "post": { + "description": "Upload config spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Upload config spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "spider item", + "name": "spider", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Spider" + } + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/config_spiders_templates": { + "get": { + "description": "Get config spider template list", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Get config spider template list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/docs": { + "get": { + "description": "Get docs", + "produces": [ + "application/json" + ], + "tags": [ + "docs" + ], + "summary": "Get docs", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": 
"json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/file": { + "get": { + "description": "Get file", + "produces": [ + "application/json" + ], + "tags": [ + "file" + ], + "summary": "Get file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/nodes": { "get": { "description": "Get nodes", @@ -761,16 +1122,16 @@ var doc = `{ } } }, - "/schedules": { + "/releases/latest": { "get": { - "description": "Get schedule list", + "description": "Get latest release", "produces": [ "application/json" ], "tags": [ - "schedule" + "version" ], - "summary": "Get schedule list", + "summary": "Get latest release", "parameters": [ { "type": "string", @@ -794,6 +1155,89 @@ var doc = `{ } } } + } + }, + "/schedules": { + "get": { + "description": "Get spider list", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "page num", + "name": "page_num", + "in": "query" + }, + { + "type": "string", + "description": "page size", + "name": "page_size", + "in": "query" + }, + { + "type": "string", + "description": "keyword", + "name": "keyword", + "in": "query" + }, + { + "type": "string", + "description": "project_id", + "name": "project_id", + "in": "query" + }, + { + "type": "string", + "description": "type", + "name": "type", + "in": "query" + }, + { + "type": "string", + "description": "sort_key", + "name": "sort_key", + "in": "query" + }, + { + "type": "string", + "description": "sort_direction", + "name": "sort_direction", + "in": "query" + }, + { + "type": "string", + "description": "owner_type", + "name": "owner_type", + "in": "query" + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } }, "put": { "description": "Put schedule", @@ -2445,6 +2889,41 @@ var doc = `{ } } }, + "/stats/home": { + "get": { + "description": "Get home stats", + "produces": [ + "application/json" + ], + "tags": [ + "version" + ], + "summary": "Get home stats", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/system/deps/": { "get": { "description": "Get all dep list", diff --git a/backend/docs/swagger.json b/backend/docs/swagger.json index bdfc7a72..47986662 100644 --- a/backend/docs/swagger.json +++ b/backend/docs/swagger.json @@ -5,6 +5,367 @@ "license": {} }, "paths": { + "/config_spiders": { + "put": { + "description": "Put config spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Put config spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + 
"required": true + }, + { + "description": "spider item", + "name": "spider", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Spider" + } + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/config_spiders/{id}/config": { + "get": { + "description": "Get config spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Get config spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + }, + "post": { + "description": "Post config spider config", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Post config spider config", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "spider item", + "name": "spider", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Spider" + } + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/config_spiders/{id}/spiderfile": { + "post": { + "description": "Post config spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Post config spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/config_spiders/{id}/upload": { + "post": { + "description": "Upload config spider", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Upload config spider", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "description": "spider item", + "name": "spider", + "in": "body", + "required": true, + "schema": { + "type": "object", + "$ref": "#/definitions/model.Spider" + } + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + 
"/config_spiders_templates": { + "get": { + "description": "Get config spider template list", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "config spider" + ], + "summary": "Get config spider template list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "spider id", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "type": "json" + } + } + } + } + }, + "/docs": { + "get": { + "description": "Get docs", + "produces": [ + "application/json" + ], + "tags": [ + "docs" + ], + "summary": "Get docs", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, + "/file": { + "get": { + "description": "Get file", + "produces": [ + "application/json" + ], + "tags": [ + "file" + ], + "summary": "Get file", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/nodes": { "get": { "description": "Get nodes", @@ -740,16 +1101,16 @@ } } }, - "/schedules": { + "/releases/latest": { "get": { - "description": "Get schedule list", + "description": "Get latest release", "produces": [ "application/json" ], "tags": [ - "schedule" + "version" ], - "summary": "Get schedule list", + "summary": "Get latest release", "parameters": [ { "type": "string", @@ -773,6 +1134,89 @@ } } } + } + }, + "/schedules": { + "get": { + "description": "Get spider list", + "produces": [ + "application/json" + ], + "tags": [ + "spider" + ], + "summary": "Get spider list", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + }, + { + "type": "string", + "description": "page num", + "name": "page_num", + "in": "query" + }, + { + "type": "string", + "description": "page size", + "name": "page_size", + "in": "query" + }, + { + "type": "string", + "description": "keyword", + "name": "keyword", + "in": "query" + }, + { + "type": "string", + "description": "project_id", + "name": "project_id", + "in": "query" + }, + { + "type": "string", + "description": "type", + "name": "type", + "in": "query" + }, + { + "type": "string", + "description": "sort_key", + "name": "sort_key", + "in": "query" + }, + { + "type": "string", + "description": "sort_direction", + "name": "sort_direction", + "in": "query" + }, + { + "type": "string", + "description": "owner_type", + "name": "owner_type", + "in": "query" + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } }, "put": { "description": "Put schedule", @@ -2424,6 +2868,41 @@ } } }, + "/stats/home": { + "get": { + "description": "Get home stats", + "produces": [ + 
"application/json" + ], + "tags": [ + "version" + ], + "summary": "Get home stats", + "parameters": [ + { + "type": "string", + "description": "Authorization token", + "name": "Authorization", + "in": "header", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "json" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "type": "json" + } + } + } + } + }, "/system/deps/": { "get": { "description": "Get all dep list", diff --git a/backend/docs/swagger.yaml b/backend/docs/swagger.yaml index 16d6aa18..26598b52 100644 --- a/backend/docs/swagger.yaml +++ b/backend/docs/swagger.yaml @@ -477,6 +477,247 @@ info: contact: {} license: {} paths: + /config_spiders: + put: + consumes: + - application/json + description: Put config spider + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider item + in: body + name: spider + required: true + schema: + $ref: '#/definitions/model.Spider' + type: object + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Put config spider + tags: + - config spider + /config_spiders/{id}/config: + get: + consumes: + - application/json + description: Get config spider + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Get config spider + tags: + - config spider + post: + consumes: + - application/json + description: Post config spider config + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider item + in: body + name: spider + required: true + schema: + $ref: '#/definitions/model.Spider' + type: object + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Post config spider config + tags: + - config spider + /config_spiders/{id}/spiderfile: + post: + consumes: + - application/json + description: Post config spider + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Post config spider + tags: + - config spider + /config_spiders/{id}/upload: + post: + consumes: + - application/json + description: Upload config spider + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider item + in: body + name: spider + required: true + schema: + $ref: '#/definitions/model.Spider' + type: object + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + 
schema: + type: json + summary: Upload config spider + tags: + - config spider + /config_spiders_templates: + get: + consumes: + - application/json + description: Get config spider template list + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: spider id + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "500": + description: Internal Server Error + schema: + type: json + summary: Get config spider template list + tags: + - config spider + /docs: + get: + description: Get docs + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get docs + tags: + - docs + /file: + get: + description: Get file + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get file + tags: + - file /nodes: get: description: Get nodes @@ -969,9 +1210,9 @@ paths: summary: Get project tags tags: - project - /schedules: + /releases/latest: get: - description: Get schedule list + description: Get latest release parameters: - description: Authorization token in: header @@ -989,9 +1230,64 @@ paths: description: Bad Request schema: type: json - summary: Get schedule list + summary: Get latest release tags: - - schedule + - version + /schedules: + get: + description: Get spider list + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + - description: page num + in: query + name: page_num + type: string + - description: page size + in: query + name: page_size + type: string + - description: keyword + in: query + name: keyword + type: string + - description: project_id + in: query + name: project_id + type: string + - description: type + in: query + name: type + type: string + - description: sort_key + in: query + name: sort_key + type: string + - description: sort_direction + in: query + name: sort_direction + type: string + - description: owner_type + in: query + name: owner_type + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get spider list + tags: + - spider put: consumes: - application/json @@ -2097,6 +2393,29 @@ paths: summary: Upload spider by id tags: - spider + /stats/home: + get: + description: Get home stats + parameters: + - description: Authorization token + in: header + name: Authorization + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + type: json + "400": + description: Bad Request + schema: + type: json + summary: Get home stats + tags: + - version /system/deps/: get: description: Get all dep list diff --git a/backend/routes/config_spider.go b/backend/routes/config_spider.go index 52b61ff5..bb66c4a9 100644 --- a/backend/routes/config_spider.go +++ b/backend/routes/config_spider.go @@ -20,6 +20,17 @@ import ( ) // 添加可配置爬虫 + +// @Summary Put config spider +// @Description Put config spider +// @Tags config spider 
+// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param spider body model.Spider true "spider item" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /config_spiders [put] func PutConfigSpider(c *gin.Context) { var spider model.Spider if err := c.ShouldBindJSON(&spider); err != nil { @@ -104,6 +115,18 @@ func PostConfigSpider(c *gin.Context) { } // 上传可配置爬虫Spiderfile + +// @Summary Upload config spider +// @Description Upload config spider +// @Tags config spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param spider body model.Spider true "spider item" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /config_spiders/{id}/upload [post] func UploadConfigSpider(c *gin.Context) { id := c.Param("id") @@ -190,6 +213,16 @@ func UploadConfigSpider(c *gin.Context) { }) } +// @Summary Post config spider +// @Description Post config spider +// @Tags config spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /config_spiders/{id}/spiderfile [post] func PostConfigSpiderSpiderfile(c *gin.Context) { type Body struct { Content string `json:"content"` @@ -249,6 +282,17 @@ func PostConfigSpiderSpiderfile(c *gin.Context) { }) } +// @Summary Post config spider config +// @Description Post config spider config +// @Tags config spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param spider body model.Spider true "spider item" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /config_spiders/{id}/config [post] func PostConfigSpiderConfig(c *gin.Context) { id := c.Param("id") @@ -296,6 +340,16 @@ func PostConfigSpiderConfig(c *gin.Context) { }) } +// @Summary Get config spider +// @Description Get config spider +// @Tags config spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /config_spiders/{id}/config [get] func GetConfigSpiderConfig(c *gin.Context) { id := c.Param("id") @@ -319,6 +373,17 @@ func GetConfigSpiderConfig(c *gin.Context) { } // 获取模版名称列表 + +// @Summary Get config spider template list +// @Description Get config spider template list +// @Tags config spider +// @Accept json +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Param id path string true "spider id" +// @Success 200 json string Response +// @Failure 500 json string Response +// @Router /config_spiders_templates [get] func GetConfigSpiderTemplateList(c *gin.Context) { var data []string for _, fInfo := range utils.ListDir("./template/spiderfile") { diff --git a/backend/routes/doc.go b/backend/routes/doc.go index 6426cdcc..f38c5431 100644 --- a/backend/routes/doc.go +++ b/backend/routes/doc.go @@ -8,6 +8,14 @@ import ( "runtime/debug" ) +// @Summary Get docs +// @Description Get docs +// @Tags docs +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// 
@Router /docs [get] func GetDocs(c *gin.Context) { type ResData struct { String string `json:"string"` diff --git a/backend/routes/file.go b/backend/routes/file.go index eaf43ab5..4c9f8576 100644 --- a/backend/routes/file.go +++ b/backend/routes/file.go @@ -7,6 +7,14 @@ import ( "net/http" ) +// @Summary Get file +// @Description Get file +// @Tags file +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /file [get] func GetFile(c *gin.Context) { path := c.Query("path") fileBytes, err := ioutil.ReadFile(path) diff --git a/backend/routes/stats.go b/backend/routes/stats.go index 46c1afc8..02e0993e 100644 --- a/backend/routes/stats.go +++ b/backend/routes/stats.go @@ -9,6 +9,14 @@ import ( "net/http" ) +// @Summary Get home stats +// @Description Get home stats +// @Tags version +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /stats/home [get] func GetHomeStats(c *gin.Context) { type DataOverview struct { TaskCount int `json:"task_count"` diff --git a/backend/routes/version.go b/backend/routes/version.go index f62d1387..8974e7fa 100644 --- a/backend/routes/version.go +++ b/backend/routes/version.go @@ -8,6 +8,14 @@ import ( "runtime/debug" ) +// @Summary Get latest release +// @Description Get latest release +// @Tags version +// @Produce json +// @Param Authorization header string true "Authorization token" +// @Success 200 json string Response +// @Failure 400 json string Response +// @Router /releases/latest [get] func GetLatestRelease(c *gin.Context) { latestRelease, err := services.GetLatestRelease() if err != nil {
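Reviewer note (illustrative, not part of this patch): the annotation blocks added to routes/*.go above follow the swaggo/swag comment convention, and regenerating the docs with `swag init` is what produces the backend/docs/swagger.json and backend/docs/swagger.yaml changes shown earlier in the diff. Below is a minimal, self-contained sketch of that pattern for reference. The /ping route, the GetPing handler, and the main wiring are hypothetical, and the success/failure annotations use the stock swag "{string} string" form rather than the shorthand used in this patch.

package main

import (
	"net/http"

	"github.com/gin-gonic/gin"
)

// GetPing is a hypothetical handler used only to illustrate the swag
// annotation style applied throughout this patch.
//
// @Summary Ping
// @Description Health check (illustrative only)
// @Tags system
// @Produce json
// @Param Authorization header string true "Authorization token"
// @Success 200 {string} string "OK"
// @Failure 400 {string} string "Bad Request"
// @Router /ping [get]
func GetPing(c *gin.Context) {
	c.JSON(http.StatusOK, gin.H{"status": "ok"})
}

func main() {
	r := gin.Default()
	r.GET("/ping", GetPing)
	// After editing annotations like the ones above, re-running `swag init`
	// in backend/ regenerates docs/swagger.json and docs/swagger.yaml so the
	// served API docs stay in sync with the handlers.
	_ = r.Run(":8000")
}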