diff --git a/CHANGELOG.md b/CHANGELOG.md
index 70cd6e83..7c81377a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,6 @@
 ### Features / Enhancement
 - **Documentation**: Better and much more detailed documentation.
 - **Better Crontab**: Make crontab expression through crontab UI.
-- **High Concurrency**: `gevent` + `flask` to support high concurrency. [#70](https://github.com/tikazyq/crawlab/issues/70)
 
 ### Bugs Fixes
 - **Deleting Spider**. Deleting a spider does not only remove record in db but also removing related folder, tasks and schedules. [#69](https://github.com/tikazyq/crawlab/issues/69)
diff --git a/Jenkinsfile b/Jenkinsfile
index 9c893e51..b22488cc 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -37,7 +37,7 @@ pipeline {
         steps {
             echo 'Deploying....'
             sh """
-            docker stop crawlab | true
+            docker rm -f crawlab | true
             docker run -d --rm --name crawlab \
                 -p 8080:8080 \
                 -p 8000:8000 \
diff --git a/spiders/example_juejin/juejin/__init__.py b/spiders/example_juejin/juejin/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/spiders/example_juejin/juejin/items.py b/spiders/example_juejin/juejin/items.py
deleted file mode 100644
index 2c4717dd..00000000
--- a/spiders/example_juejin/juejin/items.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your scraped items
-#
-# See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
-
-import scrapy
-
-
-class JuejinItem(scrapy.Item):
-    # define the fields for your item here like:
-    _id = scrapy.Field()
-    title = scrapy.Field()
-    link = scrapy.Field()
-    like = scrapy.Field()
-    task_id = scrapy.Field()
diff --git a/spiders/example_juejin/juejin/middlewares.py b/spiders/example_juejin/juejin/middlewares.py
deleted file mode 100644
index 9d5225a2..00000000
--- a/spiders/example_juejin/juejin/middlewares.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your spider middleware
-#
-# See documentation in:
-# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
-
-from scrapy import signals
-
-
-class JuejinSpiderMiddleware(object):
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the spider middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_spider_input(self, response, spider):
-        # Called for each response that goes through the spider
-        # middleware and into the spider.
-
-        # Should return None or raise an exception.
-        return None
-
-    def process_spider_output(self, response, result, spider):
-        # Called with the results returned from the Spider, after
-        # it has processed the response.
-
-        # Must return an iterable of Request, dict or Item objects.
-        for i in result:
-            yield i
-
-    def process_spider_exception(self, response, exception, spider):
-        # Called when a spider or process_spider_input() method
-        # (from other spider middleware) raises an exception.
-
-        # Should return either None or an iterable of Response, dict
-        # or Item objects.
-        pass
-
-    def process_start_requests(self, start_requests, spider):
-        # Called with the start requests of the spider, and works
-        # similarly to the process_spider_output() method, except
-        # that it doesn’t have a response associated.
-
-        # Must return only requests (not items).
-        for r in start_requests:
-            yield r
-
-    def spider_opened(self, spider):
-        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/spiders/example_juejin/juejin/pipelines.py b/spiders/example_juejin/juejin/pipelines.py
deleted file mode 100644
index 1c4ffdc1..00000000
--- a/spiders/example_juejin/juejin/pipelines.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-MONGO_HOST = os.environ['MONGO_HOST']
-MONGO_PORT = int(os.environ['MONGO_PORT'])
-MONGO_DB = os.environ['MONGO_DB']
-
-
-class JuejinPipeline(object):
-    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
-    db = mongo[MONGO_DB]
-    col_name = os.environ.get('CRAWLAB_COLLECTION','test')
-    col = db[col_name]
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        self.col.save(item)
-        return item
diff --git a/spiders/example_juejin/juejin/settings.py b/spiders/example_juejin/juejin/settings.py
deleted file mode 100644
index 44f8866c..00000000
--- a/spiders/example_juejin/juejin/settings.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Scrapy settings for juejin project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-# http://doc.scrapy.org/en/latest/topics/settings.html
-# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-
-BOT_NAME = 'juejin'
-
-SPIDER_MODULES = ['juejin.spiders']
-NEWSPIDER_MODULE = 'juejin.spiders'
-
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
-
-# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
-
-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-# CONCURRENT_REQUESTS = 32
-
-# Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-# DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-# CONCURRENT_REQUESTS_PER_DOMAIN = 16
-# CONCURRENT_REQUESTS_PER_IP = 16
-
-# Disable cookies (enabled by default)
-# COOKIES_ENABLED = False
-
-# Disable Telnet Console (enabled by default)
-# TELNETCONSOLE_ENABLED = False
-
-# Override the default request headers:
-# DEFAULT_REQUEST_HEADERS = {
-#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-#   'Accept-Language': 'en',
-# }
-
-# Enable or disable spider middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-# SPIDER_MIDDLEWARES = {
-#    'juejin.middlewares.JuejinSpiderMiddleware': 543,
-# }
-
-# Enable or disable downloader middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-# DOWNLOADER_MIDDLEWARES = {
-#    'juejin.middlewares.MyCustomDownloaderMiddleware': 543,
-# }
-
-# Enable or disable extensions
-# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-# EXTENSIONS = {
-#    'scrapy.extensions.telnet.TelnetConsole': None,
-# }
-
-# Configure item pipelines
-# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    'juejin.pipelines.JuejinPipeline': 300,
-}
-
-# Enable and configure the AutoThrottle extension (disabled by default)
-# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-# AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-# AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-# AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-# AUTOTHROTTLE_DEBUG = False
-
-# Enable and configure HTTP caching (disabled by default)
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-# HTTPCACHE_ENABLED = True
-# HTTPCACHE_EXPIRATION_SECS = 0
-# HTTPCACHE_DIR = 'httpcache'
-# HTTPCACHE_IGNORE_HTTP_CODES = []
-# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/spiders/example_juejin/juejin/spiders/__init__.py b/spiders/example_juejin/juejin/spiders/__init__.py
deleted file mode 100644
index ebd689ac..00000000
--- a/spiders/example_juejin/juejin/spiders/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
diff --git a/spiders/example_juejin/juejin/spiders/juejin_spider.py b/spiders/example_juejin/juejin/spiders/juejin_spider.py
deleted file mode 100644
index 28df5be7..00000000
--- a/spiders/example_juejin/juejin/spiders/juejin_spider.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# -*- coding: utf-8 -*-
-import scrapy
-from juejin.items import JuejinItem
-
-
-class JuejinSpiderSpider(scrapy.Spider):
-    name = 'juejin_spider'
-    allowed_domains = ['juejin.com']
-    start_urls = ['https://juejin.im/search?query=celery']
-
-    def parse(self, response):
-        for item in response.css('ul.main-list > li.item'):
-            yield JuejinItem(
-                title=item.css('.title span').extract_first(),
-                link=item.css('a::attr("href")').extract_first(),
-                like=item.css('.like .count::text').extract_first(),
-            )
diff --git a/spiders/example_juejin/scrapy.cfg b/spiders/example_juejin/scrapy.cfg
deleted file mode 100644
index 38ba44f1..00000000
--- a/spiders/example_juejin/scrapy.cfg
+++ /dev/null
@@ -1,11 +0,0 @@
-# Automatically created by: scrapy startproject
-#
-# For more information about the [deploy] section see:
-# https://scrapyd.readthedocs.org/en/latest/deploy.html
-
-[settings]
-default = juejin.settings
-
-[deploy]
-#url = http://localhost:6800/
-project = juejin
diff --git a/spiders/example_juejin/start.py b/spiders/example_juejin/start.py
deleted file mode 100644
index ec2f47dd..00000000
--- a/spiders/example_juejin/start.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from scrapy import cmdline
-cmdline.execute(["scrapy","crawl","juejin_spider"])
\ No newline at end of file