Merge pull request #75 from tikazyq/develop

Develop
This commit is contained in:
Marvin Zhang
2019-06-29 11:53:04 +08:00
committed by GitHub
17 changed files with 172 additions and 93 deletions

View File

@@ -2,7 +2,6 @@
### Features / Enhancement
- **Documentation**: Better and much more detailed documentation.
- **Better Crontab**: Compose crontab expressions through the crontab UI.
- **High Concurrency**: `gevent` + `flask` to support high concurrency (see the sketch below). [#70](https://github.com/tikazyq/crawlab/issues/70)
### Bugs Fixes
- **Deleting Spider**: Deleting a spider not only removes its record in the database but also removes the related folder, tasks and schedules. [#69](https://github.com/tikazyq/crawlab/issues/69)
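
For the "High Concurrency" entry, a minimal sketch of the `gevent` + `flask` pattern. This is an illustration only; the app module, route and port are assumptions, not taken from this repository:

```python
# Hedged sketch: serve a Flask app with gevent's cooperative WSGI server.
from gevent import monkey
monkey.patch_all()  # patch blocking stdlib calls so greenlets can yield

from gevent.pywsgi import WSGIServer
from flask import Flask

app = Flask(__name__)

@app.route('/ping')
def ping():
    return 'pong'

if __name__ == '__main__':
    # One process handles many concurrent connections via greenlets.
    WSGIServer(('0.0.0.0', 8000), app).serve_forever()
```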

View File

@@ -35,8 +35,8 @@ RUN npm install -g yarn \
&& yarn install
# install backend
RUN pip install -U setuptools \
&& pip install -r /opt/crawlab/crawlab/requirements.txt
RUN pip install -U setuptools -i https://pypi.tuna.tsinghua.edu.cn/simple \
&& pip install -r /opt/crawlab/crawlab/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# start backend
EXPOSE 8080

Jenkinsfile
View File

@@ -24,7 +24,7 @@ pipeline {
steps {
echo "Building..."
sh """
docker build -t crawlab:latest .
docker build -t tikazyq/crawlab:latest .
"""
}
}
@@ -37,12 +37,13 @@ pipeline {
steps {
echo 'Deploying....'
sh """
docker stop crawlab | true
docker run -d --rm --restart always --name crawlab \
docker rm -f crawlab | true
docker run -d --rm --name crawlab \
-p 8080:8080 \
-p 8000:8000 \
-v /home/yeqing/.env.production:/opt/crawlab/frontend/.env.production \
-v /home/yeqing/config.py:/opt/crawlab/crawlab/config/config.py
-v /home/yeqing/config.py:/opt/crawlab/crawlab/config/config.py \
tikazyq/crawlab master
"""
}
}

View File

@@ -13,3 +13,4 @@ Werkzeug==0.15.2
eventlet
Celery
Flower
redis
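
The newly added `redis` dependency presumably backs Celery's broker and result store here. A hedged sketch of that wiring; the local Redis URLs and app name are assumptions, not the project's real config:

```python
# Hedged sketch: a Celery app wired to Redis as broker and result backend.
from celery import Celery

app = Celery(
    'crawlab_tasks',                      # hypothetical app name
    broker='redis://localhost:6379/0',    # task queue
    backend='redis://localhost:6379/1',   # task results
)

@app.task
def ping():
    return 'pong'
```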

View File

@@ -2,6 +2,7 @@ import os
import sys
from datetime import datetime
from time import sleep
import traceback
from bson import ObjectId
from pymongo import ASCENDING, DESCENDING
@@ -213,8 +214,10 @@ def execute_config_spider(self, id: str, params: str = None):
    env['MONGO_HOST'] = MONGO_HOST
    env['MONGO_PORT'] = str(MONGO_PORT)
    env['MONGO_DB'] = MONGO_DB
    env['MONGO_USERNAME'] = MONGO_USERNAME
    env['MONGO_PASSWORD'] = MONGO_PASSWORD
    if MONGO_USERNAME is not None:
        env['MONGO_USERNAME'] = MONGO_USERNAME
    if MONGO_PASSWORD:
        env['MONGO_PASSWORD'] = MONGO_PASSWORD
    cmd_arr = [
        sys.executable,
@@ -246,6 +249,7 @@ def execute_config_spider(self, id: str, params: str = None):
        else:
            status = TaskStatus.FAILURE
    except Exception as err:
        traceback.print_exc()
        logger.error(err)
        stderr.write(str(err))
        status = TaskStatus.FAILURE
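
The new `if` guards matter because a child-process environment must map strings to strings; a `None` value raises `TypeError` when the process is spawned. A stripped-down sketch of the same idea (variable names mirror the diff, the command is a placeholder rather than the real task runner):

```python
# Hedged sketch: only copy optional settings into the child environment
# when they are actually set, since env values must be strings.
import os
import subprocess

MONGO_USERNAME = os.environ.get('MONGO_USERNAME')  # may be None
MONGO_PASSWORD = os.environ.get('MONGO_PASSWORD')  # may be None

env = os.environ.copy()
if MONGO_USERNAME is not None:
    env['MONGO_USERNAME'] = MONGO_USERNAME
if MONGO_PASSWORD is not None:
    env['MONGO_PASSWORD'] = MONGO_PASSWORD

# Placeholder command; the real task spawns the configured spider script.
subprocess.Popen(['python', '-c', 'print("spider")'], env=env)
```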

View File

@@ -1,17 +0,0 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class JuejinItem(scrapy.Item):
    # define the fields for your item here like:
    _id = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    like = scrapy.Field()
    task_id = scrapy.Field()

View File

@@ -1,25 +0,0 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
MONGO_HOST = os.environ['MONGO_HOST']
MONGO_PORT = int(os.environ['MONGO_PORT'])
MONGO_DB = os.environ['MONGO_DB']
class JuejinPipeline(object):
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    db = mongo[MONGO_DB]
    col_name = os.environ.get('CRAWLAB_COLLECTION', 'test')
    col = db[col_name]

    def process_item(self, item, spider):
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        self.col.save(item)
        return item

View File

@@ -1,17 +0,0 @@
# -*- coding: utf-8 -*-
import scrapy
from juejin.items import JuejinItem
class JuejinSpiderSpider(scrapy.Spider):
    name = 'juejin_spider'
    allowed_domains = ['juejin.com']
    start_urls = ['https://juejin.im/search?query=celery']

    def parse(self, response):
        for item in response.css('ul.main-list > li.item'):
            yield JuejinItem(
                title=item.css('.title span').extract_first(),
                link=item.css('a::attr("href")').extract_first(),
                like=item.css('.like .count::text').extract_first(),
            )

View File

@@ -1,2 +0,0 @@
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "juejin_spider"])

View File

@@ -1,11 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = juejin.settings
default = xueqiu.settings
[deploy]
#url = http://localhost:6800/
project = juejin
project = xueqiu

View File

@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class XueqiuItem(scrapy.Item):
    # define the fields for your item here like:
    _id = scrapy.Field()
    task_id = scrapy.Field()
    id = scrapy.Field()
    text = scrapy.Field()
    target = scrapy.Field()
    view_count = scrapy.Field()
    mark = scrapy.Field()
    created_at = scrapy.Field()

View File

@@ -3,12 +3,12 @@
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class JuejinSpiderMiddleware(object):
class XueqiuSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.
@@ -54,3 +54,50 @@ class JuejinSpiderMiddleware(object):
    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class XueqiuDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

View File

@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
class XueqiuPipeline(object):
    mongo = MongoClient(
        host=os.environ.get('MONGO_HOST') or 'localhost',
        port=int(os.environ.get('MONGO_PORT') or 27017)
    )
    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')

    def process_item(self, item, spider):
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        item['_id'] = item['id']
        if self.col.find_one({'_id': item['_id']}) is None:
            self.col.save(item)
        return item
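
One caveat on the pipeline above: `Collection.save()` was deprecated in PyMongo 3.x and removed in 4.x. A hedged sketch of the same skip-if-present behaviour using a single upsert; the class name is hypothetical and this rewrite is not part of the commit, only the connection wiring mirrors the pipeline:

```python
# Hedged sketch: same dedup-by-_id pipeline, but using update_one with
# $setOnInsert instead of the deprecated Collection.save().
import os

from pymongo import MongoClient


class XueqiuPipelineUpsert(object):
    mongo = MongoClient(
        host=os.environ.get('MONGO_HOST') or 'localhost',
        port=int(os.environ.get('MONGO_PORT') or 27017)
    )
    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')

    def process_item(self, item, spider):
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        doc = dict(item)
        # $setOnInsert only applies when the upsert inserts a new document,
        # so an existing row with the same _id is left untouched.
        self.col.update_one({'_id': doc['id']}, {'$setOnInsert': doc}, upsert=True)
        return item
```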

View File

@@ -1,21 +1,21 @@
# -*- coding: utf-8 -*-
# Scrapy settings for juejin project
# Scrapy settings for xueqiu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'juejin'
BOT_NAME = 'xueqiu'
SPIDER_MODULES = ['juejin.spiders']
NEWSPIDER_MODULE = 'juejin.spiders'
SPIDER_MODULES = ['xueqiu.spiders']
NEWSPIDER_MODULE = 'xueqiu.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
@@ -24,7 +24,7 @@ ROBOTSTXT_OBEY = True
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
@@ -44,31 +44,31 @@ ROBOTSTXT_OBEY = True
# }
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'juejin.middlewares.JuejinSpiderMiddleware': 543,
# 'xueqiu.middlewares.XueqiuSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'juejin.middlewares.MyCustomDownloaderMiddleware': 543,
# 'xueqiu.middlewares.XueqiuDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'juejin.pipelines.JuejinPipeline': 300,
'xueqiu.pipelines.XueqiuPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
@@ -81,7 +81,7 @@ ITEM_PIPELINES = {
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'

View File

@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
import json
from time import sleep
import scrapy
from xueqiu.items import XueqiuItem
class XueqiuSpiderSpider(scrapy.Spider):
    name = 'xueqiu_spider'
    allowed_domains = ['xueqiu.com']

    def start_requests(self):
        return [scrapy.Request(
            url='https://xueqiu.com',
            callback=self.parse_home
        )]

    def parse_home(self, response):
        yield scrapy.Request(
            url='https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=20&category=6'
        )

    def parse(self, response):
        data = json.loads(response.body)
        next_max_id = data.get('next_max_id')
        sleep(1)
        for row in data.get('list'):
            d = json.loads(row.get('data'))
            item = XueqiuItem(
                id=d['id'],
                text=d['text'],
                mark=d['mark'],
                target=d['target'],
                created_at=d['created_at'],
                view_count=d['view_count'],
            )
            yield item
        yield scrapy.Request(
            url=f'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id={next_max_id}&count=20&category=6'
        )
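
`parse_home` issues the timeline request without an explicit callback, so the JSON response lands in the default `parse` method, which then follows `next_max_id` for the next page. A hedged way to smoke-test the spider locally, outside of Crawlab's task runner; it assumes you run it from the Scrapy project root so `get_project_settings()` can find `xueqiu.settings`:

```python
# Hedged sketch: run the spider in-process for a quick local check.
# Inside Crawlab the task runner launches it instead of this script.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('xueqiu_spider')  # resolved by name via the project's spider loader
    process.start()                 # blocks until the crawl finishes
```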