diff --git a/crawlab/app.py b/crawlab/app.py
index d90bc2d0..b64b9940 100644
--- a/crawlab/app.py
+++ b/crawlab/app.py
@@ -8,7 +8,7 @@ from celery import Celery
 from flask import Flask
 from flask_cors import CORS
 from flask_restful import Api
-
+from utils.log import other
 from constants.node import NodeStatus
 from db.manager import db_manager
 from routes.schedules import ScheduleApi
@@ -78,7 +78,7 @@ def monitor_nodes_status(celery_app):
         })
 
     def update_nodes_status_online(event):
-        print(event)
+        other.info(f"{event}")
 
     with celery_app.connection() as connection:
        recv = celery_app.events.Receiver(connection, handlers={
diff --git a/crawlab/bin/run_flower.py b/crawlab/bin/run_flower.py
index da903c07..18f94ef8 100644
--- a/crawlab/bin/run_flower.py
+++ b/crawlab/bin/run_flower.py
@@ -6,11 +6,11 @@ import subprocess
 file_dir = os.path.dirname(os.path.realpath(__file__))
 root_path = os.path.abspath(os.path.join(file_dir, '..'))
 sys.path.append(root_path)
-
+from utils.log import other
 from config import BROKER_URL
 
 if __name__ == '__main__':
     p = subprocess.Popen(['celery', 'flower', '-b', BROKER_URL], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     for line in iter(p.stdout.readline, 'b'):
         if line.decode('utf-8') != '':
-            print(line.decode('utf-8'))
+            other.info(line.decode('utf-8'))
diff --git a/crawlab/config.py b/crawlab/config.py
index 4a19dbfb..7467582e 100644
--- a/crawlab/config.py
+++ b/crawlab/config.py
@@ -1,7 +1,8 @@
 # project variables
 # 爬虫源码路径
 PROJECT_SOURCE_FILE_FOLDER = '../spiders'
-
+# 配置python虚拟环境的路径
+PYTHON_ENV_PATH="/Users/chennan/Desktop/2019/env/bin/python"
 # 爬虫部署路径
 PROJECT_DEPLOY_FILE_FOLDER = '../deployfile'
 
@@ -29,4 +30,5 @@ MONGO_DB = 'crawlab_test'
 # flask variables
 DEBUG = True
 FLASK_HOST = '127.0.0.1'
-FLASK_PORT = 8000
\ No newline at end of file
+FLASK_PORT = 8000
+
diff --git a/crawlab/manage.py b/crawlab/manage.py
index 9ea83d3c..5b9ae3d3 100644
--- a/crawlab/manage.py
+++ b/crawlab/manage.py
@@ -24,7 +24,7 @@ from routes.spiders import SpiderApi, SpiderImportApi, SpiderManageApi
 from routes.stats import StatsApi
 from routes.tasks import TaskApi
 from tasks.celery import celery_app
-
+from utils.log import other
 # flask app instance
 app = Flask(__name__)
 app.config.from_object('config')
@@ -81,7 +81,7 @@ def run_flower():
     p = subprocess.Popen(['celery', 'flower', '-b', BROKER_URL], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     for line in iter(p.stdout.readline, 'b'):
         if line.decode('utf-8') != '':
-            print(line.decode('utf-8'))
+            other.info(line.decode('utf-8'))
 
 
 def run_worker():
diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py
index 5e3bdec1..681ddab2 100644
--- a/crawlab/tasks/spider.py
+++ b/crawlab/tasks/spider.py
@@ -2,15 +2,12 @@ import os
 from datetime import datetime
 
 from bson import ObjectId
-from celery.utils.log import get_logger
-
-from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER
+from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER,PYTHON_ENV_PATH
 from constants.task import TaskStatus
 from db.manager import db_manager
 from .celery import celery_app
 import subprocess
-
-logger = get_logger(__name__)
+from utils.log import other as logger
 
 
 @celery_app.task(bind=True)
@@ -19,6 +16,8 @@ def execute_spider(self, id: str):
     hostname = self.request.hostname
     spider = db_manager.get('spiders', id=id)
     command = spider.get('cmd')
+    if command.startswith("env"):
+        command = PYTHON_ENV_PATH + command.replace("env","")
 
     current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))
 
diff --git a/crawlab/utils/deploy.py b/crawlab/utils/deploy.py
index 305feb5a..f7879a68 100644
--- a/crawlab/utils/deploy.py
+++ b/crawlab/utils/deploy.py
@@ -1,5 +1,5 @@
 import os, zipfile
-
+from utils.log import other
 
 # 打包目录为zip文件(未压缩)
 def zip_file(source_dir, output_filename):
@@ -20,4 +20,4 @@ def unzip_file(zip_src, dst_dir):
         for file in fz.namelist():
             fz.extract(file, dst_dir)
     else:
-        print('This is not zip')
+        other.info('This is not zip')
diff --git a/crawlab/utils/spider.py b/crawlab/utils/spider.py
index 3e7a438f..61616790 100644
--- a/crawlab/utils/spider.py
+++ b/crawlab/utils/spider.py
@@ -3,7 +3,6 @@ import os
 from constants.spider import FILE_SUFFIX_LANG_MAPPING, LangType, SUFFIX_IGNORE, SpiderType
 from db.manager import db_manager
 
-
 def get_lang_by_stats(stats: dict) -> LangType:
     """
     :param stats: stats is generated by utils.file.get_file_suffix_stats
@@ -18,7 +17,7 @@ def get_lang_by_stats(stats: dict) -> LangType:
             return FILE_SUFFIX_LANG_MAPPING.get(top_suffix)
         return LangType.OTHER
     except IndexError as e:
-        print(e.args,stats)
+        pass
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/__init__.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/items.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/items.py
new file mode 100644
index 00000000..2c4717dd
--- /dev/null
+++ b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/items.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class JuejinItem(scrapy.Item):
+    # define the fields for your item here like:
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    link = scrapy.Field()
+    like = scrapy.Field()
+    task_id = scrapy.Field()
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/middlewares.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/middlewares.py
new file mode 100644
index 00000000..9d5225a2
--- /dev/null
+++ b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/middlewares.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class JuejinSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/pipelines.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/pipelines.py
new file mode 100644
index 00000000..b34aac50
--- /dev/null
+++ b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/pipelines.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import os
+
+from pymongo import MongoClient
+
+MONGO_HOST = '127.0.0.1'
+MONGO_PORT = 27017
+MONGO_DB = 'crawlab_test'
+
+
+class JuejinPipeline(object):
+    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
+    db = mongo[MONGO_DB]
+    col_name = os.environ.get('CRAWLAB_COLLECTION','test')
+    col = db[col_name]
+
+    def process_item(self, item, spider):
+        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
+        self.col.save(item)
+        return item
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/settings.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/settings.py
new file mode 100644
index 00000000..44f8866c
--- /dev/null
+++ b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/settings.py
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for juejin project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'juejin'
+
+SPIDER_MODULES = ['juejin.spiders']
+NEWSPIDER_MODULE = 'juejin.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+# CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+# COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+# DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+# }
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#    'juejin.middlewares.JuejinSpiderMiddleware': 543,
+# }
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+# DOWNLOADER_MIDDLEWARES = {
+#    'juejin.middlewares.MyCustomDownloaderMiddleware': 543,
+# }
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+# }
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    'juejin.pipelines.JuejinPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__ 2.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__ 2.py
new file mode 100644
index 00000000..ebd689ac
--- /dev/null
+++ b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__ 2.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__.py
new file mode 100644
index 00000000..ebd689ac
--- /dev/null
+++ b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/juejin_spider.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/juejin_spider.py
new file mode 100644
index 00000000..28df5be7
--- /dev/null
+++ b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/juejin_spider.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from juejin.items import JuejinItem
+
+
+class JuejinSpiderSpider(scrapy.Spider):
+    name = 'juejin_spider'
+    allowed_domains = ['juejin.com']
+    start_urls = ['https://juejin.im/search?query=celery']
+
+    def parse(self, response):
+        for item in response.css('ul.main-list > li.item'):
+            yield JuejinItem(
+                title=item.css('.title span').extract_first(),
+                link=item.css('a::attr("href")').extract_first(),
+                like=item.css('.like .count::text').extract_first(),
+            )
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/scrapy.cfg b/deployfile/5ca46b508ffaed82c65f0f9c/scrapy.cfg
new file mode 100644
index 00000000..38ba44f1
--- /dev/null
+++ b/deployfile/5ca46b508ffaed82c65f0f9c/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = juejin.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = juejin
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/start.py b/deployfile/5ca46b508ffaed82c65f0f9c/start.py
new file mode 100644
index 00000000..ec2f47dd
--- /dev/null
+++ b/deployfile/5ca46b508ffaed82c65f0f9c/start.py
@@ -0,0 +1,2 @@
+from scrapy import cmdline
+cmdline.execute(["scrapy","crawl","juejin_spider"])
\ No newline at end of file
diff --git a/deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider 2.js b/deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider 2.js
index 7a02f5a2..3cf2bcac 100644
--- a/deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider 2.js
+++ b/deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider 2.js
@@ -52,7 +52,7 @@ const MongoClient = require('mongodb').MongoClient;
     });
 
     // open database connection
-    const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
+    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
     let db = await client.db('crawlab_test');
     const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
     const taskId = process.env.CRAWLAB_TASK_ID;