From 5daebcb39bd2232e90ced91db5f764efcd17d7e7 Mon Sep 17 00:00:00 2001 From: Marvin Zhang Date: Sun, 26 May 2019 17:10:04 +0800 Subject: [PATCH] updated configurable spider --- crawlab/routes/spiders.py | 22 +++- crawlab/spiders/spiders/db.py | 14 ++ crawlab/spiders/spiders/items.py | 8 +- crawlab/spiders/spiders/pipelines.py | 6 + crawlab/spiders/spiders/settings.py | 62 ++++----- .../spiders/spiders/spiders/config_spider.py | 63 +++++++++ crawlab/tasks/spider.py | 123 +++++++++++++++++- frontend/src/components/Config/ConfigList.vue | 65 +++++++-- .../components/InfoView/SpiderInfoView.vue | 7 +- frontend/src/i18n/zh.js | 16 +++ frontend/src/store/modules/spider.js | 5 +- frontend/src/views/schedule/ScheduleList.vue | 2 +- frontend/src/views/spider/SpiderList.vue | 18 ++- frontend/src/views/task/TaskDetail.vue | 23 +++- 14 files changed, 371 insertions(+), 63 deletions(-) create mode 100644 crawlab/spiders/spiders/db.py create mode 100644 crawlab/spiders/spiders/spiders/config_spider.py diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py index ae36a69b..47776297 100644 --- a/crawlab/routes/spiders.py +++ b/crawlab/routes/spiders.py @@ -19,7 +19,7 @@ from constants.task import TaskStatus from db.manager import db_manager from routes.base import BaseApi from tasks.scheduler import scheduler -from tasks.spider import execute_spider +from tasks.spider import execute_spider, execute_config_spider from utils import jsonify from utils.deploy import zip_file, unzip_file from utils.file import get_file_suffix_stats, get_file_suffix @@ -83,8 +83,17 @@ class SpiderApi(BaseApi): # spider item selector ('item_selector', str), + # spider item selector type + ('item_selector_type', str), + # spider pagination selector ('pagination_selector', str), + + # spider pagination selector type + ('pagination_selector_type', str), + + # whether to obey robots.txt + ('obey_robots_txt', str), ) def get(self, id=None, action=None): @@ -251,7 +260,16 @@ class SpiderApi(BaseApi): spider = db_manager.get('spiders', id=ObjectId(id)) - job = execute_spider.delay(id, params) + # determine execute function + if spider['type'] == SpiderType.CONFIGURABLE: + # configurable spider + exec_func = execute_config_spider + else: + # customized spider + exec_func = execute_spider + + # trigger an asynchronous job + job = exec_func.delay(id, params) # create a new task db_manager.save('tasks', { diff --git a/crawlab/spiders/spiders/db.py b/crawlab/spiders/spiders/db.py new file mode 100644 index 00000000..18925f8d --- /dev/null +++ b/crawlab/spiders/spiders/db.py @@ -0,0 +1,14 @@ +import os + +from pymongo import MongoClient + +MONGO_HOST = os.environ.get('MONGO_HOST') +MONGO_PORT = int(os.environ.get('MONGO_PORT')) +MONGO_DB = os.environ.get('MONGO_DB') +mongo = MongoClient(host=MONGO_HOST, + port=MONGO_PORT) +db = mongo[MONGO_DB] +task_id = os.environ.get('CRAWLAB_TASK_ID') +col_name = os.environ.get('CRAWLAB_COLLECTION') +task = db['tasks'].find_one({'_id': task_id}) +spider = db['spiders'].find_one({'_id': task['spider_id']}) diff --git a/crawlab/spiders/spiders/items.py b/crawlab/spiders/spiders/items.py index 18cf9cfe..6f102a96 100644 --- a/crawlab/spiders/spiders/items.py +++ b/crawlab/spiders/spiders/items.py @@ -7,8 +7,10 @@ import scrapy +from spiders.db import spider + class SpidersItem(scrapy.Item): - # define the fields for your item here like: - # name = scrapy.Field() - pass + fields = {f['name']: scrapy.Field() for f in spider['fields']} + fields['_id'] = scrapy.Field() + fields['task_id'] = 
scrapy.Field() diff --git a/crawlab/spiders/spiders/pipelines.py b/crawlab/spiders/spiders/pipelines.py index 9e0dd2e9..69531067 100644 --- a/crawlab/spiders/spiders/pipelines.py +++ b/crawlab/spiders/spiders/pipelines.py @@ -4,8 +4,14 @@ # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html +from spiders.db import db, col_name, task_id class SpidersPipeline(object): + col = db[col_name] + def process_item(self, item, spider): + item['task_id'] = task_id + self.col.save(item) + return item diff --git a/crawlab/spiders/spiders/settings.py b/crawlab/spiders/spiders/settings.py index c7b40f78..d78d9281 100644 --- a/crawlab/spiders/spiders/settings.py +++ b/crawlab/spiders/spiders/settings.py @@ -8,83 +8,83 @@ # https://doc.scrapy.org/en/latest/topics/settings.html # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://doc.scrapy.org/en/latest/topics/spider-middleware.html +from spiders.db import spider -BOT_NAME = 'spiders' +BOT_NAME = 'Crawlab Spider' SPIDER_MODULES = ['spiders.spiders'] NEWSPIDER_MODULE = 'spiders.spiders' - # Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'spiders (+http://www.yourdomain.com)' +# USER_AGENT = 'spiders (+http://www.yourdomain.com)' # Obey robots.txt rules -ROBOTSTXT_OBEY = True +ROBOTSTXT_OBEY = spider.get('obey_robots_txt') or True # Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 +# CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 +# DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) -#COOKIES_ENABLED = False +# COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False +# TELNETCONSOLE_ENABLED = False # Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { +# DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', -#} +# } # Enable or disable spider middlewares # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { +# SPIDER_MIDDLEWARES = { # 'spiders.middlewares.SpidersSpiderMiddleware': 543, -#} +# } # Enable or disable downloader middlewares # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { +# DOWNLOADER_MIDDLEWARES = { # 'spiders.middlewares.SpidersDownloaderMiddleware': 543, -#} +# } # Enable or disable extensions # See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { +# EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, -#} +# } # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -#ITEM_PIPELINES = { -# 'spiders.pipelines.SpidersPipeline': 300, -#} +ITEM_PIPELINES = { + 'spiders.pipelines.SpidersPipeline': 300, +} # Enable and configure the AutoThrottle extension (disabled by default) # See https://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True +# AUTOTHROTTLE_ENABLED = True # The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 +# 
AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 +# AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False +# AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/crawlab/spiders/spiders/spiders/config_spider.py b/crawlab/spiders/spiders/spiders/config_spider.py new file mode 100644 index 00000000..b25963b3 --- /dev/null +++ b/crawlab/spiders/spiders/spiders/config_spider.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +from urllib.parse import urlparse + +import scrapy + +from spiders.db import spider +from spiders.items import SpidersItem + + +class NormalSpiderSpider(scrapy.Spider): + name = 'config_spider' + # allowed_domains = [] + start_urls = [spider['start_url']] + + def parse(self, response): + if spider['item_selector_type'] == 'xpath': + # xpath selector + items = response.xpath(spider['item_selector']) + else: + # css selector + items = response.css(spider['item_selector']) + for _item in items: + item = SpidersItem() + for f in spider['fields']: + if f['type'] == 'xpath': + # xpath selector + if f['extract_type'] == 'text': + # text content + query = f['query'] + '/text()' + else: + # attribute + attribute = f["attribute"] + query = f['query'] + f'/@("{attribute}")' + item[f['name']] = _item.xpath(query).extract_first() + + else: + # css selector + if f['extract_type'] == 'text': + # text content + query = f['query'] + '::text' + else: + # attribute + attribute = f["attribute"] + query = f['query'] + f'::attr("{attribute}")' + item[f['name']] = _item.css(query).extract_first() + + yield item + + # pagination + if spider.get('pagination_selector') is not None: + if spider['pagination_selector_type'] == 'xpath': + # xpath selector + next_url = response.xpath(spider['pagination_selector'] + '/@href').extract_first() + else: + # css selector + next_url = response.css(spider['pagination_selector'] + '::attr("href")').extract_first() + + # found next url + if next_url is not None: + if not next_url.startswith('http') and not next_url.startswith('//'): + u = urlparse(response.url) + next_url = f'{u.scheme}://{u.netloc}{next_url}' + yield scrapy.Request(url=next_url) diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py index 48cafc27..3bdc65bc 100644 --- a/crawlab/tasks/spider.py +++ b/crawlab/tasks/spider.py @@ -6,13 +6,15 @@ from time import sleep from bson import ObjectId from pymongo import ASCENDING, DESCENDING -from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER, PYTHON_ENV_PATH +from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER, PYTHON_ENV_PATH, MONGO_HOST, MONGO_PORT, MONGO_DB from constants.task import TaskStatus from db.manager import db_manager from .celery import celery_app import 
subprocess from utils.log import other as logger +BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) + def get_task(id: str): i = 0 @@ -112,6 +114,125 @@ def execute_spider(self, id: str, params: str = None): env=env, bufsize=1) + # update pid + db_manager.update_one(col_name='tasks', id=task_id, values={ + 'pid': p.pid + }) + + # get output from the process + _stdout, _stderr = p.communicate() + + # get return code + code = p.poll() + if code == 0: + status = TaskStatus.SUCCESS + else: + status = TaskStatus.FAILURE + except Exception as err: + logger.error(err) + stderr.write(str(err)) + status = TaskStatus.FAILURE + + # save task when the task is finished + finish_ts = datetime.utcnow() + db_manager.update_one('tasks', id=task_id, values={ + 'finish_ts': finish_ts, + 'duration': (finish_ts - task['create_ts']).total_seconds(), + 'status': status + }) + task = db_manager.get('tasks', id=id) + + # close log file streams + stdout.flush() + stderr.flush() + stdout.close() + stderr.close() + + return task + + +@celery_app.task(bind=True) +def execute_config_spider(self, id: str, params: str = None): + task_id = self.request.id + hostname = self.request.hostname + spider = db_manager.get('spiders', id=id) + + # get task object and return if not found + task = get_task(task_id) + if task is None: + return + + # current working directory + current_working_directory = os.path.join(BASE_DIR, 'spiders') + + # log info + logger.info('task_id: %s' % task_id) + logger.info('hostname: %s' % hostname) + logger.info('current_working_directory: %s' % current_working_directory) + logger.info('spider_id: %s' % id) + + # make sure the log folder exists + log_path = os.path.join(PROJECT_LOGS_FOLDER, id) + if not os.path.exists(log_path): + os.makedirs(log_path) + + # open log file streams + log_file_path = os.path.join(log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S')) + stdout = open(log_file_path, 'a') + stderr = open(log_file_path, 'a') + + # update task status as started + db_manager.update_one('tasks', id=task_id, values={ + 'start_ts': datetime.utcnow(), + 'node_id': hostname, + 'hostname': hostname, + 'log_file_path': log_file_path, + 'status': TaskStatus.STARTED + }) + + # pass params as env variables + env = os.environ.copy() + + # custom environment variables + if spider.get('envs'): + for _env in spider.get('envs'): + env[_env['name']] = _env['value'] + + # task id environment variable + env['CRAWLAB_TASK_ID'] = task_id + + # collection environment variable + if spider.get('col'): + env['CRAWLAB_COLLECTION'] = spider.get('col') + + # create index to speed results data retrieval + db_manager.create_index(spider.get('col'), [('task_id', ASCENDING)]) + + # mongodb environment variables + env['MONGO_HOST'] = MONGO_HOST + env['MONGO_PORT'] = str(MONGO_PORT) + env['MONGO_DB'] = MONGO_DB + + cmd_arr = [ + sys.executable, + '-m', + 'scrapy', + 'crawl', + 'config_spider' + ] + try: + p = subprocess.Popen(cmd_arr, + stdout=stdout.fileno(), + stderr=stderr.fileno(), + cwd=current_working_directory, + env=env, + bufsize=1) + + # update pid + db_manager.update_one(col_name='tasks', id=task_id, values={ + 'pid': p.pid + }) + # get output from the process _stdout, _stderr = p.communicate() diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue index f1f26fbe..db3bfb99 100644 --- a/frontend/src/components/Config/ConfigList.vue +++ b/frontend/src/components/Config/ConfigList.vue @@ -20,9 +20,9 @@ - + - + + + + - + + + + + + - + + 
[ConfigList.vue template hunk garbled in extraction: the Vue/element markup was stripped, leaving only the bare diff +/- markers above. Recoverable content: the existing action row with the {{$t('Add Field')}}, {{$t('Preview')}} and {{$t('Save')}} buttons is removed, and the new markup re-adds {{$t('Add Field')}} and adds a {{$t('Run')}} button alongside {{$t('Preview')}} and {{$t('Save')}}.]
@@ -200,6 +226,18 @@ export default { this.previewLoading = false }) }) + }, + onCrawl () { + this.$confirm(this.$t('Are you sure to run this spider?'), this.$t('Notification'), { + confirmButtonText: this.$t('Confirm'), + cancelButtonText: this.$t('Cancel') + }) + .then(() => { + this.$store.dispatch('spider/crawlSpider', this.spiderForm._id) + .then(() => { + this.$message.success(this.$t(`Spider task has been scheduled`)) + }) + }) } }, created () { @@ -215,13 +253,15 @@ export default { } if (!this.spiderForm.crawl_type) this.$set(this.spiderForm, 'crawl_type', 'list') if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com') + if (!this.spiderForm.item_selector_type) this.$set(this.spiderForm, 'item_selector_type', 'css') + if (!this.spiderForm.pagination_selector_type) this.$set(this.spiderForm, 'pagination_selector_type', 'css') + if (!this.spiderForm.obey_robots_txt) this.$set(this.spiderForm, 'obey_robots_txt', true) } }
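
The generated config_spider is driven entirely by the spider document stored in MongoDB: config_spider.py reads the start URL, selectors and field definitions, settings.py reads obey_robots_txt, and execute_config_spider reads col and envs before launching the crawl. Below is a minimal sketch of such a document; the keys mirror the lookups made in the patch above, while every concrete value (URL, selectors, collection name) is illustrative only.

# Illustrative spider document for the configurable spider. The keys follow
# the lookups in config_spider.py, settings.py, items.py and tasks/spider.py;
# all values are made-up examples.
example_spider_doc = {
    'type': 'configurable',               # i.e. SpiderType.CONFIGURABLE, as checked in routes/spiders.py
    'start_url': 'http://example.com/list',
    'item_selector': '.item',             # selector matching each result element
    'item_selector_type': 'css',          # 'css' or 'xpath'
    'pagination_selector': '.next > a',   # optional "next page" link selector
    'pagination_selector_type': 'css',    # 'css' or 'xpath'
    'obey_robots_txt': True,              # mapped to ROBOTSTXT_OBEY in settings.py
    'col': 'results_example',             # results collection, exported as CRAWLAB_COLLECTION
    'envs': [                             # extra environment variables for the crawl process
        {'name': 'EXAMPLE_ENV', 'value': '1'},
    ],
    'fields': [                           # each entry becomes a scrapy.Field() on SpidersItem
        {'name': 'title', 'type': 'css', 'query': '.title', 'extract_type': 'text'},
        {'name': 'url', 'type': 'css', 'query': 'a', 'extract_type': 'attribute', 'attribute': 'href'},
    ],
}

With a document shaped like this in the spiders collection, SpidersItem ends up with _id, task_id, title and url fields, and SpidersPipeline writes every scraped item into the results_example collection tagged with the current task_id.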
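
execute_config_spider hands context to the generated Scrapy project purely through environment variables, which spiders/db.py reads back to locate the task and spider documents. A rough way to reproduce that hand-off locally, without Celery, is sketched below; all values are hypothetical and MongoDB must already contain the referenced tasks and spiders documents.

# Local-debugging sketch of the execute_config_spider hand-off (no Celery).
# The environment variable names and the crawl command match tasks/spider.py;
# the concrete values here are hypothetical.
import os
import subprocess
import sys

env = os.environ.copy()
env.update({
    'CRAWLAB_TASK_ID': '00000000-0000-0000-0000-000000000000',  # hypothetical Celery task id
    'CRAWLAB_COLLECTION': 'results_example',
    'MONGO_HOST': 'localhost',
    'MONGO_PORT': '27017',
    'MONGO_DB': 'crawlab_test',
})

subprocess.run(
    [sys.executable, '-m', 'scrapy', 'crawl', 'config_spider'],
    cwd='crawlab/spiders',  # the Scrapy project directory used as cwd in execute_config_spider
    env=env,
    check=True,
)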