diff --git a/crawlab/requirements.txt b/crawlab/requirements.txt
index feb2cdd0..11af6f69 100644
--- a/crawlab/requirements.txt
+++ b/crawlab/requirements.txt
@@ -1,6 +1,7 @@
 aiohttp==3.5.4
 amqp==2.4.2
 aniso8601==6.0.0
+Appium-Python-Client==0.40
 APScheduler==3.6.0
 asn1crypto==0.24.0
 async-timeout==3.0.1
@@ -26,6 +27,8 @@ Flask-Cors==3.0.7
 Flask-RESTful==0.3.7
 flask-restplus==0.12.1
 flower==0.9.3
+gevent==1.4.0
+greenlet==0.4.15
 gunicorn==19.9.0
 html5lib==1.0.1
 humanfriendly==4.18
@@ -55,6 +58,8 @@ python-dateutil==2.8.0
 pytz==2018.9
 queuelib==1.5.0
 redis==3.2.1
+redisbeat==1.1.4
+reppy==0.4.12
 requests==2.21.0
 Scrapy==1.6.0
 selenium==3.141.0
diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index 47776297..243f11a1 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -5,6 +5,7 @@ import subprocess
 from datetime import datetime
 from random import random

+import gevent
 import requests
 from bson import ObjectId
 from flask import current_app, request
@@ -23,7 +24,8 @@ from tasks.spider import execute_spider, execute_config_spider
 from utils import jsonify
 from utils.deploy import zip_file, unzip_file
 from utils.file import get_file_suffix_stats, get_file_suffix
-from utils.spider import get_lang_by_stats, get_last_n_run_errors_count, get_last_n_day_tasks_count
+from utils.spider import get_lang_by_stats, get_last_n_run_errors_count, get_last_n_day_tasks_count, get_list_page_data, \
+    get_detail_page_data

 parser = reqparse.RequestParser()
 parser.add_argument('file', type=FileStorage, location='files')
@@ -71,9 +73,12 @@ class SpiderApi(BaseApi):
         # Configurable Spider
         ########################

-        # spider crawl fields
+        # spider crawl fields for list page
         ('fields', str),

+        # spider crawl fields for detail page
+        ('detail_fields', str),
+
         # spider crawl type
         ('crawl_type', str),

@@ -442,13 +447,22 @@

     def update_fields(self, id: str):
         """
-        Update fields variables for configurable spiders
+        Update list page fields variables for configurable spiders
         :param id: spider_id
         """
         args = self.parser.parse_args()
         fields = json.loads(args.fields)
         db_manager.update_one(col_name='spiders', id=id, values={'fields': fields})

+    def update_detail_fields(self, id: str):
+        """
+        Update detail page fields variables for configurable spiders
+        :param id: spider_id
+        """
+        args = self.parser.parse_args()
+        detail_fields = json.loads(args.detail_fields)
+        db_manager.update_one(col_name='spiders', id=id, values={'detail_fields': detail_fields})
+
     def preview_crawl(self, id: str):
         spider = db_manager.get(col_name='spiders', id=id)

@@ -489,25 +503,8 @@
                     'error': 'item_selector should not be empty'
                 }, 400

-            # TODO: enable xpath
-            data = []
-            items = sel.cssselect(spider['item_selector'])
-            for item in items:
-                row = {}
-                for f in spider['fields']:
-                    if f['type'] == QueryType.CSS:
-                        # css selector
-                        res = item.cssselect(f['query'])
-                    else:
-                        # xpath
-                        res = item.xpath(f['query'])
+            data = get_list_page_data(spider, sel)[:10]

-                    if len(res) > 0:
-                        if f['extract_type'] == ExtractType.TEXT:
-                            row[f['name']] = res[0].text
-                        else:
-                            row[f['name']] = res[0].get(f['attribute'])
-                data.append(row)
             return {
                 'status': 'ok',
                 'items': data
@@ -517,7 +514,23 @@
             pass

         elif spider['crawl_type'] == CrawlType.LIST_DETAIL:
-            pass
+            data = get_list_page_data(spider, sel)[:10]
+
+            ev_list = []
+            for idx, d in enumerate(data):
+                for f in spider['fields']:
+                    if f.get('is_detail'):
+                        url = d.get(f['name'])
+                        if url is not None:
+                            ev_list.append(gevent.spawn(get_detail_page_data, url, spider, idx, data))
+                        break
+
+            gevent.joinall(ev_list)
+
+            return {
+                'status': 'ok',
+                'items': data
+            }


 class SpiderImportApi(Resource):
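A note on the `preview_crawl` change above: for list-detail spiders, each detail page URL is fetched in its own greenlet and the shared `data` list is patched in place by row index. The sketch below shows the same fan-out pattern in isolation; the names `fetch_detail` and `rows` are illustrative, not part of the patch. Worth noting: plain `requests` calls only run concurrently under gevent when the socket layer is monkey-patched, which this diff does not appear to do.

```python
# Illustrative sketch only, not part of the patch above.
import gevent
from gevent import monkey

monkey.patch_all()  # without this, requests.get() blocks the hub and greenlets run serially

import requests


def fetch_detail(url, idx, rows):
    """Fetch one detail page and merge a value into the matching list row."""
    resp = requests.get(url, timeout=10)
    rows[idx]['detail_status'] = resp.status_code  # stand-in for real field extraction


rows = [{'url': 'https://example.com/'}, {'url': 'https://example.org/'}]
jobs = [gevent.spawn(fetch_detail, row['url'], idx, rows) for idx, row in enumerate(rows)]
gevent.joinall(jobs, timeout=30)
print(rows)
```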
diff --git a/crawlab/spiders/spiders/items.py b/crawlab/spiders/spiders/items.py
index 6f102a96..7163d6e3 100644
--- a/crawlab/spiders/spiders/items.py
+++ b/crawlab/spiders/spiders/items.py
@@ -11,6 +11,15 @@ from spiders.db import spider


 class SpidersItem(scrapy.Item):
-    fields = {f['name']: scrapy.Field() for f in spider['fields']}
+    if spider['crawl_type'] == 'list':
+        fields = {f['name']: scrapy.Field() for f in spider['fields']}
+    elif spider['crawl_type'] == 'detail':
+        fields = {f['name']: scrapy.Field() for f in spider['detail_fields']}
+    elif spider['crawl_type'] == 'list-detail':
+        fields = {f['name']: scrapy.Field() for f in (spider['fields'] + spider['detail_fields'])}
+    else:
+        fields = {}
+
+    # basic fields
     fields['_id'] = scrapy.Field()
     fields['task_id'] = scrapy.Field()
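The `SpidersItem` change works because Scrapy's `ItemMeta` metaclass starts from any `fields` dict already defined on the class, so the field set can be built at class-definition time from the spider config. A standalone sketch of the same idea, with an invented config in place of the spider document loaded from MongoDB:

```python
# Standalone illustration of declaring scrapy.Item fields from a config dict.
# The config below is made up; in crawlab it comes from the 'spiders' collection.
import scrapy

config = {
    'crawl_type': 'list-detail',
    'fields': [{'name': 'title'}, {'name': 'url'}],
    'detail_fields': [{'name': 'content'}],
}


class ExampleItem(scrapy.Item):
    if config['crawl_type'] == 'list-detail':
        fields = {f['name']: scrapy.Field() for f in (config['fields'] + config['detail_fields'])}
    else:
        fields = {f['name']: scrapy.Field() for f in config['fields']}

    # bookkeeping fields added to every item
    fields['_id'] = scrapy.Field()
    fields['task_id'] = scrapy.Field()


item = ExampleItem(title='hello', content='world')
print(dict(item))  # {'title': 'hello', 'content': 'world'}
```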
diff --git a/crawlab/spiders/spiders/spiders/config_spider.py b/crawlab/spiders/spiders/spiders/config_spider.py
index b25963b3..bed976f5 100644
--- a/crawlab/spiders/spiders/spiders/config_spider.py
+++ b/crawlab/spiders/spiders/spiders/config_spider.py
@@ -7,57 +7,110 @@ from spiders.db import spider
 from spiders.items import SpidersItem


-class NormalSpiderSpider(scrapy.Spider):
+def get_detail_url(item):
+    for f in spider['fields']:
+        if f.get('is_detail'):
+            return item.get(f['name'])
+    return None
+
+
+def get_spiders_item(sel, fields, item=None):
+    if item is None:
+        item = SpidersItem()
+
+    for f in fields:
+        if f['type'] == 'xpath':
+            # xpath selector
+            if f['extract_type'] == 'text':
+                # text content
+                query = f['query'] + '/text()'
+            else:
+                # attribute
+                attribute = f["attribute"]
+                query = f['query'] + f'/@{attribute}'
+            item[f['name']] = sel.xpath(query).extract_first()
+
+        else:
+            # css selector
+            if f['extract_type'] == 'text':
+                # text content
+                query = f['query'] + '::text'
+            else:
+                # attribute
+                attribute = f["attribute"]
+                query = f['query'] + f'::attr("{attribute}")'
+            item[f['name']] = sel.css(query).extract_first()
+
+    return item
+
+
+def get_list_items(response):
+    if spider['item_selector_type'] == 'xpath':
+        # xpath selector
+        items = response.xpath(spider['item_selector'])
+    else:
+        # css selector
+        items = response.css(spider['item_selector'])
+    return items
+
+
+def get_next_url(response):
+    # pagination
+    if spider.get('pagination_selector') is not None:
+        if spider['pagination_selector_type'] == 'xpath':
+            # xpath selector
+            next_url = response.xpath(spider['pagination_selector'] + '/@href').extract_first()
+        else:
+            # css selector
+            next_url = response.css(spider['pagination_selector'] + '::attr("href")').extract_first()

+        # found next url
+        if next_url is not None:
+            if not next_url.startswith('http') and not next_url.startswith('//'):
+                u = urlparse(response.url)
+                next_url = f'{u.scheme}://{u.netloc}{next_url}'
+            return next_url
+    return None
+
+
+class ConfigSpiderSpider(scrapy.Spider):
     name = 'config_spider'
     # allowed_domains = []
     start_urls = [spider['start_url']]

     def parse(self, response):
-        if spider['item_selector_type'] == 'xpath':
-            # xpath selector
-            items = response.xpath(spider['item_selector'])
-        else:
-            # css selector
-            items = response.css(spider['item_selector'])
-        for _item in items:
-            item = SpidersItem()
-            for f in spider['fields']:
-                if f['type'] == 'xpath':
-                    # xpath selector
-                    if f['extract_type'] == 'text':
-                        # text content
-                        query = f['query'] + '/text()'
-                    else:
-                        # attribute
-                        attribute = f["attribute"]
-                        query = f['query'] + f'/@("{attribute}")'
-                    item[f['name']] = _item.xpath(query).extract_first()
-
-                else:
-                    # css selector
-                    if f['extract_type'] == 'text':
-                        # text content
-                        query = f['query'] + '::text'
-                    else:
-                        # attribute
-                        attribute = f["attribute"]
-                        query = f['query'] + f'::attr("{attribute}")'
-                    item[f['name']] = _item.css(query).extract_first()
+        if spider['crawl_type'] == 'list':
+            items = get_list_items(response)
+            # list page only
+            for _item in items:
+                item = get_spiders_item(sel=_item, fields=spider['fields'])
                 yield item
-
-        # pagination
-        if spider.get('pagination_selector') is not None:
-            if spider['pagination_selector_type'] == 'xpath':
-                # xpath selector
-                next_url = response.xpath(spider['pagination_selector'] + '/@href').extract_first()
-            else:
-                # css selector
-                next_url = response.css(spider['pagination_selector'] + '::attr("href")').extract_first()
-
-            # found next url
+            next_url = get_next_url(response)
             if next_url is not None:
-                if not next_url.startswith('http') and not next_url.startswith('//'):
-                    u = urlparse(response.url)
-                    next_url = f'{u.scheme}://{u.netloc}{next_url}'
                 yield scrapy.Request(url=next_url)
+
+        elif spider['crawl_type'] == 'detail':
+            # TODO: detail page only
+            pass
+
+        elif spider['crawl_type'] == 'list-detail':
+            # list page + detail page
+            items = get_list_items(response)
+            for _item in items:
+                item = get_spiders_item(sel=_item, fields=spider['fields'])
+                detail_url = get_detail_url(item)
+                if detail_url is not None:
+                    yield scrapy.Request(url=detail_url,
+                                         callback=self.parse_detail,
+                                         meta={
+                                             'item': item
+                                         })
+            next_url = get_next_url(response)
+            if next_url is not None:
+                yield scrapy.Request(url=next_url)
+
+    def parse_detail(self, response):
+        item = get_spiders_item(sel=response, fields=spider['detail_fields'], item=response.meta['item'])
+        yield item
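The list-detail branch of `ConfigSpiderSpider` hands the partially filled item to the detail request through `Request.meta` and finishes it in `parse_detail`. A minimal, self-contained version of that handoff follows; the site, selectors and field names are placeholders, not values from the patch.

```python
# Minimal sketch of the list -> detail handoff pattern used above.
import scrapy


class ListDetailSketch(scrapy.Spider):
    name = 'list_detail_sketch'
    start_urls = ['http://example.com/']

    def parse(self, response):
        for row in response.css('.item'):
            # fill the list-page fields first
            item = {'title': row.css('.title::text').extract_first()}
            detail_url = row.css('a::attr(href)').extract_first()
            if detail_url is not None:
                # pass the partial item to the detail request
                yield scrapy.Request(url=response.urljoin(detail_url),
                                     callback=self.parse_detail,
                                     meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        # add detail-page fields onto the same dict, then emit it
        item['content'] = response.css('.content::text').extract_first()
        yield item
```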
diff --git a/crawlab/utils/spider.py b/crawlab/utils/spider.py
index 6f7d4ef6..8720c50f 100644
--- a/crawlab/utils/spider.py
+++ b/crawlab/utils/spider.py
@@ -1,9 +1,11 @@
 import os
+import requests

 from datetime import datetime, timedelta

 from bson import ObjectId
+from lxml import etree

-from constants.spider import FILE_SUFFIX_LANG_MAPPING, LangType, SUFFIX_IGNORE, SpiderType
+from constants.spider import FILE_SUFFIX_LANG_MAPPING, LangType, SUFFIX_IGNORE, SpiderType, QueryType, ExtractType
 from constants.task import TaskStatus
 from db.manager import db_manager
@@ -69,3 +71,53 @@ def get_last_n_day_tasks_count(spider_id: ObjectId, n: int) -> list:
             '$gte': (datetime.now() - timedelta(n))
         }
     })
+
+
+def get_list_page_data(spider, sel):
+    data = []
+    if spider['item_selector_type'] == QueryType.XPATH:
+        items = sel.xpath(spider['item_selector'])
+    else:
+        items = sel.cssselect(spider['item_selector'])
+    for item in items:
+        row = {}
+        for f in spider['fields']:
+            if f['type'] == QueryType.CSS:
+                # css selector
+                res = item.cssselect(f['query'])
+            else:
+                # xpath
+                res = item.xpath(f['query'])
+
+            if len(res) > 0:
+                if f['extract_type'] == ExtractType.TEXT:
+                    row[f['name']] = res[0].text
+                else:
+                    row[f['name']] = res[0].get(f['attribute'])
+        data.append(row)
+    return data
+
+
+def get_detail_page_data(url, spider, idx, data):
+    r = requests.get(url)
+
+    sel = etree.HTML(r.content)
+
+    row = {}
+    for f in spider['detail_fields']:
+        if f['type'] == QueryType.CSS:
+            # css selector
+            res = sel.cssselect(f['query'])
+        else:
+            # xpath
+            res = sel.xpath(f['query'])
+
+        if len(res) > 0:
+            if f['extract_type'] == ExtractType.TEXT:
+                row[f['name']] = res[0].text
+            else:
+                row[f['name']] = res[0].get(f['attribute'])
+
+    # assign values
+    for k, v in row.items():
+        data[idx][k] = v
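To make the two helpers above concrete, here is a small usage sketch. It assumes crawlab's `utils.spider` module is importable, that `QueryType.CSS` and `ExtractType.TEXT` are the strings `'css'` and `'text'`, and that the `cssselect` package is installed; the spider config and HTML below are invented.

```python
# Usage sketch for get_list_page_data (get_detail_page_data works the same way,
# but fetches its `url` argument itself and merges the result into data[idx]).
from lxml import etree

from utils.spider import get_list_page_data  # assumes crawlab's package layout is on the path

spider = {
    'item_selector_type': 'css',
    'item_selector': 'li.item',
    'fields': [
        {'name': 'title', 'type': 'css', 'query': '.title', 'extract_type': 'text'},
        {'name': 'link', 'type': 'css', 'query': 'a', 'extract_type': 'attribute',
         'attribute': 'href', 'is_detail': True},
    ],
}

html = '''
<ul>
  <li class="item"><span class="title">First</span><a href="https://example.com/1">go</a></li>
  <li class="item"><span class="title">Second</span><a href="https://example.com/2">go</a></li>
</ul>
'''

sel = etree.HTML(html)
rows = get_list_page_data(spider, sel)
print(rows)  # [{'title': 'First', 'link': 'https://example.com/1'}, {'title': 'Second', ...}]
```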
diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue
index db3bfb99..6c47570a 100644
--- a/frontend/src/components/Config/ConfigList.vue
+++ b/frontend/src/components/Config/ConfigList.vue
@@ -8,7 +8,7 @@
 [template markup not recoverable in this copy]
@@ -20,8 +20,9 @@
 [template markup not recoverable in this copy]
@@ -72,76 +73,40 @@
 [template markup mostly not recoverable in this copy: the removed lines carried the inline
 field-editing rows, the {{$t('Add Field')}} button and the old {{$t('Run')}} / {{$t('Preview')}} /
 {{$t('Save')}} buttons; the added lines keep the {{$t('Run')}} / {{$t('Preview')}} / {{$t('Save')}}
 buttons and, per the script changes below, render the field lists through the new FieldsTableView component]
@@ -149,9 +114,11 @@
 import { mapState } from 'vuex'
+import FieldsTableView from '../TableView/FieldsTableView'

 export default {
   name: 'ConfigList',
+  components: { FieldsTableView },
   data () {
     return {
       crawlTypeList: [
@@ -168,18 +135,20 @@ export default {
     ...mapState('spider', [
       'spiderForm',
       'previewCrawlData'
-    ])
+    ]),
+    fields () {
+      if (this.spiderForm.crawl_type === 'list') {
+        return this.spiderForm.fields
+      } else if (this.spiderForm.crawl_type === 'detail') {
+        return this.spiderForm.detail_fields
+      } else if (this.spiderForm.crawl_type === 'list-detail') {
+        return this.spiderForm.fields.concat(this.spiderForm.detail_fields)
+      } else {
+        return []
+      }
+    }
   },
   methods: {
-    addField () {
-      this.spiderForm.fields.push({
-        type: 'css',
-        extract_type: 'text'
-      })
-    },
-    deleteField (index) {
-      this.spiderForm.fields.splice(index, 1)
-    },
     onSelectCrawlType (value) {
       this.spiderForm.crawl_type = value
     },
@@ -201,6 +170,9 @@ export default {
           this.saveLoading = false
         })
       })
+        .then(() => {
+          this.$store.dispatch('spider/updateSpiderDetailFields')
+        })
        .catch(() => {
          this.$message.error(this.$t('Something wrong happened'))
          this.saveLoading = false
@@ -241,6 +213,7 @@ export default {
     }
   },
   created () {
+    // fields for list page
     if (!this.spiderForm.fields) {
       this.spiderForm.fields = []
       for (let i = 0; i < 3; i++) {
@@ -251,6 +224,19 @@
        })
      }
    }
+
+    // fields for detail page
+    if (!this.spiderForm.detail_fields) {
+      this.spiderForm.detail_fields = []
+      for (let i = 0; i < 3; i++) {
+        this.spiderForm.detail_fields.push({
+          name: `field_${i + 1}`,
+          type: 'css',
+          extract_type: 'text'
+        })
+      }
+    }
+
    if (!this.spiderForm.crawl_type) this.$set(this.spiderForm, 'crawl_type', 'list')
    if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com')
    if (!this.spiderForm.item_selector_type) this.$set(this.spiderForm, 'item_selector_type', 'css')
@@ -261,43 +247,29 @@
 [remaining ConfigList.vue hunk not recoverable in this copy]
diff --git a/frontend/src/components/TableView/FieldsTableView.vue b/frontend/src/components/TableView/FieldsTableView.vue
new file mode 100644
index 00000000..ab2376f7
--- /dev/null
+++ b/frontend/src/components/TableView/FieldsTableView.vue
@@ -0,0 +1,179 @@
 [new 179-line component; its template and script markup are not recoverable in this copy]
diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js
index 2de01073..e6db4795 100644
--- a/frontend/src/i18n/zh.js
+++ b/frontend/src/i18n/zh.js
@@ -128,6 +128,9 @@ export default {
   'Pagination Selector Type': '分页项选择器类别',
   'Preview Results': '预览结果',
   'Obey robots.txt': '遵守Robots协议',
+  'List Page Fields': '列表页字段',
+  'Detail Page Fields': '详情页字段',
+  'Detail Page URL': '详情页URL',

   // 爬虫列表
   'Name': '名称',