diff --git a/crawlab/config/config.py b/crawlab/config/config.py
index 08ab113c..4f0a85db 100644
--- a/crawlab/config/config.py
+++ b/crawlab/config/config.py
@@ -50,6 +50,6 @@ MONGO_PORT = 27017
 MONGO_DB = 'crawlab_test'
 
 # Flask variables
-DEBUG = True
+DEBUG = False
 FLASK_HOST = '127.0.0.1'
 FLASK_PORT = 8000
diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index 758cf29c..0bfe8e4c 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -4,6 +4,7 @@ import shutil
 import subprocess
 from datetime import datetime
 from random import random
+from urllib.parse import urlparse
 
 import gevent
 import requests
@@ -25,7 +26,7 @@ from utils import jsonify
 from utils.deploy import zip_file, unzip_file
 from utils.file import get_file_suffix_stats, get_file_suffix
 from utils.spider import get_lang_by_stats, get_last_n_run_errors_count, get_last_n_day_tasks_count, get_list_page_data, \
-    get_detail_page_data
+    get_detail_page_data, generate_urls
 
 parser = reqparse.RequestParser()
 parser.add_argument('file', type=FileStorage, location='files')
@@ -85,6 +86,9 @@ class SpiderApi(BaseApi):
         # spider start url
         ('start_url', str),
 
+        # url pattern: support generation of urls with patterns
+        ('url_pattern', str),
+
         # spider item selector
         ('item_selector', str),
 
@@ -98,7 +102,7 @@ class SpiderApi(BaseApi):
         ('pagination_selector_type', str),
 
         # whether to obey robots.txt
-        ('obey_robots_txt', str),
+        ('obey_robots_txt', bool),
     )
 
     def get(self, id=None, action=None):
@@ -478,20 +482,29 @@ class SpiderApi(BaseApi):
             }, 400
 
         try:
-            r = requests.get(spider['start_url'], headers={
-                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
-            })
+            r = None
+            for url in generate_urls(spider['start_url']):
+                r = requests.get(url, headers={
+                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
+                })
+                break
         except Exception as err:
             return {
                 'status': 'ok',
                 'error': 'connection error'
             }, 500
 
-        if r.status_code != 200:
+        if not r:
             return {
-                'status': 'ok',
-                'error': 'status code is not 200, but %s' % r.status_code
-            }
+                'status': 'ok',
+                'error': 'response is not returned'
+            }, 500
+
+        if r and r.status_code != 200:
+            return {
+                'status': 'ok',
+                'error': 'status code is not 200, but %s' % r.status_code
+            }, r.status_code
 
         # get html parse tree
         sel = etree.HTML(r.content)
@@ -502,10 +515,21 @@ class SpiderApi(BaseApi):
     def _get_text_child_tags(sel):
         tags = []
         for tag in sel.iter():
-            if tag.text is not None:
+            if tag.text is not None and tag.text.strip() != '':
                 tags.append(tag)
         return tags
 
+    @staticmethod
+    def _get_a_child_tags(sel):
+        tags = []
+        for tag in sel.iter():
+            if tag.tag == 'a':
+                if tag.get('href') is not None and not tag.get('href').startswith('#') and not tag.get(
+                        'href').startswith('javascript'):
+                    tags.append(tag)
+
+        return tags
+
     def preview_crawl(self, id: str):
         spider = db_manager.get(col_name='spiders', id=id)
@@ -544,6 +568,9 @@ class SpiderApi(BaseApi):
                 if f.get('is_detail'):
                     url = d.get(f['name'])
                     if url is not None:
+                        if not url.startswith('http') and not url.startswith('//'):
+                            u = urlparse(spider['start_url'])
+                            url = f'{u.scheme}://{u.netloc}{url}'
                         ev_list.append(gevent.spawn(get_detail_page_data, url, spider, idx, data))
                     break
@@ -566,7 +593,7 @@ class SpiderApi(BaseApi):
         sel = self._get_html(spider)
 
         # when error happens, return
-        if type(sel) == type(tuple):
+        if type(sel) == tuple:
             return sel
 
         list_tag_list = []
@@ -592,15 +619,54 @@ class SpiderApi(BaseApi):
         # find the list tag with the most child text tags
         _tag_list = []
-        _max_tag = None
-        _max_num = 0
+        max_tag = None
+        max_num = 0
         for tag in list_tag_list:
             _child_text_tags = self._get_text_child_tags(tag[0])
-            if len(_child_text_tags) > _max_num:
-                _max_tag = tag
-                _max_num = len(_child_text_tags)
+            if len(_child_text_tags) > max_num:
+                max_tag = tag
+                max_num = len(_child_text_tags)
 
-        # TODO: extract list fields
+        # get list item selector
+        item_selector = None
+        if max_tag.get('id') is not None:
+            item_selector = f'#{max_tag.get("id")} > {max_tag.getchildren()[0].tag}'
+        elif max_tag.get('class') is not None:
+            if len(sel.cssselect(f'.{max_tag.get("class")}')) == 1:
+                item_selector = f'.{max_tag.get("class")} > {max_tag.getchildren()[0].tag}'
+
+        # get list fields
+        fields = []
+        if item_selector is not None:
+            for i, tag in enumerate(self._get_text_child_tags(max_tag[0])):
+                if tag.get('class') is not None:
+                    cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
+                    # print(tag.tag + '.' + cls_str)
+                    if len(tag.cssselect(f'{tag.tag}.{cls_str}')) == 1:
+                        fields.append({
+                            'name': f'field{i + 1}',
+                            'type': 'css',
+                            'extract_type': 'text',
+                            'query': f'{tag.tag}.{cls_str}',
+                        })
+
+            for i, tag in enumerate(self._get_a_child_tags(max_tag[0])):
+                # if the tag is <a>, extract its href
+                if tag.get('class') is not None:
+                    cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
+                    fields.append({
+                        'name': f'field{i + 1}_url',
+                        'type': 'css',
+                        'extract_type': 'attribute',
+                        'attribute': 'href',
+                        'query': f'{tag.tag}.{cls_str}',
+                    })
+
+        return {
+            'status': 'ok',
+            'item_selector': item_selector,
+            'fields': fields
+        }
 
 
 class SpiderImportApi(Resource):
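
The extract-fields logic added above ranks candidate containers by how many non-empty text descendants they hold, then derives CSS queries from element classes. A standalone sketch of that heuristic, using hypothetical HTML and a hand-picked candidate set (the endpoint builds its `list_tag_list` from the fetched page instead):

```python
from lxml import etree

# Hypothetical list page standing in for the fetched start_url HTML.
html = '''
<div>
  <ul class="nav"><li><a href="/home">Home</a></li></ul>
  <ul class="items">
    <li class="item"><span class="title">Post 1</span><a class="link" href="/p/1">more</a></li>
    <li class="item"><span class="title">Post 2</span><a class="link" href="/p/2">more</a></li>
    <li class="item"><span class="title">Post 3</span><a class="link" href="/p/3">more</a></li>
  </ul>
</div>
'''
sel = etree.HTML(html)


def text_child_tags(el):
    # Descendants carrying non-empty text, mirroring _get_text_child_tags.
    return [t for t in el.iter() if t.text is not None and t.text.strip() != '']


# The content list beats the nav list: 6 text-bearing descendants vs 1.
candidates = sel.xpath('//ul | //ol | //table')
best = max(candidates, key=lambda el: len(text_child_tags(el)))

# Derive the item selector the way the endpoint does: class plus first child tag.
item_selector = f'.{best.get("class")} > {best[0].tag}'
print(item_selector)  # .items > li
```
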
diff --git a/crawlab/spiders/spiders/spiders/config_spider.py b/crawlab/spiders/spiders/spiders/config_spider.py
index fe801f8e..13fa82bf 100644
--- a/crawlab/spiders/spiders/spiders/config_spider.py
+++ b/crawlab/spiders/spiders/spiders/config_spider.py
@@ -1,10 +1,15 @@
 # -*- coding: utf-8 -*-
+import os
+import sys
 from urllib.parse import urlparse
 
 import scrapy
 
 from spiders.db import spider
 from spiders.items import SpidersItem
+from spiders.utils import generate_urls
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')))
 
 
 def get_detail_url(item):
@@ -75,8 +80,10 @@ def get_next_url(response):
 
 class ConfigSpiderSpider(scrapy.Spider):
     name = 'config_spider'
-    # allowed_domains = []
-    start_urls = [spider['start_url']]
+
+    def start_requests(self):
+        for url in generate_urls(spider['start_url']):
+            yield scrapy.Request(url=url)
 
     def parse(self, response):
@@ -91,7 +98,7 @@ class ConfigSpiderSpider(scrapy.Spider):
             yield scrapy.Request(url=next_url)
 
         elif spider['crawl_type'] == 'detail':
-            # TODO: detail page onlny
+            # TODO: detail page only
             # detail page only
             pass
diff --git a/crawlab/utils/spider.py b/crawlab/utils/spider.py
index 8720c50f..9a2b48df 100644
--- a/crawlab/utils/spider.py
+++ b/crawlab/utils/spider.py
@@ -1,4 +1,7 @@
+import itertools
 import os
+import re
+
 import requests
 from datetime import datetime, timedelta
 
@@ -121,3 +124,51 @@ def get_detail_page_data(url, spider, idx, data):
     # assign values
     for k, v in row.items():
         data[idx][k] = v
+
+
+def generate_urls(base_url: str) -> str:
+    url = base_url
+
+    # number range list
+    list_arr = []
+    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
+        try:
+            _min = int(res[0])
+            _max = int(res[1])
+        except ValueError as err:
+            raise ValueError(f'{base_url} is not a valid URL pattern')
+
+        # list
+        _list = range(_min, _max + 1)
+
+        # key
+        _key = f'n{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)
+
+    # string list
+    for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
+        # list
+        _list = res.split(',')
+
+        # key
+        _key = f's{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)
+
+    # combine together
+    _list_arr = []
+    for res in itertools.product(*map(lambda x: x[0], list_arr)):
+        _url = url
+        for _arr, _rep in zip(list_arr, res):
+            _list, _key = _arr
+            _url = _url.replace('{' + _key + '}', str(_rep), 1)
+        yield _url
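
For reference, the expansion rules implemented by `generate_urls` above: `{min,max}` placeholders become inclusive numeric ranges, `[a,b,...]` placeholders become literal alternatives, and multiple placeholders are combined with `itertools.product`. A usage sketch (the import path assumes the repo root is on `sys.path`, as in `routes/spiders.py`):

```python
from utils.spider import generate_urls

# A numeric range placeholder expands inclusively:
print(list(generate_urls('http://example.com/page/{1,3}')))
# ['http://example.com/page/1', 'http://example.com/page/2', 'http://example.com/page/3']

# A string list placeholder expands to each alternative; combining the two
# placeholders below yields 2 * 2 = 4 URLs via the cartesian product:
for url in generate_urls('http://example.com/[news,sports]/{1,2}'):
    print(url)
```
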
diff --git a/frontend/.env.development b/frontend/.env.development
index 53240f56..dade16fb 100644
--- a/frontend/.env.development
+++ b/frontend/.env.development
@@ -1 +1,2 @@
-API_BASE_URL=http://localhost:5000/api
+NODE_ENV='development'
+VUE_APP_BASE_URL=http://localhost:8000/api
diff --git a/frontend/.env.production b/frontend/.env.production
index dc42502a..a8b89254 100644
--- a/frontend/.env.production
+++ b/frontend/.env.production
@@ -1 +1,2 @@
-API_BASE_URL=http://139.129.230.98:8000/api
+NODE_ENV='production'
+VUE_APP_BASE_URL=http://crawlab.cn:8000/api
diff --git a/frontend/.eslintrc.js b/frontend/.eslintrc.js
index 98d04316..5dadb0d8 100644
--- a/frontend/.eslintrc.js
+++ b/frontend/.eslintrc.js
@@ -13,5 +13,8 @@ module.exports = {
   },
   parserOptions: {
     parser: 'babel-eslint'
+  },
+  globals: {
+    '_hmt': 1
   }
 }
diff --git a/frontend/index.html b/frontend/index.html
index 98cf9b85..59f3bbb9 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -1,15 +1,18 @@
 [markup hunk lost in extraction; the change rewrites the page skeleton around the "Crawlab" <title> and the <div id="app"></div> mount point]
diff --git a/frontend/package.json b/frontend/package.json
index 1e005431..701463a3 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,12 +1,12 @@
 {
   "name": "crawlab",
-  "version": "0.2.0",
+  "version": "0.2.1",
   "private": true,
   "scripts": {
     "serve": "cross-env NODE_ENV=development vue-cli-service serve --ip=0.0.0.0",
-    "serve-prod": "cross-env NODE_ENV=production vue-cli-service serve --mode=production --ip=0.0.0.0",
+    "serve:prod": "cross-env NODE_ENV=production vue-cli-service serve --mode=production --ip=0.0.0.0",
     "config": "vue ui",
-    "build": "vue-cli-service build",
+    "build:prod": "vue-cli-service build --mode production",
     "lint": "vue-cli-service lint",
     "test:unit": "vue-cli-service test:unit"
   },
@@ -23,6 +23,7 @@
     "nprogress": "0.2.0",
     "path": "^0.12.7",
     "vue": "^2.5.22",
+    "vue-ba": "^1.2.5",
     "vue-codemirror-lite": "^1.0.4",
     "vue-i18n": "^8.9.0",
     "vue-router": "^3.0.1",
diff --git a/frontend/src/App.vue b/frontend/src/App.vue
index 38d6c19d..2a86e532 100644
--- a/frontend/src/App.vue
+++ b/frontend/src/App.vue
@@ -10,8 +10,45 @@ import DialogView from './components/Common/DialogView'
 
 export default {
   name: 'App',
+  data () {
+    return {
+      msgPopup: undefined
+    }
+  },
   components: {
     DialogView
+  },
+  computed: {
+    useStats () {
+      return localStorage.getItem('useStats')
+    }
+  },
+  methods: {},
+  mounted () {
+    window.setUseStats = (value) => {
+      localStorage.setItem('useStats', value)
+      document.querySelector('.el-message__closeBtn').click()
+      if (value === 1) {
+        _hmt.push(['_trackPageview', '/allow_stats'])
+      } else {
+        _hmt.push(['_trackPageview', '/disallow_stats'])
+      }
+    }
+
+    // first-time user
+    if (this.useStats === undefined || this.useStats === null) {
+      this.$message({
+        type: 'info',
+        dangerouslyUseHTMLString: true,
+        showClose: true,
+        duration: 0,
+        message: this.$t('Do you allow us to collect some statistics to improve Crawlab?') +
+          '<div style="margin-top: 10px;">' +
+          '<button class="message-btn success" onclick="setUseStats(1)">Yes</button>' +
+          '<button class="message-btn danger" onclick="setUseStats(0)">No</button>' +
+          '</div>'
+      })
+    }
   }
 }
@@ -52,4 +89,31 @@ export default {
   .el-form .el-form-item {
     margin-bottom: 10px;
   }
+
+  .message-btn {
+    margin: 0 5px;
+    padding: 5px 10px;
+    background: transparent;
+    color: #909399;
+    font-size: 12px;
+    border-radius: 4px;
+    cursor: pointer;
+  }
+
+  .message-btn:hover {
+    opacity: 0.8;
+    text-decoration: underline;
+  }
+
+  .message-btn.success {
+    background: #67c23a;
+    border-color: #67c23a;
+    color: #fff;
+  }
+
+  .message-btn.danger {
+    background: #f56c6c;
+    border-color: #f56c6c;
+    color: #fff;
+  }
diff --git a/frontend/src/api/request.js b/frontend/src/api/request.js
index 53603af8..4aaa0c04 100644
--- a/frontend/src/api/request.js
+++ b/frontend/src/api/request.js
@@ -1,15 +1,12 @@
 import axios from 'axios'
 
-let baseUrl = 'http://localhost:8000/api'
-if (process.env.NODE_ENV === 'production') {
-  baseUrl = 'http://139.129.230.98:8000/api'
-}
+let baseUrl = process.env.VUE_APP_BASE_URL ? process.env.VUE_APP_BASE_URL : 'http://localhost:8000/api'
 // console.log(process.env)
 // const baseUrl = process.env.API_BASE_URL || 'http://localhost:8000/api'
 
 const request = (method, path, params, data) => {
   return new Promise((resolve, reject) => {
-    const url = `${baseUrl}${path}`
+    const url = baseUrl + path
     axios({
       method,
       url,
diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue
index 908f2c9a..1b0e9d58 100644
--- a/frontend/src/components/Config/ConfigList.vue
+++ b/frontend/src/components/Config/ConfigList.vue
@@ -34,12 +34,15 @@
 [template hunk lost in extraction; it adds a form item for the new url_pattern field alongside the start_url input]
@@ -79,7 +82,8 @@
       <el-button type="primary" @click="onRun">{{$t('Run')}}</el-button>
-      <el-button type="primary" @click="onExtractFields">{{$t('Extract Fields')}}</el-button>
+      <el-button type="primary" :loading="extractFieldsLoading" @click="onExtractFields">{{$t('Extract Fields')}}</el-button>
+      <el-button type="primary" @click="onPreview">{{$t('Preview')}}</el-button>
       <el-button type="success" @click="onSave">{{$t('Save')}}</el-button>
@@ -214,6 +218,24 @@ export default {
       })
     },
     onExtractFields () {
+      this.onSave()
+        .then(() => {
+          this.extractFieldsLoading = true
+          this.$store.dispatch('spider/extractFields')
+            .then(response => {
+              if (response.data.item_selector) {
+                this.$set(this.spiderForm, 'item_selector', response.data.item_selector)
+                this.$set(this.spiderForm, 'item_selector_type', 'css')
+              }
+
+              if (response.data.fields && response.data.fields.length) {
+                this.spiderForm.fields = response.data.fields
+              }
+            })
+            .finally(() => {
+              this.extractFieldsLoading = false
+            })
+        })
     }
   },
   created () {
@@ -245,7 +267,7 @@ export default {
     if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com')
     if (!this.spiderForm.item_selector_type) this.$set(this.spiderForm, 'item_selector_type', 'css')
     if (!this.spiderForm.pagination_selector_type) this.$set(this.spiderForm, 'pagination_selector_type', 'css')
-    if (!this.spiderForm.obey_robots_txt) this.$set(this.spiderForm, 'obey_robots_txt', true)
+    if (this.spiderForm.obey_robots_txt === undefined) this.$set(this.spiderForm, 'obey_robots_txt', true)
   }
 }
diff --git a/frontend/src/main.js b/frontend/src/main.js
index d90e7bcf..721f0f63 100644
--- a/frontend/src/main.js
+++ b/frontend/src/main.js
@@ -12,6 +12,8 @@ import 'font-awesome/scss/font-awesome.scss'// FontAwesome
 
 import 'codemirror/lib/codemirror.css'
 
+// import ba from 'vue-ba'
+
 import App from './App'
 import store from './store'
 import router from './router'
@@ -24,8 +26,23 @@ import i18n from './i18n'
 
 Vue.use(ElementUI, { locale })
 
+// Vue.use(ba, 'c35e3a563a06caee2524902c81975add')
+// Vue.use(ba, {
+//   siteId: 'c35e3a563a06caee2524902c81975add'
+// })
+
 Vue.config.productionTip = false
 
+// Baidu Analytics
+window._hmt = window._hmt || [];
+(function () {
+  let hm = document.createElement('script')
+  hm.src = 'https://hm.baidu.com/hm.js?c35e3a563a06caee2524902c81975add'
+  let s = document.getElementsByTagName('script')[0]
+  s.parentNode.insertBefore(hm, s)
+})()
+
+// inject request api
 Vue.prototype.$request = request
 
 const app = new Vue({
diff --git a/frontend/src/router/index.js b/frontend/src/router/index.js
index bf96c11b..4eddc102 100644
--- a/frontend/src/router/index.js
+++ b/frontend/src/router/index.js
@@ -222,4 +222,12 @@ router.beforeEach((to, from, next) => {
   next()
 })
 
+router.afterEach((to, from, next) => {
+  if (to.path) {
+    if (localStorage.getItem('useStats') !== '0') {
+      window._hmt.push(['_trackPageview', to.path])
+    }
+  }
+})
+
 export default router
diff --git a/frontend/src/store/index.js b/frontend/src/store/index.js
index 57fb5065..d21e0b74 100644
--- a/frontend/src/store/index.js
+++ b/frontend/src/store/index.js
@@ -12,6 +12,7 @@ import file from './modules/file'
 import schedule from './modules/schedule'
 import lang from './modules/lang'
 import site from './modules/site'
+import stats from './modules/stats'
 import getters from './getters'
 
 Vue.use(Vuex)
@@ -29,7 +30,9 @@ const store = new Vuex.Store({
     file,
     schedule,
     lang,
-    site
+    site,
+    // Baidu Analytics
+    stats
   },
   getters
 })
diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js
index 673ce33f..831c0d0e 100644
--- a/frontend/src/store/modules/spider.js
+++ b/frontend/src/store/modules/spider.js
@@ -105,6 +105,7 @@ const actions = {
       // configurable spider
       crawl_type: state.spiderForm.crawl_type,
       start_url: state.spiderForm.start_url,
+      url_pattern: state.spiderForm.url_pattern,
       item_selector: state.spiderForm.item_selector,
       item_selector_type: state.spiderForm.item_selector_type,
       pagination_selector: state.spiderForm.pagination_selector,
@@ -207,6 +208,9 @@ const actions = {
       .then(response => {
         commit('SET_PREVIEW_CRAWL_DATA', response.data.items)
       })
+  },
+  extractFields ({ state, commit }) {
+    return request.post(`/spiders/${state.spiderForm._id}/extract_fields`)
   }
 }
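
The new `extractFields` store action above simply POSTs to the `extract_fields` endpoint added in `routes/spiders.py`. The same call from Python, with a hypothetical host and spider ID:

```python
import requests

spider_id = '5cd9af90b22a3e5fdeadbeef'  # hypothetical ObjectId from the spiders collection
resp = requests.post(f'http://localhost:8000/api/spiders/{spider_id}/extract_fields')

body = resp.json()
print(body['item_selector'])  # e.g. '.items > li'
print(body['fields'])         # auto-derived CSS field queries
```
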
diff --git a/frontend/src/store/modules/stats.js b/frontend/src/store/modules/stats.js
new file mode 100644
index 00000000..0a0bbe16
--- /dev/null
+++ b/frontend/src/store/modules/stats.js
@@ -0,0 +1,14 @@
+const state = {}
+const getters = {
+  useStats () {
+    return localStorage.getItem('useStats')
+  }
+}
+const mutations = {}
+const actions = {}
+export default {
+  state,
+  getters,
+  mutations,
+  actions
+}
diff --git a/frontend/src/views/home/Home.vue b/frontend/src/views/home/Home.vue
index e721a95c..c46c2431 100644
--- a/frontend/src/views/home/Home.vue
+++ b/frontend/src/views/home/Home.vue
@@ -85,6 +85,9 @@ export default {
         this.dailyTasks = response.data.daily_tasks
         this.initEchartsDailyTasks()
       })
+  },
+  mounted () {
+    // this.$ba.trackPageview('/')
   }
 }
diff --git a/frontend/src/views/layout/components/TagsView.vue b/frontend/src/views/layout/components/TagsView.vue
index 406ed45c..e97c8dc8 100644
--- a/frontend/src/views/layout/components/TagsView.vue
+++ b/frontend/src/views/layout/components/TagsView.vue
@@ -2,15 +2,15 @@
   <div class="tags-view-container">
     <scroll-pane ref="scrollPane" class="tags-view-wrapper">
       <router-link
-          v-for="tag in visitedViews"
-          ref="tag"
-          :class="isActive(tag)?'active':''"
-          :to="{ path: tag.path, query: tag.query, fullPath: tag.fullPath }"
-          :key="tag.path"
-          tag="span"
-          class="tags-view-item"
-          @click.middle.native="closeSelectedTag(tag)"
-          @contextmenu.prevent.native="openMenu(tag,$event)">
+        v-for="tag in visitedViews"
+        ref="tag"
+        :class="isActive(tag)?'active':''"
+        :to="{ path: tag.path, query: tag.query, fullPath: tag.fullPath }"
+        :key="tag.path"
+        tag="span"
+        class="tags-view-item"
+        @click.middle.native="closeSelectedTag(tag)"
+        @contextmenu.prevent.native="openMenu(tag,$event)">
         {{ $t(generateTitle(tag.title)) }}
@@ -47,7 +47,7 @@ export default {
       return this.$store.state.tagsView.visitedViews
     },
     routers () {
-      return this.$store.state.permission.routers
+      return this.$store.state.permission ? this.$store.state.permission.routers : []
     }
   },
   watch: {
diff --git a/frontend/yarn.lock b/frontend/yarn.lock
index 159ea9c1..75d4b025 100644
--- a/frontend/yarn.lock
+++ b/frontend/yarn.lock
@@ -8401,6 +8401,14 @@ vm-browserify@0.0.4:
   dependencies:
     indexof "0.0.1"
 
+vue-ba@^1.2.5:
+  version "1.2.5"
+  resolved "https://registry.npm.taobao.org/vue-ba/download/vue-ba-1.2.5.tgz#fef30732ee749a65a81a4f47113527ee41fda64b"
+  integrity sha1-/vMHMu50mmWoGk9HETUn7kH9pks=
+  dependencies:
+    deep-equal "^1.0.1"
+    vue "^2.3.3"
+
 vue-codemirror-lite@^1.0.4:
   version "1.0.4"
   resolved "http://registry.npm.taobao.org/vue-codemirror-lite/download/vue-codemirror-lite-1.0.4.tgz#48a5cd7d17c0914503c8cd9d9b56b438e49c3410"
@@ -8485,6 +8493,11 @@ vue-template-es2015-compiler@^1.6.0, vue-template-es2015-compiler@^1.8.2:
   version "1.8.2"
   resolved "http://registry.npm.taobao.org/vue-template-es2015-compiler/download/vue-template-es2015-compiler-1.8.2.tgz#dd73e80ba58bb65dd7a8aa2aeef6089cf6116f2a"
 
+vue@^2.3.3:
+  version "2.6.10"
+  resolved "https://registry.npm.taobao.org/vue/download/vue-2.6.10.tgz#a72b1a42a4d82a721ea438d1b6bf55e66195c637"
+  integrity sha1-pysaQqTYKnIepDjRtr9V5mGVxjc=
+
 vue@^2.5.17, vue@^2.5.22:
   version "2.6.6"
   resolved "https://registry.yarnpkg.com/vue/-/vue-2.6.6.tgz#dde41e483c11c46a7bf523909f4f2f816ab60d25"
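
Finally, the relative-URL handling added to `preview_crawl` in `routes/spiders.py` reduces to a scheme/netloc join; a minimal sketch with assumed sample values:

```python
from urllib.parse import urlparse

start_url = 'https://example.com/news?page=1'  # hypothetical spider start_url
detail_url = '/articles/42'                    # hypothetical extracted href

# Mirror the check in preview_crawl: only bare paths get absolutized;
# full URLs and protocol-relative URLs ('//host/...') pass through unchanged.
if not detail_url.startswith('http') and not detail_url.startswith('//'):
    u = urlparse(start_url)
    detail_url = f'{u.scheme}://{u.netloc}{detail_url}'

print(detail_url)  # https://example.com/articles/42
```
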