Merge pull request #43 from tikazyq/develop

Develop
This commit is contained in:
Marvin Zhang
2019-05-29 22:10:09 +08:00
committed by GitHub
20 changed files with 330 additions and 52 deletions

View File

@@ -50,6 +50,6 @@ MONGO_PORT = 27017
MONGO_DB = 'crawlab_test'
# Flask 变量
DEBUG = True
DEBUG = False
FLASK_HOST = '127.0.0.1'
FLASK_PORT = 8000

View File

@@ -4,6 +4,7 @@ import shutil
import subprocess
from datetime import datetime
from random import random
from urllib.parse import urlparse
import gevent
import requests
@@ -25,7 +26,7 @@ from utils import jsonify
from utils.deploy import zip_file, unzip_file
from utils.file import get_file_suffix_stats, get_file_suffix
from utils.spider import get_lang_by_stats, get_last_n_run_errors_count, get_last_n_day_tasks_count, get_list_page_data, \
get_detail_page_data
get_detail_page_data, generate_urls
parser = reqparse.RequestParser()
parser.add_argument('file', type=FileStorage, location='files')
@@ -85,6 +86,9 @@ class SpiderApi(BaseApi):
# spider start url
('start_url', str),
# url pattern: support generation of urls with patterns
('url_pattern', str),
# spider item selector
('item_selector', str),
@@ -98,7 +102,7 @@ class SpiderApi(BaseApi):
('pagination_selector_type', str),
# whether to obey robots.txt
('obey_robots_txt', str),
('obey_robots_txt', bool),
)
def get(self, id=None, action=None):
@@ -478,20 +482,29 @@ class SpiderApi(BaseApi):
}, 400
try:
r = requests.get(spider['start_url'], headers={
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
})
r = None
for url in generate_urls(spider['start_url']):
r = requests.get(url, headers={
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
})
break
except Exception as err:
return {
'status': 'ok',
'error': 'connection error'
}, 500
if r.status_code != 200:
if not r:
return {
'status': 'ok',
'error': 'status code is not 200, but %s' % r.status_code
}
'status': 'ok',
'error': 'response is not returned'
}, 500
if r and r.status_code != 200:
return {
'status': 'ok',
'error': 'status code is not 200, but %s' % r.status_code
}, r.status_code
# get html parse tree
sel = etree.HTML(r.content)
@@ -502,10 +515,21 @@ class SpiderApi(BaseApi):
def _get_text_child_tags(sel):
tags = []
for tag in sel.iter():
if tag.text is not None:
if tag.text is not None and tag.text.strip() != '':
tags.append(tag)
return tags
@staticmethod
def _get_a_child_tags(sel):
    """Return all <a> descendant tags of *sel* that carry a usable link.

    Anchors with no href attribute, pure fragment links (``#...``) and
    ``javascript:`` pseudo-links are excluded.
    """
    anchors = []
    for node in sel.iter():
        if node.tag != 'a':
            continue
        href = node.get('href')
        if href is None or href.startswith('#') or href.startswith('javascript'):
            continue
        anchors.append(node)
    return anchors
def preview_crawl(self, id: str):
spider = db_manager.get(col_name='spiders', id=id)
@@ -544,6 +568,9 @@ class SpiderApi(BaseApi):
if f.get('is_detail'):
url = d.get(f['name'])
if url is not None:
if not url.startswith('http') and not url.startswith('//'):
u = urlparse(spider['start_url'])
url = f'{u.scheme}://{u.netloc}{url}'
ev_list.append(gevent.spawn(get_detail_page_data, url, spider, idx, data))
break
@@ -566,7 +593,7 @@ class SpiderApi(BaseApi):
sel = self._get_html(spider)
# when error happens, return
if type(sel) == type(tuple):
if type(sel) == tuple:
return sel
list_tag_list = []
@@ -592,15 +619,54 @@ class SpiderApi(BaseApi):
# find the list tag with the most child text tags
_tag_list = []
_max_tag = None
_max_num = 0
max_tag = None
max_num = 0
for tag in list_tag_list:
_child_text_tags = self._get_text_child_tags(tag[0])
if len(_child_text_tags) > _max_num:
_max_tag = tag
_max_num = len(_child_text_tags)
if len(_child_text_tags) > max_num:
max_tag = tag
max_num = len(_child_text_tags)
# TODO: extract list fields
# get list item selector
item_selector = None
if max_tag.get('id') is not None:
item_selector = f'#{max_tag.get("id")} > {max_tag.getchildren()[0].tag}'
elif max_tag.get('class') is not None:
if len(sel.cssselect(f'.{max_tag.get("class")}')) == 1:
item_selector = f'.{max_tag.get("class")} > {max_tag.getchildren()[0].tag}'
# get list fields
fields = []
if item_selector is not None:
for i, tag in enumerate(self._get_text_child_tags(max_tag[0])):
if tag.get('class') is not None:
cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
# print(tag.tag + '.' + cls_str)
if len(tag.cssselect(f'{tag.tag}.{cls_str}')) == 1:
fields.append({
'name': f'field{i + 1}',
'type': 'css',
'extract_type': 'text',
'query': f'{tag.tag}.{cls_str}',
})
for i, tag in enumerate(self._get_a_child_tags(max_tag[0])):
# if the tag is <a...></a>, extract its href
if tag.get('class') is not None:
cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
fields.append({
'name': f'field{i + 1}_url',
'type': 'css',
'extract_type': 'attribute',
'attribute': 'href',
'query': f'{tag.tag}.{cls_str}',
})
return {
'status': 'ok',
'item_selector': item_selector,
'fields': fields
}
class SpiderImportApi(Resource):

View File

@@ -1,10 +1,15 @@
# -*- coding: utf-8 -*-
import os
import sys
from urllib.parse import urlparse
import scrapy
from spiders.db import spider
from spiders.items import SpidersItem
from spiders.utils import generate_urls
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')))
def get_detail_url(item):
@@ -75,8 +80,10 @@ def get_next_url(response):
class ConfigSpiderSpider(scrapy.Spider):
name = 'config_spider'
# allowed_domains = []
start_urls = [spider['start_url']]
def start_requests(self):
for url in generate_urls(spider['start_url']):
yield scrapy.Request(url=url)
def parse(self, response):
@@ -91,7 +98,7 @@ class ConfigSpiderSpider(scrapy.Spider):
yield scrapy.Request(url=next_url)
elif spider['crawl_type'] == 'detail':
# TODO: detail page onlny
# TODO: detail page only
# detail page only
pass

View File

@@ -1,4 +1,7 @@
import itertools
import os
import re
from datetime import datetime, timedelta
from typing import Iterator

import requests
@@ -121,3 +124,51 @@ def get_detail_page_data(url, spider, idx, data):
# assign values
for k, v in row.items():
data[idx][k] = v
def generate_urls(base_url: str) -> Iterator[str]:
    """Expand a URL pattern into concrete URLs.

    Supported placeholders in *base_url*:
      * ``{min,max}`` — an inclusive integer range, e.g. ``page/{1,3}``
        yields ``page/1``, ``page/2``, ``page/3``.
      * ``[a,b,c]``   — a list of string alternatives, e.g. ``[en,zh]``.

    Multiple placeholders are combined as a cartesian product
    (number ranges first, then string lists, matching their order of
    appearance). A pattern without placeholders yields *base_url* once.

    :param base_url: URL possibly containing the placeholders above.
    :raises ValueError: if a numeric placeholder cannot be parsed.
    :return: generator over all expanded URLs.
    """
    url = base_url
    # (values, key) pairs; each placeholder is rewritten to a unique
    # `{key}` marker so it can be substituted independently below.
    list_arr = []

    # number range placeholders: {min,max}
    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
        try:
            _min = int(res[0])
            _max = int(res[1])
        except ValueError as err:
            raise ValueError(f'{base_url} is not a valid URL pattern') from err
        key = f'n{i}'
        list_arr.append((range(_min, _max + 1), key))
        # replace only the first occurrence: identical patterns may repeat
        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + key + '}', 1)

    # string list placeholders: [a,b,c]
    for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
        key = f's{i}'
        list_arr.append((res.split(','), key))
        url = url.replace('[' + res + ']', '{' + key + '}', 1)

    # cartesian product over all placeholder value lists; with no
    # placeholders this yields one empty combination → the plain URL
    for combo in itertools.product(*(values for values, _ in list_arr)):
        _url = url
        for (_, key), value in zip(list_arr, combo):
            _url = _url.replace('{' + key + '}', str(value), 1)
        yield _url

View File

@@ -1 +1,2 @@
API_BASE_URL=http://localhost:5000/api
NODE_ENV='development'
VUE_APP_BASE_URL=http://localhost:8000/api

View File

@@ -1 +1,2 @@
API_BASE_URL=http://139.129.230.98:8000/api
NODE_ENV='production'
VUE_APP_BASE_URL=http://crawlab.cn:8000/api

View File

@@ -13,5 +13,8 @@ module.exports = {
},
parserOptions: {
parser: 'babel-eslint'
},
globals: {
'_hmt': 1
}
}

View File

@@ -1,15 +1,18 @@
<!DOCTYPE html>
<html lang="en">
<head>
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="renderer" content="webkit">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
<script>
</script>
<title>Crawlab</title>
</head>
<body>
<!--<script src=<%= BASE_URL %>/tinymce4.7.5/tinymce.min.js></script>-->
<div id="app"></div>
<!-- built files will be auto injected -->
</body>
</head>
<body>
<!--<script src=<%= BASE_URL %>/tinymce4.7.5/tinymce.min.js></script>-->
<div id="app"></div>
<!-- built files will be auto injected -->
</body>
</html>

View File

@@ -1,12 +1,12 @@
{
"name": "crawlab",
"version": "0.2.0",
"version": "0.2.1",
"private": true,
"scripts": {
"serve": "cross-env NODE_ENV=development vue-cli-service serve --ip=0.0.0.0",
"serve-prod": "cross-env NODE_ENV=production vue-cli-service serve --mode=production --ip=0.0.0.0",
"serve:prod": "cross-env NODE_ENV=production vue-cli-service serve --mode=production --ip=0.0.0.0",
"config": "vue ui",
"build": "vue-cli-service build",
"build:prod": "vue-cli-service build --mode production",
"lint": "vue-cli-service lint",
"test:unit": "vue-cli-service test:unit"
},
@@ -23,6 +23,7 @@
"nprogress": "0.2.0",
"path": "^0.12.7",
"vue": "^2.5.22",
"vue-ba": "^1.2.5",
"vue-codemirror-lite": "^1.0.4",
"vue-i18n": "^8.9.0",
"vue-router": "^3.0.1",

View File

@@ -10,8 +10,45 @@ import DialogView from './components/Common/DialogView'
export default {
name: 'App',
data () {
return {
msgPopup: undefined
}
},
components: {
DialogView
},
computed: {
useStats () {
return localStorage.getItem('useStats')
}
},
methods: {},
mounted () {
window.setUseStats = (value) => {
localStorage.setItem('useStats', value)
document.querySelector('.el-message__closeBtn').click()
if (value === 1) {
_hmt.push(['_trackPageview', '/allow_stats'])
} else {
_hmt.push(['_trackPageview', '/disallow_stats'])
}
}
// first-time user
if (this.useStats === undefined || this.useStats === null) {
this.$message({
type: 'info',
dangerouslyUseHTMLString: true,
showClose: true,
duration: 0,
message: this.$t('<p>Do you allow us to collect some statistics to improve Crawlab?</p>' +
'<div style="text-align: center;margin-top: 10px;">' +
'<button class="message-btn" onclick="setUseStats(1)">' + this.$t('Yes') + '</button>' +
'<button class="message-btn" onclick="setUseStats(0)">' + this.$t('No') + '</button>' +
'</div>')
})
}
}
}
</script>
@@ -52,4 +89,31 @@ export default {
.el-form .el-form-item {
margin-bottom: 10px;
}
.message-btn {
margin: 0 5px;
padding: 5px 10px;
background: transparent;
color: #909399;
font-size: 12px;
border-radius: 4px;
cursor: pointer;
}
.message-btn:hover {
opacity: 0.8;
text-decoration: underline;
}
.message-btn.success {
background: #67c23a;
border-color: #67c23a;
color: #fff;
}
.message-btn.danger {
background: #f56c6c;
border-color: #f56c6c;
color: #fff;
}
</style>

View File

@@ -1,15 +1,12 @@
import axios from 'axios'
let baseUrl = 'http://localhost:8000/api'
if (process.env.NODE_ENV === 'production') {
baseUrl = 'http://139.129.230.98:8000/api'
}
let baseUrl = process.env.VUE_APP_API_BASE_URL ? process.env.VUE_APP_API_BASE_URL : 'http://localhost:8000/api'
// console.log(process.env)
// const baseUrl = process.env.API_BASE_URL || 'http://localhost:8000/api'
const request = (method, path, params, data) => {
return new Promise((resolve, reject) => {
const url = `${baseUrl}${path}`
const url = baseUrl + path
axios({
method,
url,

View File

@@ -34,12 +34,15 @@
</el-button>
</el-button-group>
</el-form-item>
<el-form-item :label="$t('Start URL')">
<el-form-item :label="$t('Start URL')" required>
<el-input v-model="spiderForm.start_url" :placeholder="$t('Start URL')"></el-input>
</el-form-item>
<el-form-item :label="$t('Obey robots.txt')">
<el-switch v-model="spiderForm.obey_robots_txt" :placeholder="$t('Obey robots.txt')"></el-switch>
</el-form-item>
<!--<el-form-item :label="$t('URL Pattern')">-->
<!--<el-input v-model="spiderForm.url_pattern" :placeholder="$t('URL Pattern')"></el-input>-->
<!--</el-form-item>-->
</el-form>
</el-col>
<el-col :span="11" :offset="1">
@@ -79,7 +82,8 @@
<el-row class="button-group-container">
<div class="button-group">
<el-button type="danger" @click="onCrawl">{{$t('Run')}}</el-button>
<el-button type="primary" @click="onExtractFields" v-loading="extractFieldsLoading">{{$t('Extract Fields')}}</el-button>
<el-button type="primary" @click="onExtractFields" v-loading="extractFieldsLoading">{{$t('Extract Fields')}}
</el-button>
<el-button type="warning" @click="onPreview" v-loading="previewLoading">{{$t('Preview')}}</el-button>
<el-button type="success" @click="onSave" v-loading="saveLoading">{{$t('Save')}}</el-button>
</div>
@@ -214,6 +218,24 @@ export default {
})
},
onExtractFields () {
this.onSave()
.then(() => {
this.extractFieldsLoading = true
this.$store.dispatch('spider/extractFields')
.then(response => {
if (response.data.item_selector) {
this.$set(this.spiderForm, 'item_selector', response.data.item_selector)
this.$set(this.spiderForm, 'item_selector_type', 'css')
}
if (response.data.fields && response.data.fields.length) {
this.spiderForm.fields = response.data.fields
}
})
.finally(() => {
this.extractFieldsLoading = false
})
})
}
},
created () {
@@ -245,7 +267,7 @@ export default {
if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com')
if (!this.spiderForm.item_selector_type) this.$set(this.spiderForm, 'item_selector_type', 'css')
if (!this.spiderForm.pagination_selector_type) this.$set(this.spiderForm, 'pagination_selector_type', 'css')
if (!this.spiderForm.obey_robots_txt) this.$set(this.spiderForm, 'obey_robots_txt', true)
if (this.spiderForm.obey_robots_txt === undefined) this.$set(this.spiderForm, 'obey_robots_txt', true)
}
}
</script>

View File

@@ -12,6 +12,8 @@ import 'font-awesome/scss/font-awesome.scss'// FontAwesome
import 'codemirror/lib/codemirror.css'
// import ba from 'vue-ba'
import App from './App'
import store from './store'
import router from './router'
@@ -24,8 +26,23 @@ import i18n from './i18n'
Vue.use(ElementUI, { locale })
// Vue.use(ba, 'c35e3a563a06caee2524902c81975add')
// Vue.use(ba, {
// siteId: 'c35e3a563a06caee2524902c81975add'
// })
Vue.config.productionTip = false
// 百度统计
window._hmt = window._hmt || [];
(function () {
let hm = document.createElement('script')
hm.src = 'https://hm.baidu.com/hm.js?c35e3a563a06caee2524902c81975add'
let s = document.getElementsByTagName('script')[0]
s.parentNode.insertBefore(hm, s)
})()
// inject request api
Vue.prototype.$request = request
const app = new Vue({

View File

@@ -222,4 +222,12 @@ router.beforeEach((to, from, next) => {
next()
})
router.afterEach((to, from, next) => {
if (to.path) {
if (localStorage.getItem('useStats') !== '0') {
window._hmt.push(['_trackPageview', to.path])
}
}
})
export default router

View File

@@ -12,6 +12,7 @@ import file from './modules/file'
import schedule from './modules/schedule'
import lang from './modules/lang'
import site from './modules/site'
import stats from './modules/stats'
import getters from './getters'
Vue.use(Vuex)
@@ -29,7 +30,9 @@ const store = new Vuex.Store({
file,
schedule,
lang,
site
site,
// 百度统计
stats
},
getters
})

View File

@@ -105,6 +105,7 @@ const actions = {
// configurable spider
crawl_type: state.spiderForm.crawl_type,
start_url: state.spiderForm.start_url,
url_pattern: state.spiderForm.url_pattern,
item_selector: state.spiderForm.item_selector,
item_selector_type: state.spiderForm.item_selector_type,
pagination_selector: state.spiderForm.pagination_selector,
@@ -207,6 +208,9 @@ const actions = {
.then(response => {
commit('SET_PREVIEW_CRAWL_DATA', response.data.items)
})
},
extractFields ({ state, commit }) {
return request.post(`/spiders/${state.spiderForm._id}/extract_fields`)
}
}

View File

@@ -0,0 +1,14 @@
// Vuex module backing the usage-statistics opt-in (Baidu Analytics).
const state = {}

const getters = {
  // user's opt-in choice persisted in localStorage ('1', '0', or null)
  useStats: () => localStorage.getItem('useStats')
}

const mutations = {}
const actions = {}

export default { state, getters, mutations, actions }

View File

@@ -85,6 +85,9 @@ export default {
this.dailyTasks = response.data.daily_tasks
this.initEchartsDailyTasks()
})
},
mounted () {
// this.$ba.trackPageview('/')
}
}
</script>

View File

@@ -2,15 +2,15 @@
<div class="tags-view-container">
<scroll-pane ref="scrollPane" class="tags-view-wrapper">
<router-link
v-for="tag in visitedViews"
ref="tag"
:class="isActive(tag)?'active':''"
:to="{ path: tag.path, query: tag.query, fullPath: tag.fullPath }"
:key="tag.path"
tag="span"
class="tags-view-item"
@click.middle.native="closeSelectedTag(tag)"
@contextmenu.prevent.native="openMenu(tag,$event)">
v-for="tag in visitedViews"
ref="tag"
:class="isActive(tag)?'active':''"
:to="{ path: tag.path, query: tag.query, fullPath: tag.fullPath }"
:key="tag.path"
tag="span"
class="tags-view-item"
@click.middle.native="closeSelectedTag(tag)"
@contextmenu.prevent.native="openMenu(tag,$event)">
{{ $t(generateTitle(tag.title)) }}
<span v-if="!tag.meta.affix" class="el-icon-close" @click.prevent.stop="closeSelectedTag(tag)"/>
</router-link>
@@ -47,7 +47,7 @@ export default {
return this.$store.state.tagsView.visitedViews
},
routers () {
return this.$store.state.permission.routers
return this.$store.state.permission ? this.$store.state.permission.routers : []
}
},
watch: {

View File

@@ -8401,6 +8401,14 @@ vm-browserify@0.0.4:
dependencies:
indexof "0.0.1"
vue-ba@^1.2.5:
version "1.2.5"
resolved "https://registry.npm.taobao.org/vue-ba/download/vue-ba-1.2.5.tgz#fef30732ee749a65a81a4f47113527ee41fda64b"
integrity sha1-/vMHMu50mmWoGk9HETUn7kH9pks=
dependencies:
deep-equal "^1.0.1"
vue "^2.3.3"
vue-codemirror-lite@^1.0.4:
version "1.0.4"
resolved "http://registry.npm.taobao.org/vue-codemirror-lite/download/vue-codemirror-lite-1.0.4.tgz#48a5cd7d17c0914503c8cd9d9b56b438e49c3410"
@@ -8485,6 +8493,11 @@ vue-template-es2015-compiler@^1.6.0, vue-template-es2015-compiler@^1.8.2:
version "1.8.2"
resolved "http://registry.npm.taobao.org/vue-template-es2015-compiler/download/vue-template-es2015-compiler-1.8.2.tgz#dd73e80ba58bb65dd7a8aa2aeef6089cf6116f2a"
vue@^2.3.3:
version "2.6.10"
resolved "https://registry.npm.taobao.org/vue/download/vue-2.6.10.tgz#a72b1a42a4d82a721ea438d1b6bf55e66195c637"
integrity sha1-pysaQqTYKnIepDjRtr9V5mGVxjc=
vue@^2.5.17, vue@^2.5.22:
version "2.6.6"
resolved "https://registry.yarnpkg.com/vue/-/vue-2.6.6.tgz#dde41e483c11c46a7bf523909f4f2f816ab60d25"