updated configurable spider

This commit is contained in:
Marvin Zhang
2019-05-26 19:16:46 +08:00
parent 5daebcb39b
commit e30c1c122a
8 changed files with 461 additions and 175 deletions

View File

@@ -1,6 +1,7 @@
aiohttp==3.5.4
amqp==2.4.2
aniso8601==6.0.0
Appium-Python-Client==0.40
APScheduler==3.6.0
asn1crypto==0.24.0
async-timeout==3.0.1
@@ -26,6 +27,8 @@ Flask-Cors==3.0.7
Flask-RESTful==0.3.7
flask-restplus==0.12.1
flower==0.9.3
gevent==1.4.0
greenlet==0.4.15
gunicorn==19.9.0
html5lib==1.0.1
humanfriendly==4.18
@@ -55,6 +58,8 @@ python-dateutil==2.8.0
pytz==2018.9
queuelib==1.5.0
redis==3.2.1
redisbeat==1.1.4
reppy==0.4.12
requests==2.21.0
Scrapy==1.6.0
selenium==3.141.0

View File

@@ -5,6 +5,7 @@ import subprocess
from datetime import datetime
from random import random
import gevent
import requests
from bson import ObjectId
from flask import current_app, request
@@ -23,7 +24,8 @@ from tasks.spider import execute_spider, execute_config_spider
from utils import jsonify
from utils.deploy import zip_file, unzip_file
from utils.file import get_file_suffix_stats, get_file_suffix
from utils.spider import get_lang_by_stats, get_last_n_run_errors_count, get_last_n_day_tasks_count
from utils.spider import get_lang_by_stats, get_last_n_run_errors_count, get_last_n_day_tasks_count, get_list_page_data, \
get_detail_page_data
parser = reqparse.RequestParser()
parser.add_argument('file', type=FileStorage, location='files')
@@ -71,9 +73,12 @@ class SpiderApi(BaseApi):
# Configurable Spider
########################
# spider crawl fields
# spider crawl fields for list page
('fields', str),
# spider crawl fields for detail page
('detail_fields', str),
# spider crawl type
('crawl_type', str),
@@ -442,13 +447,22 @@ class SpiderApi(BaseApi):
def update_fields(self, id: str):
    """
    Store the list page field definitions of a configurable spider.
    The 'fields' request argument is decoded from JSON and written to the
    'fields' key of the spider document.
    :param id: spider_id
    """
    parsed_args = self.parser.parse_args()
    db_manager.update_one(
        col_name='spiders',
        id=id,
        values={'fields': json.loads(parsed_args.fields)},
    )
def update_detail_fields(self, id: str):
    """
    Store the detail page field definitions of a configurable spider.
    The 'detail_fields' request argument is decoded from JSON and written to
    the 'detail_fields' key of the spider document.
    :param id: spider_id
    """
    parsed_args = self.parser.parse_args()
    db_manager.update_one(
        col_name='spiders',
        id=id,
        values={'detail_fields': json.loads(parsed_args.detail_fields)},
    )
def preview_crawl(self, id: str):
spider = db_manager.get(col_name='spiders', id=id)
@@ -489,25 +503,8 @@ class SpiderApi(BaseApi):
'error': 'item_selector should not be empty'
}, 400
# TODO: enable xpath
data = []
items = sel.cssselect(spider['item_selector'])
for item in items:
row = {}
for f in spider['fields']:
if f['type'] == QueryType.CSS:
# css selector
res = item.cssselect(f['query'])
else:
# xpath
res = item.xpath(f['query'])
data = get_list_page_data(spider, sel)[:10]
if len(res) > 0:
if f['extract_type'] == ExtractType.TEXT:
row[f['name']] = res[0].text
else:
row[f['name']] = res[0].get(f['attribute'])
data.append(row)
return {
'status': 'ok',
'items': data
@@ -517,7 +514,23 @@ class SpiderApi(BaseApi):
pass
elif spider['crawl_type'] == CrawlType.LIST_DETAIL:
pass
data = get_list_page_data(spider, sel)[:10]
ev_list = []
for idx, d in enumerate(data):
for f in spider['fields']:
if f.get('is_detail'):
url = d.get(f['name'])
if url is not None:
ev_list.append(gevent.spawn(get_detail_page_data, url, spider, idx, data))
break
gevent.joinall(ev_list)
return {
'status': 'ok',
'items': data
}
class SpiderImportApi(Resource):

View File

@@ -11,6 +11,15 @@ from spiders.db import spider
class SpidersItem(scrapy.Item):
fields = {f['name']: scrapy.Field() for f in spider['fields']}
if spider['crawl_type'] == 'list':
fields = {f['name']: scrapy.Field() for f in spider['fields']}
elif spider['crawl_type'] == 'detail':
fields = {f['name']: scrapy.Field() for f in spider['detail_fields']}
elif spider['crawl_type'] == 'list-detail':
fields = {f['name']: scrapy.Field() for f in (spider['fields'] + spider['detail_fields'])}
else:
fields = {}
# basic fields
fields['_id'] = scrapy.Field()
fields['task_id'] = scrapy.Field()

View File

@@ -7,57 +7,110 @@ from spiders.db import spider
from spiders.items import SpidersItem
class NormalSpiderSpider(scrapy.Spider):
def get_detail_url(item):
    """
    Return the value of the list page field flagged as the detail page URL.

    :param item: mapping-like item exposing ``get``
    :return: value stored under the first field of ``spider['fields']`` whose
        'is_detail' flag is truthy, or None when no field is flagged
    """
    detail_field = next(
        (field for field in spider['fields'] if field.get('is_detail')),
        None,
    )
    if detail_field is None:
        return None
    return item.get(detail_field['name'])
def get_spiders_item(sel, fields, item=None):
    """
    Populate an item with the values extracted for each configured field.

    :param sel: object exposing ``xpath(query)`` and ``css(query)``; the
        returned result must provide ``extract_first()``
    :param fields: iterable of field configs, each with the keys 'name',
        'type', 'query', 'extract_type' and, for attribute extraction,
        'attribute'
    :param item: item to fill in; a new SpidersItem is created when None
    :return: the item, with one entry per field name
    """
    if item is None:
        item = SpidersItem()

    for f in fields:
        if f['type'] == 'xpath':
            # xpath selector
            if f['extract_type'] == 'text':
                # text content
                query = f['query'] + '/text()'
            else:
                # attribute: the XPath attribute step is `@name`; the former
                # `@("name")` form is not a valid XPath expression
                attribute = f["attribute"]
                query = f['query'] + f'/@{attribute}'
            item[f['name']] = sel.xpath(query).extract_first()
        else:
            # css selector
            if f['extract_type'] == 'text':
                # text content
                query = f['query'] + '::text'
            else:
                # attribute
                attribute = f["attribute"]
                query = f['query'] + f'::attr("{attribute}")'
            item[f['name']] = sel.css(query).extract_first()

    return item
def get_list_items(response):
    """
    Select the list item nodes of a list page.

    :param response: object exposing ``xpath`` and ``css``
    :return: result of running ``spider['item_selector']`` as XPath when
        ``spider['item_selector_type']`` is 'xpath', otherwise as CSS
    """
    use_xpath = spider['item_selector_type'] == 'xpath'
    run_query = response.xpath if use_xpath else response.css
    return run_query(spider['item_selector'])
def get_next_url(response):
    """
    Return the absolute URL of the next list page, if there is one.

    The href is read from the element matched by
    ``spider['pagination_selector']`` (XPath or CSS, depending on
    ``spider['pagination_selector_type']``) and resolved against
    ``response.url``.

    :param response: object exposing ``xpath``, ``css`` and ``url``
    :return: absolute next page URL, or None when no pagination selector is
        configured or no non-empty href was found
    """
    # imported locally so this function does not depend on which names the
    # module imports from urllib.parse
    from urllib.parse import urljoin

    # pagination
    if spider.get('pagination_selector') is None:
        return None

    if spider['pagination_selector_type'] == 'xpath':
        # xpath selector
        next_url = response.xpath(spider['pagination_selector'] + '/@href').extract_first()
    else:
        # css selector
        next_url = response.css(spider['pagination_selector'] + '::attr("href")').extract_first()

    # no (or empty) href: there is no next page to follow
    if not next_url:
        return None

    # urljoin covers absolute URLs, protocol-relative `//host/path`, absolute
    # paths and paths relative to the current page (e.g. `page2.html`,
    # `?page=2`), which plain `scheme://netloc + href` concatenation does not
    return urljoin(response.url, next_url)
# Scrapy spider configured by the module-level `spider` document
# (start URL, selectors, field definitions, crawl type).
# NOTE(review): the statements below appear to interleave the pre-change body of
# parse() (inline field extraction and pagination) with its post-change body
# (get_list_items / get_spiders_item / get_next_url calls) — confirm which lines
# are live before relying on this listing.
class ConfigSpiderSpider(scrapy.Spider):
name = 'config_spider'
# allowed_domains = []
# crawling starts from the single configured start URL
start_urls = [spider['start_url']]
# Callback for list pages; the work done depends on spider['crawl_type'].
def parse(self, response):
if spider['item_selector_type'] == 'xpath':
# xpath selector
items = response.xpath(spider['item_selector'])
else:
# css selector
items = response.css(spider['item_selector'])
for _item in items:
item = SpidersItem()
for f in spider['fields']:
if f['type'] == 'xpath':
# xpath selector
if f['extract_type'] == 'text':
# text content
query = f['query'] + '/text()'
else:
# attribute
attribute = f["attribute"]
query = f['query'] + f'/@("{attribute}")'
item[f['name']] = _item.xpath(query).extract_first()
else:
# css selector
if f['extract_type'] == 'text':
# text content
query = f['query'] + '::text'
else:
# attribute
attribute = f["attribute"]
query = f['query'] + f'::attr("{attribute}")'
item[f['name']] = _item.css(query).extract_first()
if spider['crawl_type'] == 'list':
items = get_list_items(response)
# list page only
for _item in items:
item = get_spiders_item(sel=_item, fields=spider['fields'])
yield item
# pagination
if spider.get('pagination_selector') is not None:
if spider['pagination_selector_type'] == 'xpath':
# xpath selector
next_url = response.xpath(spider['pagination_selector'] + '/@href').extract_first()
else:
# css selector
next_url = response.css(spider['pagination_selector'] + '::attr("href")').extract_first()
# found next url
next_url = get_next_url(response)
if next_url is not None:
if not next_url.startswith('http') and not next_url.startswith('//'):
u = urlparse(response.url)
next_url = f'{u.scheme}://{u.netloc}{next_url}'
# no callback is passed, so the request uses Scrapy's default callback
yield scrapy.Request(url=next_url)
elif spider['crawl_type'] == 'detail':
# TODO: detail page only
# detail page only
pass
elif spider['crawl_type'] == 'list-detail':
# list page + detail page
items = get_list_items(response)
for _item in items:
item = get_spiders_item(sel=_item, fields=spider['fields'])
detail_url = get_detail_url(item)
# list items without a detail URL are not yielded in this branch
if detail_url is not None:
# hand the partially filled item to parse_detail through the request meta
yield scrapy.Request(url=detail_url,
callback=self.parse_detail,
meta={
'item': item
})
next_url = get_next_url(response)
if next_url is not None:
yield scrapy.Request(url=next_url)
# Callback for detail pages: adds the spider['detail_fields'] values to the
# item carried in response.meta['item'] and yields the completed item.
def parse_detail(self, response):
item = get_spiders_item(sel=response, fields=spider['detail_fields'], item=response.meta['item'])
yield item

View File

@@ -1,9 +1,11 @@
import os
import requests
from datetime import datetime, timedelta
from bson import ObjectId
from lxml import etree
from constants.spider import FILE_SUFFIX_LANG_MAPPING, LangType, SUFFIX_IGNORE, SpiderType
from constants.spider import FILE_SUFFIX_LANG_MAPPING, LangType, SUFFIX_IGNORE, SpiderType, QueryType, ExtractType
from constants.task import TaskStatus
from db.manager import db_manager
@@ -69,3 +71,53 @@ def get_last_n_day_tasks_count(spider_id: ObjectId, n: int) -> list:
'$gte': (datetime.now() - timedelta(n))
}
})
def get_list_page_data(spider, sel):
    """
    Extract one row per list item from a parsed list page.

    :param spider: spider config providing 'item_selector_type',
        'item_selector' and 'fields'
    :param sel: parsed document exposing ``xpath`` and ``cssselect``
    :return: list of dicts mapping field name to the text or attribute of the
        first node matched by the field query; fields without a match are
        left out of the row
    """
    if spider['item_selector_type'] == QueryType.XPATH:
        nodes = sel.xpath(spider['item_selector'])
    else:
        nodes = sel.cssselect(spider['item_selector'])

    rows = []
    for node in nodes:
        record = {}
        for field in spider['fields']:
            # run the field query relative to the current list item
            if field['type'] == QueryType.CSS:
                matches = node.cssselect(field['query'])
            else:
                matches = node.xpath(field['query'])

            if len(matches) == 0:
                continue

            first = matches[0]
            if field['extract_type'] == ExtractType.TEXT:
                record[field['name']] = first.text
            else:
                record[field['name']] = first.get(field['attribute'])
        rows.append(record)
    return rows
def get_detail_page_data(url, spider, idx, data, timeout=10):
    """
    Fetch a detail page and merge its extracted fields into ``data[idx]``.

    :param url: detail page URL to download
    :param spider: spider config providing 'detail_fields'
    :param idx: index of the row in ``data`` that belongs to this detail page
    :param data: list of row dicts; ``data[idx]`` is updated in place
    :param timeout: seconds passed to ``requests.get`` so that an
        unresponsive host cannot block the caller indefinitely
    :return: None
    """
    r = requests.get(url, timeout=timeout)
    sel = etree.HTML(r.content)

    row = {}
    for f in spider['detail_fields']:
        if f['type'] == QueryType.CSS:
            # css selector
            res = sel.cssselect(f['query'])
        else:
            # xpath
            res = sel.xpath(f['query'])

        # only the first match is used; fields without a match are skipped
        if len(res) > 0:
            if f['extract_type'] == ExtractType.TEXT:
                row[f['name']] = res[0].text
            else:
                row[f['name']] = res[0].get(f['attribute'])

    # assign values
    data[idx].update(row)

View File

@@ -8,7 +8,7 @@
<el-table :data="previewCrawlData"
:header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
border>
<el-table-column v-for="(f, index) in spiderForm.fields"
<el-table-column v-for="(f, index) in fields"
:label="f.name"
:key="index"
min-width="100px">
@@ -20,8 +20,9 @@
</el-dialog>
<!--./preview results-->
<!--config detail-->
<el-row>
<el-col :span="11" offset="1">
<el-col :span="11" :offset="1">
<el-form label-width="150px">
<el-form-item :label="$t('Crawl Type')">
<el-button-group>
@@ -72,76 +73,40 @@
</el-form>
</el-col>
</el-row>
<!--./config detail-->
<!--button group-->
<el-row style="margin-top: 10px">
<div class="button-group-wrapper">
<div class="button-group">
<el-button type="primary" @click="addField" icon="el-icon-plus">{{$t('Add Field')}}</el-button>
</div>
<div class="button-group">
<el-button type="danger" @click="onCrawl">{{$t('Run')}}</el-button>
<el-button type="warning" @click="onPreview" v-loading="previewLoading">{{$t('Preview')}}</el-button>
<el-button type="success" @click="onSave" v-loading="saveLoading">{{$t('Save')}}</el-button>
</div>
<el-row class="button-group-container">
<div class="button-group">
<el-button type="danger" @click="onCrawl">{{$t('Run')}}</el-button>
<el-button type="warning" @click="onPreview" v-loading="previewLoading">{{$t('Preview')}}</el-button>
<el-button type="success" @click="onSave" v-loading="saveLoading">{{$t('Save')}}</el-button>
</div>
</el-row>
<!--./button group-->
<!--field list-->
<el-row style="margin-top: 10px;">
<el-table :data="spiderForm.fields"
class="table edit"
:header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
border>
<el-table-column :label="$t('Field Name')" width="200px">
<template slot-scope="scope">
<el-input v-model="scope.row.name" :placeholder="$t('Field Name')"></el-input>
</template>
</el-table-column>
<el-table-column :label="$t('Query Type')" width="200px">
<template slot-scope="scope">
<el-select v-model="scope.row.type" :placeholder="$t('Query Type')">
<el-option value="css" :label="$t('CSS Selector')"></el-option>
<el-option value="xpath" :label="$t('XPath')"></el-option>
</el-select>
</template>
</el-table-column>
<el-table-column :label="$t('Query')" width="250px">
<template slot-scope="scope">
<el-input v-model="scope.row.query" :placeholder="$t('Query')"></el-input>
</template>
</el-table-column>
<el-table-column :label="$t('Extract Type')" width="120px">
<template slot-scope="scope">
<el-select v-model="scope.row.extract_type" :placeholder="$t('Extract Type')">
<el-option value="text" :label="$t('Text')"></el-option>
<el-option value="attribute" :label="$t('Attribute')"></el-option>
</el-select>
</template>
</el-table-column>
<el-table-column :label="$t('Attribute')" width="250px">
<template slot-scope="scope">
<template v-if="scope.row.extract_type === 'attribute'">
<el-input v-model="scope.row.attribute"
:placeholder="$t('Attribute')">
</el-input>
</template>
<template v-else>
</template>
</template>
</el-table-column>
<el-table-column :label="$t('Action')" fixed="right">
<template slot-scope="scope">
<div class="action-button-group">
<el-button size="mini" icon="el-icon-delete" type="danger"
@click="deleteField(scope.$index)"></el-button>
</div>
</template>
</el-table-column>
</el-table>
<!--list field list-->
<el-row v-if="['list','list-detail'].includes(spiderForm.crawl_type)"
class="list-fields-container">
<fields-table-view
type="list"
title="List Page Fields"
:fields="spiderForm.fields"
/>
</el-row>
<!--./field list-->
<!--./list field list-->
<!--detail field list-->
<el-row v-if="['detail','list-detail'].includes(spiderForm.crawl_type)"
class="detail-fields-container"
style="margin-top: 10px;">
<fields-table-view
type="detail"
title="Detail Page Fields"
:fields="spiderForm.detail_fields"
/>
</el-row>
<!--./detail field list-->
</div>
</template>
@@ -149,9 +114,11 @@
import {
mapState
} from 'vuex'
import FieldsTableView from '../TableView/FieldsTableView'
export default {
name: 'ConfigList',
components: { FieldsTableView },
data () {
return {
crawlTypeList: [
@@ -168,18 +135,20 @@ export default {
...mapState('spider', [
'spiderForm',
'previewCrawlData'
])
]),
fields () {
if (this.spiderForm.crawl_type === 'list') {
return this.spiderForm.fields
} else if (this.spiderForm.crawl_type === 'detail') {
return this.spiderForm.detail_fields
} else if (this.spiderForm.crawl_type === 'list-detail') {
return this.spiderForm.fields.concat(this.spiderForm.detail_fields)
} else {
return []
}
}
},
methods: {
addField () {
this.spiderForm.fields.push({
type: 'css',
extract_type: 'text'
})
},
deleteField (index) {
this.spiderForm.fields.splice(index, 1)
},
onSelectCrawlType (value) {
this.spiderForm.crawl_type = value
},
@@ -201,6 +170,9 @@ export default {
this.saveLoading = false
})
})
.then(() => {
this.$store.dispatch('spider/updateSpiderDetailFields')
})
.catch(() => {
this.$message.error(this.$t('Something wrong happened'))
this.saveLoading = false
@@ -241,6 +213,7 @@ export default {
}
},
created () {
// fields for list page
if (!this.spiderForm.fields) {
this.spiderForm.fields = []
for (let i = 0; i < 3; i++) {
@@ -251,6 +224,19 @@ export default {
})
}
}
// fields for detail page
if (!this.spiderForm.detail_fields) {
this.spiderForm.detail_fields = []
for (let i = 0; i < 3; i++) {
this.spiderForm.detail_fields.push({
name: `field_${i + 1}`,
type: 'css',
extract_type: 'text'
})
}
}
if (!this.spiderForm.crawl_type) this.$set(this.spiderForm, 'crawl_type', 'list')
if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com')
if (!this.spiderForm.item_selector_type) this.$set(this.spiderForm, 'item_selector_type', 'css')
@@ -261,43 +247,29 @@ export default {
</script>
<style scoped>
.el-table {
}
.el-table.edit >>> .el-table__body td {
padding: 0;
}
.el-table.edit >>> .el-table__body td .cell {
padding: 0;
font-size: 12px;
}
.el-table.edit >>> .el-input__inner:hover {
text-decoration: underline;
}
.el-table.edit >>> .el-input__inner {
height: 36px;
border: none;
border-radius: 0;
font-size: 12px;
}
.el-table.edit >>> .el-select .el-input .el-select__caret {
line-height: 36px;
}
.button-group-wrapper {
display: flex;
justify-content: space-between;
.button-group-container {
margin-top: 10px;
border-bottom: 1px dashed #dcdfe6;
padding-bottom: 20px;
}
.button-group {
text-align: right;
}
.action-button-group {
margin-left: 10px;
.list-fields-container {
margin-top: 20px;
border-bottom: 1px dashed #dcdfe6;
padding-bottom: 20px;
}
.detail-fields-container {
margin-top: 20px;
}
.title {
color: #606266;
font-size: 14px;
}
</style>

View File

@@ -0,0 +1,179 @@
<template>
<div class="fields-table-view">
<el-row class="button-group-container">
<label class="title">{{$t(this.title)}}</label>
<div class="button-group">
<el-button type="primary" size="small" @click="addField" icon="el-icon-plus">{{$t('Add Field')}}</el-button>
</div>
</el-row>
<el-row>
<el-table :data="fields"
class="table edit"
:header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
border>
<el-table-column v-if="type === 'list' && spiderForm.crawl_type === 'list-detail'"
:label="$t('Detail Page URL')"
align="center">
<template slot-scope="scope">
<el-checkbox v-model="scope.row.is_detail"
@change="onCheck(scope.row)">
</el-checkbox>
</template>
</el-table-column>
<el-table-column :label="$t('Field Name')" width="200px">
<template slot-scope="scope">
<el-input v-model="scope.row.name" :placeholder="$t('Field Name')"
@change="onNameChange(scope.row)"></el-input>
</template>
</el-table-column>
<el-table-column :label="$t('Query Type')" width="200px">
<template slot-scope="scope">
<el-select v-model="scope.row.type" :placeholder="$t('Query Type')">
<el-option value="css" :label="$t('CSS Selector')"></el-option>
<el-option value="xpath" :label="$t('XPath')"></el-option>
</el-select>
</template>
</el-table-column>
<el-table-column :label="$t('Query')" width="250px">
<template slot-scope="scope">
<el-input v-model="scope.row.query" :placeholder="$t('Query')"></el-input>
</template>
</el-table-column>
<el-table-column :label="$t('Extract Type')" width="120px">
<template slot-scope="scope">
<el-select v-model="scope.row.extract_type" :placeholder="$t('Extract Type')">
<el-option value="text" :label="$t('Text')"></el-option>
<el-option value="attribute" :label="$t('Attribute')"></el-option>
</el-select>
</template>
</el-table-column>
<el-table-column :label="$t('Attribute')" width="250px">
<template slot-scope="scope">
<template v-if="scope.row.extract_type === 'attribute'">
<el-input v-model="scope.row.attribute"
:placeholder="$t('Attribute')">
</el-input>
</template>
<template v-else>
</template>
</template>
</el-table-column>
<el-table-column :label="$t('Action')" fixed="right" min-width="100px">
<template slot-scope="scope">
<div class="action-button-group">
<el-button size="mini"
style="margin-left:10px"
icon="el-icon-delete"
type="danger"
@click="deleteField(scope.$index)">
</el-button>
</div>
</template>
</el-table-column>
</el-table>
</el-row>
</div>
</template>
<script>
import {
mapState
} from 'vuex'
// Editable table of field definitions (list page or detail page fields).
// The `fields` array passed in by the parent is edited in place.
export default {
  name: 'FieldsTableView',
  props: {
    // 'list' or 'detail'; the detail-URL checkbox column is only rendered for 'list'
    type: {
      type: String,
      default: 'list'
    },
    // i18n key of the heading shown above the table
    title: {
      type: String,
      default: ''
    },
    // field definitions to display and edit
    fields: {
      type: Array,
      default () {
        return []
      }
    }
  },
  computed: {
    ...mapState('spider', [
      'spiderForm'
    ])
  },
  methods: {
    // append a new field with the default query and extract types
    addField () {
      this.fields.push({
        type: 'css',
        extract_type: 'text'
      })
    },
    // remove the field at the given row index
    deleteField (index) {
      this.fields.splice(index, 1)
    },
    // warn when the edited name is used by more than one field
    onNameChange (row) {
      if (this.fields.filter(d => d.name === row.name).length > 1) {
        this.$message.error(this.$t(`Duplicated field names for ${row.name}`))
      }
    },
    // Only one field may be flagged as the detail page URL, so clear the flag
    // on every other row. Rows are compared by object identity rather than by
    // name: rows created by addField have no name yet and names can be
    // duplicated, so a name comparison would leave such rows checked.
    onCheck (row) {
      this.fields.forEach(d => {
        if (d !== row) {
          this.$set(d, 'is_detail', false)
        }
      })
    }
  }
}
</script>
<style scoped>
.el-table.edit >>> .el-table__body td {
padding: 0;
}
.el-table.edit >>> .el-table__body td .cell {
padding: 0;
font-size: 12px;
}
.el-table.edit >>> .el-input__inner:hover {
text-decoration: underline;
}
.el-table.edit >>> .el-input__inner {
height: 36px;
border: none;
border-radius: 0;
font-size: 12px;
}
.el-table.edit >>> .el-select .el-input .el-select__caret {
line-height: 36px;
}
.button-group-container {
/*display: inline-block;*/
/*width: 100%;*/
}
.button-group-container .title {
float: left;
line-height: 32px;
}
.button-group-container .button-group {
float: right;
}
.action-button-group {
display: flex;
margin-left: 10px;
}
.action-button-group >>> .el-checkbox__label {
font-size: 12px;
}
</style>

View File

@@ -128,6 +128,9 @@ export default {
'Pagination Selector Type': '分页项选择器类别',
'Preview Results': '预览结果',
'Obey robots.txt': '遵守Robots协议',
'List Page Fields': '列表页字段',
'Detail Page Fields': '详情页字段',
'Detail Page URL': '详情页URL',
// 爬虫列表
'Name': '名称',