updated auto field extraction

This commit is contained in:
Marvin Zhang
2019-06-04 07:55:51 +08:00
parent 91b9a8e78d
commit c0cec217d7
4 changed files with 81 additions and 15 deletions

View File

@@ -371,7 +371,6 @@ class SpiderApi(BaseApi):
# make source / destination
src = os.path.join(dir_path, os.listdir(dir_path)[0])
# src = dir_path
dst = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))
# logging info
@@ -511,11 +510,15 @@ class SpiderApi(BaseApi):
return sel
@staticmethod
def _get_children(sel):
    """Return the direct child elements of *sel*, excluding comment nodes.

    :param sel: an lxml element
    :return: list of child elements (lxml comment nodes filtered out)
    """
    # isinstance is the idiomatic type check (also covers subclasses of
    # the comment node type), unlike the exact-type comparison it replaces.
    return [tag for tag in sel.getchildren() if not isinstance(tag, etree._Comment)]
@staticmethod
def _get_text_child_tags(sel):
    """Return all descendant elements of *sel* that carry non-empty text.

    Comment nodes are skipped even when they contain text, so only real
    element tags are candidates for field extraction.

    :param sel: an lxml element (the subtree root to scan)
    :return: list of descendant elements with non-whitespace ``.text``
    """
    # NOTE: the source contained both the pre- and post-change version of
    # the condition (mangled diff); only the updated one is kept here.
    return [
        tag
        for tag in sel.iter()
        if not isinstance(tag, etree._Comment)
        and tag.text is not None
        and tag.text.strip() != ''
    ]
@@ -530,6 +533,19 @@ class SpiderApi(BaseApi):
return tags
@staticmethod
def _get_next_page_tag(sel):
next_page_text_list = [
'下一页',
'下页',
'next page',
'next',
]
for tag in sel.iter():
if tag.text is not None and tag.text.lower().strip() in next_page_text_list:
return tag
return None
def preview_crawl(self, id: str):
spider = db_manager.get(col_name='spiders', id=id)
@@ -601,7 +617,7 @@ class SpiderApi(BaseApi):
# iterate all child nodes in a top-down direction
for tag in sel.iter():
# get child tags
child_tags = tag.getchildren()
child_tags = [t for t in tag.getchildren() if type(t) != etree._Comment]
if len(child_tags) < threshold:
# if number of child tags is below threshold, skip
@@ -622,7 +638,7 @@ class SpiderApi(BaseApi):
max_tag = None
max_num = 0
for tag in list_tag_list:
_child_text_tags = self._get_text_child_tags(tag[0])
_child_text_tags = self._get_text_child_tags(self._get_children(tag)[0])
if len(_child_text_tags) > max_num:
max_tag = tag
max_num = len(_child_text_tags)
@@ -630,16 +646,24 @@ class SpiderApi(BaseApi):
# get list item selector
item_selector = None
if max_tag.get('id') is not None:
item_selector = f'#{max_tag.get("id")} > {max_tag.getchildren()[0].tag}'
item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}'
elif max_tag.get('class') is not None:
if len(sel.cssselect(f'.{max_tag.get("class")}')) == 1:
item_selector = f'.{max_tag.get("class")} > {max_tag.getchildren()[0].tag}'
item_selector = f'.{max_tag.get("class")} > {self._get_children(max_tag)[0].tag}'
# get list fields
fields = []
if item_selector is not None:
for i, tag in enumerate(self._get_text_child_tags(max_tag[0])):
if tag.get('class') is not None:
first_tag = self._get_children(max_tag)[0]
for i, tag in enumerate(self._get_text_child_tags(first_tag)):
if len(first_tag.cssselect(f'{tag.tag}')) == 1:
fields.append({
'name': f'field{i + 1}',
'type': 'css',
'extract_type': 'text',
'query': f'{tag.tag}',
})
elif tag.get('class') is not None:
cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
# print(tag.tag + '.' + cls_str)
if len(tag.cssselect(f'{tag.tag}.{cls_str}')) == 1:
@@ -650,7 +674,7 @@ class SpiderApi(BaseApi):
'query': f'{tag.tag}.{cls_str}',
})
for i, tag in enumerate(self._get_a_child_tags(max_tag[0])):
for i, tag in enumerate(self._get_a_child_tags(self._get_children(max_tag)[0])):
# if the tag is <a...></a>, extract its href
if tag.get('class') is not None:
cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
@@ -662,9 +686,19 @@ class SpiderApi(BaseApi):
'query': f'{tag.tag}.{cls_str}',
})
# get pagination tag
pagination_selector = None
pagination_tag = self._get_next_page_tag(sel)
if pagination_tag is not None:
if pagination_tag.get('id') is not None:
pagination_selector = f'#{pagination_tag.get("id")}'
elif pagination_tag.get('class') is not None and len(sel.cssselect(f'.{pagination_tag.get("id")}')) == 1:
pagination_selector = f'.{pagination_tag.get("id")}'
return {
'status': 'ok',
'item_selector': item_selector,
'pagination_selector': pagination_selector,
'fields': fields
}
@@ -801,3 +835,24 @@ class SpiderManageApi(Resource):
'status': 'ok',
'message': 'success'
}
def upload(self):
    """Handle an uploaded spider zip archive.

    Saves the uploaded file into the project temp folder and unzips it
    into a sibling directory (archive name minus ``.zip``), replacing any
    previous extraction of the same name.

    NOTE(review): ``request.files[0]`` — Flask's ``request.files`` is a
    MultiDict keyed by form-field name, not position; confirm this works
    with the client's upload field.
    NOTE(review): the rejection branch returns ``'status': 'ok'`` together
    with an ``error`` message and HTTP 400 — looks inconsistent with a
    failure response; confirm the intended API contract.
    """
    # parsed args are currently unused; kept as-is (side effects unknown)
    args = self.parser.parse_args()
    f = request.files[0]
    # only zip archives are accepted
    if get_file_suffix(f.filename) != 'zip':
        return {
            'status': 'ok',
            'error': 'file type mismatch'
        }, 400
    # save zip file on temp folder
    file_path = '%s/%s' % (PROJECT_TMP_FOLDER, f.filename)
    with open(file_path, 'wb') as fw:
        fw.write(f.stream.read())
    # unzip zip file
    dir_path = file_path.replace('.zip', '')
    if os.path.exists(dir_path):
        # drop any stale extraction before unzipping the fresh archive
        shutil.rmtree(dir_path)
    unzip_file(file_path, dir_path)

View File

@@ -1,8 +1,6 @@
import axios from 'axios'
let baseUrl = process.env.VUE_APP_BASE_URL ? process.env.VUE_APP_BASE_URL : 'http://localhost:8000/api'
// console.log(process.env)
// const baseUrl = process.env.API_BASE_URL || 'http://localhost:8000/api'
const request = (method, path, params, data) => {
return new Promise((resolve, reject) => {

View File

@@ -58,6 +58,8 @@ export default {
'Extract Fields': '提取字段',
'Download': '下载',
'Download CSV': '下载CSV',
'Upload Zip File': '上传Zip文件',
'Upload': '上传',
// 主页
'Total Tasks': '总任务数',

View File

@@ -81,9 +81,16 @@
width="40%"
:visible.sync="addCustomizedDialogVisible"
:before-close="onAddCustomizedDialogClose">
<p>
{{$t('Please go to the source folder of your spiders, create a sub-folder and add your spider codes into it')}}
</p>
<el-form :model="spiderForm" ref="addConfigurableForm" inline-message>
<el-form-item :label="$t('Upload Zip File')" label-width="120px" name="site">
<el-upload
:action="$request.baseUrl + '/spiders/manage/upload'"
:on-success="onUploadSuccess"
:file-list="fileList">
<el-button size="small" type="primary">{{$t('Upload')}}</el-button>
</el-upload>
</el-form-item>
</el-form>
</el-dialog>
<!--./customized spider dialog-->
@@ -213,6 +220,7 @@
import {
mapState
} from 'vuex'
import ElUploadDrag from 'element-ui/packages/upload/src/upload-dragger'
export default {
name: 'SpiderList',
@@ -244,7 +252,8 @@ export default {
],
spiderFormRules: {
name: [{ required: true, message: 'Required Field', trigger: 'change' }]
}
},
fileList: []
}
},
computed: {
@@ -480,6 +489,8 @@ export default {
})
}
})
},
onUploadSuccess () {
}
},
created () {