mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
updated auto field extraction
This commit is contained in:
@@ -371,7 +371,6 @@ class SpiderApi(BaseApi):
|
||||
|
||||
# make source / destination
|
||||
src = os.path.join(dir_path, os.listdir(dir_path)[0])
|
||||
# src = dir_path
|
||||
dst = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))
|
||||
|
||||
# logging info
|
||||
@@ -511,11 +510,15 @@ class SpiderApi(BaseApi):
|
||||
|
||||
return sel
|
||||
|
||||
@staticmethod
def _get_children(sel):
    """Return the direct children of ``sel`` that are not comment nodes.

    :param sel: element whose children are inspected (must provide ``getchildren()``)
    :return: list of the non-comment child nodes, in the order ``getchildren()`` yields them
    """
    # isinstance (instead of an exact type comparison) also rejects subclasses
    # of etree._Comment, so comment nodes are filtered whichever parser built the tree
    return [tag for tag in sel.getchildren() if not isinstance(tag, etree._Comment)]
|
||||
|
||||
@staticmethod
def _get_text_child_tags(sel):
    """Collect the nodes yielded by ``sel.iter()`` that carry non-blank text.

    Comment nodes are skipped: they expose their content through ``.text`` as
    well, but they are not extractable fields.

    :param sel: element whose subtree is scanned via ``sel.iter()``
    :return: list of nodes whose ``text`` is not None and not only whitespace
    """
    return [
        tag for tag in sel.iter()
        # isinstance also covers subclasses of etree._Comment
        if not isinstance(tag, etree._Comment)
        and tag.text is not None
        and tag.text.strip() != ''
    ]
|
||||
|
||||
@@ -530,6 +533,19 @@ class SpiderApi(BaseApi):
|
||||
|
||||
return tags
|
||||
|
||||
@staticmethod
def _get_next_page_tag(sel):
    """Find the first node whose text is a known "next page" label.

    The node text is lower-cased and stripped before it is compared with the
    label list, so case and surrounding whitespace do not matter.

    :param sel: element whose subtree is scanned via ``sel.iter()``
    :return: the first matching node, or None when no node matches
    """
    labels = ('下一页', '下页', 'next page', 'next')
    for node in sel.iter():
        text = node.text
        if text is None:
            # nodes without text can never be a pagination label
            continue
        if text.lower().strip() in labels:
            return node
    return None
|
||||
|
||||
def preview_crawl(self, id: str):
|
||||
spider = db_manager.get(col_name='spiders', id=id)
|
||||
|
||||
@@ -601,7 +617,7 @@ class SpiderApi(BaseApi):
|
||||
# iterate all child nodes in a top-down direction
|
||||
for tag in sel.iter():
|
||||
# get child tags
|
||||
child_tags = tag.getchildren()
|
||||
child_tags = [t for t in tag.getchildren() if type(t) != etree._Comment]
|
||||
|
||||
if len(child_tags) < threshold:
|
||||
# if number of child tags is below threshold, skip
|
||||
@@ -622,7 +638,7 @@ class SpiderApi(BaseApi):
|
||||
max_tag = None
|
||||
max_num = 0
|
||||
for tag in list_tag_list:
|
||||
_child_text_tags = self._get_text_child_tags(tag[0])
|
||||
_child_text_tags = self._get_text_child_tags(self._get_children(tag)[0])
|
||||
if len(_child_text_tags) > max_num:
|
||||
max_tag = tag
|
||||
max_num = len(_child_text_tags)
|
||||
@@ -630,16 +646,24 @@ class SpiderApi(BaseApi):
|
||||
# get list item selector
|
||||
item_selector = None
|
||||
if max_tag.get('id') is not None:
|
||||
item_selector = f'#{max_tag.get("id")} > {max_tag.getchildren()[0].tag}'
|
||||
item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}'
|
||||
elif max_tag.get('class') is not None:
|
||||
if len(sel.cssselect(f'.{max_tag.get("class")}')) == 1:
|
||||
item_selector = f'.{max_tag.get("class")} > {max_tag.getchildren()[0].tag}'
|
||||
item_selector = f'.{max_tag.get("class")} > {self._get_children(max_tag)[0].tag}'
|
||||
|
||||
# get list fields
|
||||
fields = []
|
||||
if item_selector is not None:
|
||||
for i, tag in enumerate(self._get_text_child_tags(max_tag[0])):
|
||||
if tag.get('class') is not None:
|
||||
first_tag = self._get_children(max_tag)[0]
|
||||
for i, tag in enumerate(self._get_text_child_tags(first_tag)):
|
||||
if len(first_tag.cssselect(f'{tag.tag}')) == 1:
|
||||
fields.append({
|
||||
'name': f'field{i + 1}',
|
||||
'type': 'css',
|
||||
'extract_type': 'text',
|
||||
'query': f'{tag.tag}',
|
||||
})
|
||||
elif tag.get('class') is not None:
|
||||
cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
|
||||
# print(tag.tag + '.' + cls_str)
|
||||
if len(tag.cssselect(f'{tag.tag}.{cls_str}')) == 1:
|
||||
@@ -650,7 +674,7 @@ class SpiderApi(BaseApi):
|
||||
'query': f'{tag.tag}.{cls_str}',
|
||||
})
|
||||
|
||||
for i, tag in enumerate(self._get_a_child_tags(max_tag[0])):
|
||||
for i, tag in enumerate(self._get_a_child_tags(self._get_children(max_tag)[0])):
|
||||
# if the tag is <a...></a>, extract its href
|
||||
if tag.get('class') is not None:
|
||||
cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
|
||||
@@ -662,9 +686,19 @@ class SpiderApi(BaseApi):
|
||||
'query': f'{tag.tag}.{cls_str}',
|
||||
})
|
||||
|
||||
# get pagination tag
pagination_selector = None
pagination_tag = self._get_next_page_tag(sel)
if pagination_tag is not None:
    if pagination_tag.get('id') is not None:
        # an id is the most specific handle, use it directly
        pagination_selector = f'#{pagination_tag.get("id")}'
    elif pagination_tag.get('class') is not None:
        # the tag has no id in this branch, so the selector has to be built
        # from its class attribute; several class names are chained as
        # ".a.b" so the result stays a single compound selector
        cls_str = '.'.join(pagination_tag.get('class').split())
        # only accept the class selector when it identifies exactly one node
        if cls_str != '' and len(sel.cssselect(f'.{cls_str}')) == 1:
            pagination_selector = f'.{cls_str}'
|
||||
|
||||
return {
|
||||
'status': 'ok',
|
||||
'item_selector': item_selector,
|
||||
'pagination_selector': pagination_selector,
|
||||
'fields': fields
|
||||
}
|
||||
|
||||
@@ -801,3 +835,24 @@ class SpiderManageApi(Resource):
|
||||
'status': 'ok',
|
||||
'message': 'success'
|
||||
}
|
||||
|
||||
def upload(self):
    """Receive an uploaded spider zip file, store it in the temp folder and unpack it.

    The archive is written to ``PROJECT_TMP_FOLDER`` under its own file name
    and extracted into a sibling directory named after the archive without
    its extension; an existing directory of that name is removed first.

    :return: ``(body, 400)`` when no file was sent or it is not a zip file;
             otherwise nothing is returned explicitly
    """
    # run the request parser as before; its parsed values are not used here
    self.parser.parse_args()

    # uploaded files are keyed by form field name, not by position.
    # NOTE(review): the front-end el-upload does not set `name`, so this
    # presumably arrives under its default field name 'file' - confirm
    f = request.files.get('file')
    if f is None:
        return {
            'status': 'ok',
            'error': 'file is required'
        }, 400

    # the file name comes from the client: drop any directory components so
    # the archive cannot be written outside of the temp folder
    filename = os.path.basename(f.filename or '')

    if get_file_suffix(filename) != 'zip':
        return {
            'status': 'ok',
            'error': 'file type mismatch'
        }, 400

    # save zip file on temp folder (streamed, so the whole upload is never held in memory)
    file_path = os.path.join(PROJECT_TMP_FOLDER, filename)
    with open(file_path, 'wb') as fw:
        shutil.copyfileobj(f.stream, fw)

    # unzip zip file; strip only the trailing extension so a ".zip" occurring
    # elsewhere in the path is left untouched
    dir_path = os.path.splitext(file_path)[0]
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)
    unzip_file(file_path, dir_path)
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
import axios from 'axios'
|
||||
|
||||
let baseUrl = process.env.VUE_APP_BASE_URL ? process.env.VUE_APP_BASE_URL : 'http://localhost:8000/api'
|
||||
// console.log(process.env)
|
||||
// const baseUrl = process.env.API_BASE_URL || 'http://localhost:8000/api'
|
||||
|
||||
const request = (method, path, params, data) => {
|
||||
return new Promise((resolve, reject) => {
|
||||
|
||||
@@ -58,6 +58,8 @@ export default {
|
||||
'Extract Fields': '提取字段',
|
||||
'Download': '下载',
|
||||
'Download CSV': '下载CSV',
|
||||
'Upload Zip File': '上传Zip文件',
|
||||
'Upload': '上传',
|
||||
|
||||
// 主页
|
||||
'Total Tasks': '总任务数',
|
||||
|
||||
@@ -81,9 +81,16 @@
|
||||
width="40%"
|
||||
:visible.sync="addCustomizedDialogVisible"
|
||||
:before-close="onAddCustomizedDialogClose">
|
||||
<p>
|
||||
{{$t('Please go to the source folder of your spiders, create a sub-folder and add your spider codes into it')}}
|
||||
</p>
|
||||
<el-form :model="spiderForm" ref="addConfigurableForm" inline-message>
|
||||
<el-form-item :label="$t('Upload Zip File')" label-width="120px" name="site">
|
||||
<el-upload
|
||||
:action="$request.baseUrl + '/spiders/manage/upload'"
|
||||
:on-success="onUploadSuccess"
|
||||
:file-list="fileList">
|
||||
<el-button size="small" type="primary">{{$t('Upload')}}</el-button>
|
||||
</el-upload>
|
||||
</el-form-item>
|
||||
</el-form>
|
||||
</el-dialog>
|
||||
<!--./customized spider dialog-->
|
||||
|
||||
@@ -213,6 +220,7 @@
|
||||
import {
|
||||
mapState
|
||||
} from 'vuex'
|
||||
import ElUploadDrag from 'element-ui/packages/upload/src/upload-dragger'
|
||||
|
||||
export default {
|
||||
name: 'SpiderList',
|
||||
@@ -244,7 +252,8 @@ export default {
|
||||
],
|
||||
spiderFormRules: {
|
||||
name: [{ required: true, message: 'Required Field', trigger: 'change' }]
|
||||
}
|
||||
},
|
||||
fileList: []
|
||||
}
|
||||
},
|
||||
computed: {
|
||||
@@ -480,6 +489,8 @@ export default {
|
||||
})
|
||||
}
|
||||
})
|
||||
},
|
||||
onUploadSuccess () {
|
||||
}
|
||||
},
|
||||
created () {
|
||||
|
||||
Reference in New Issue
Block a user