diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py index 0bfe8e4c..0f1772e9 100644 --- a/crawlab/routes/spiders.py +++ b/crawlab/routes/spiders.py @@ -371,7 +371,6 @@ class SpiderApi(BaseApi): # make source / destination src = os.path.join(dir_path, os.listdir(dir_path)[0]) - # src = dir_path dst = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id'))) # logging info @@ -511,11 +510,15 @@ class SpiderApi(BaseApi): return sel + @staticmethod + def _get_children(sel): + return [tag for tag in sel.getchildren() if type(tag) != etree._Comment] + @staticmethod def _get_text_child_tags(sel): tags = [] for tag in sel.iter(): - if tag.text is not None and tag.text.strip() != '': + if type(tag) != etree._Comment and tag.text is not None and tag.text.strip() != '': tags.append(tag) return tags @@ -530,6 +533,19 @@ class SpiderApi(BaseApi): return tags + @staticmethod + def _get_next_page_tag(sel): + next_page_text_list = [ + '下一页', + '下页', + 'next page', + 'next', + ] + for tag in sel.iter(): + if tag.text is not None and tag.text.lower().strip() in next_page_text_list: + return tag + return None + def preview_crawl(self, id: str): spider = db_manager.get(col_name='spiders', id=id) @@ -601,7 +617,7 @@ class SpiderApi(BaseApi): # iterate all child nodes in a top-down direction for tag in sel.iter(): # get child tags - child_tags = tag.getchildren() + child_tags = [t for t in tag.getchildren() if type(t) != etree._Comment] if len(child_tags) < threshold: # if number of child tags is below threshold, skip @@ -622,7 +638,7 @@ class SpiderApi(BaseApi): max_tag = None max_num = 0 for tag in list_tag_list: - _child_text_tags = self._get_text_child_tags(tag[0]) + _child_text_tags = self._get_text_child_tags(self._get_children(tag)[0]) if len(_child_text_tags) > max_num: max_tag = tag max_num = len(_child_text_tags) @@ -630,16 +646,24 @@ class SpiderApi(BaseApi): # get list item selector item_selector = None if max_tag.get('id') is not None: - item_selector 
= f'#{max_tag.get("id")} > {max_tag.getchildren()[0].tag}' + item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}' elif max_tag.get('class') is not None: if len(sel.cssselect(f'.{max_tag.get("class")}')) == 1: - item_selector = f'.{max_tag.get("class")} > {max_tag.getchildren()[0].tag}' + item_selector = f'.{max_tag.get("class")} > {self._get_children(max_tag)[0].tag}' # get list fields fields = [] if item_selector is not None: - for i, tag in enumerate(self._get_text_child_tags(max_tag[0])): - if tag.get('class') is not None: + first_tag = self._get_children(max_tag)[0] + for i, tag in enumerate(self._get_text_child_tags(first_tag)): + if len(first_tag.cssselect(f'{tag.tag}')) == 1: + fields.append({ + 'name': f'field{i + 1}', + 'type': 'css', + 'extract_type': 'text', + 'query': f'{tag.tag}', + }) + elif tag.get('class') is not None: cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != '']) # print(tag.tag + '.' + cls_str) if len(tag.cssselect(f'{tag.tag}.{cls_str}')) == 1: @@ -650,7 +674,7 @@ class SpiderApi(BaseApi): 'query': f'{tag.tag}.{cls_str}', }) - for i, tag in enumerate(self._get_a_child_tags(max_tag[0])): + for i, tag in enumerate(self._get_a_child_tags(self._get_children(max_tag)[0])): # if the tag is , extract its href if tag.get('class') is not None: cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != '']) @@ -662,9 +686,19 @@ class SpiderApi(BaseApi): 'query': f'{tag.tag}.{cls_str}', }) + # get pagination tag + pagination_selector = None + pagination_tag = self._get_next_page_tag(sel) + if pagination_tag is not None: + if pagination_tag.get('id') is not None: + pagination_selector = f'#{pagination_tag.get("id")}' + elif pagination_tag.get('class') is not None and len(sel.cssselect(f'.{pagination_tag.get("class")}')) == 1: + pagination_selector = f'.{pagination_tag.get("class")}' + return { 'status': 'ok', 'item_selector': item_selector, + 'pagination_selector': pagination_selector, 'fields': fields } 
@@ -801,3 +835,24 @@ class SpiderManageApi(Resource): 'status': 'ok', 'message': 'success' } + + def upload(self): + args = self.parser.parse_args() + f = request.files['file'] + + if get_file_suffix(f.filename) != 'zip': + return { + 'status': 'error', + 'error': 'file type mismatch' + }, 400 + + # save zip file on temp folder + file_path = '%s/%s' % (PROJECT_TMP_FOLDER, f.filename) + with open(file_path, 'wb') as fw: + fw.write(f.stream.read()) + + # unzip zip file + dir_path = file_path.replace('.zip', '') + if os.path.exists(dir_path): + shutil.rmtree(dir_path) + unzip_file(file_path, dir_path) diff --git a/frontend/src/api/request.js b/frontend/src/api/request.js index 1bc4d57e..aa165df0 100644 --- a/frontend/src/api/request.js +++ b/frontend/src/api/request.js @@ -1,8 +1,6 @@ import axios from 'axios' let baseUrl = process.env.VUE_APP_BASE_URL ? process.env.VUE_APP_BASE_URL : 'http://localhost:8000/api' -// console.log(process.env) -// const baseUrl = process.env.API_BASE_URL || 'http://localhost:8000/api' const request = (method, path, params, data) => { return new Promise((resolve, reject) => { diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js index 18813c3a..86bdda1a 100644 --- a/frontend/src/i18n/zh.js +++ b/frontend/src/i18n/zh.js @@ -58,6 +58,8 @@ export default { 'Extract Fields': '提取字段', 'Download': '下载', 'Download CSV': '下载CSV', + 'Upload Zip File': '上传Zip文件', + 'Upload': '上传', // 主页 'Total Tasks': '总任务数', diff --git a/frontend/src/views/spider/SpiderList.vue b/frontend/src/views/spider/SpiderList.vue index fd8be0e0..a756d550 100644 --- a/frontend/src/views/spider/SpiderList.vue +++ b/frontend/src/views/spider/SpiderList.vue @@ -81,9 +81,16 @@ width="40%" :visible.sync="addCustomizedDialogVisible" :before-close="onAddCustomizedDialogClose"> -

- {{$t('Please go to the source folder of your spiders, create a sub-folder and add your spider codes into it')}} -

+ + + + {{$t('Upload')}} + + + @@ -213,6 +220,7 @@ import { mapState } from 'vuex' +import ElUploadDrag from 'element-ui/packages/upload/src/upload-dragger' export default { name: 'SpiderList', @@ -244,7 +252,8 @@ export default { ], spiderFormRules: { name: [{ required: true, message: 'Required Field', trigger: 'change' }] - } + }, + fileList: [] } }, computed: { @@ -480,6 +489,8 @@ export default { }) } }) + }, + onUploadSuccess () { } }, created () {