updated auto field extraction

This commit is contained in:
Marvin Zhang
2019-06-04 07:55:51 +08:00
parent 91b9a8e78d
commit c0cec217d7
4 changed files with 81 additions and 15 deletions

View File

@@ -371,7 +371,6 @@ class SpiderApi(BaseApi):
# make source / destination
src = os.path.join(dir_path, os.listdir(dir_path)[0])
# src = dir_path
dst = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))
# logging info
@@ -511,11 +510,15 @@ class SpiderApi(BaseApi):
return sel
@staticmethod
def _get_children(sel):
    """Return the direct child elements of *sel*, excluding comment nodes.

    :param sel: an lxml element
    :return: list of child elements (lxml comment nodes filtered out)
    """
    # isinstance is the idiomatic type check (also covers subclasses of
    # the comment node type), unlike the exact-type comparison it replaces.
    return [tag for tag in sel.getchildren() if not isinstance(tag, etree._Comment)]
@staticmethod
def _get_text_child_tags(sel):
    """Return all descendant elements of *sel* that carry non-empty text.

    Comment nodes are skipped even when they contain text, so only real
    element tags are candidates for field extraction.

    :param sel: an lxml element (the subtree root to scan)
    :return: list of descendant elements with non-whitespace ``.text``
    """
    # NOTE: the source contained both the pre- and post-change version of
    # the condition (mangled diff); only the updated one is kept here.
    return [
        tag
        for tag in sel.iter()
        if not isinstance(tag, etree._Comment)
        and tag.text is not None
        and tag.text.strip() != ''
    ]
@@ -530,6 +533,19 @@ class SpiderApi(BaseApi):
return tags
@staticmethod
def _get_next_page_tag(sel):
next_page_text_list = [
'下一页',
'下页',
'next page',
'next',
]
for tag in sel.iter():
if tag.text is not None and tag.text.lower().strip() in next_page_text_list:
return tag
return None
def preview_crawl(self, id: str):
spider = db_manager.get(col_name='spiders', id=id)
@@ -601,7 +617,7 @@ class SpiderApi(BaseApi):
# iterate all child nodes in a top-down direction
for tag in sel.iter():
# get child tags
child_tags = tag.getchildren()
child_tags = [t for t in tag.getchildren() if type(t) != etree._Comment]
if len(child_tags) < threshold:
# if number of child tags is below threshold, skip
@@ -622,7 +638,7 @@ class SpiderApi(BaseApi):
max_tag = None
max_num = 0
for tag in list_tag_list:
_child_text_tags = self._get_text_child_tags(tag[0])
_child_text_tags = self._get_text_child_tags(self._get_children(tag)[0])
if len(_child_text_tags) > max_num:
max_tag = tag
max_num = len(_child_text_tags)
@@ -630,16 +646,24 @@ class SpiderApi(BaseApi):
# get list item selector
item_selector = None
if max_tag.get('id') is not None:
item_selector = f'#{max_tag.get("id")} > {max_tag.getchildren()[0].tag}'
item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}'
elif max_tag.get('class') is not None:
if len(sel.cssselect(f'.{max_tag.get("class")}')) == 1:
item_selector = f'.{max_tag.get("class")} > {max_tag.getchildren()[0].tag}'
item_selector = f'.{max_tag.get("class")} > {self._get_children(max_tag)[0].tag}'
# get list fields
fields = []
if item_selector is not None:
for i, tag in enumerate(self._get_text_child_tags(max_tag[0])):
if tag.get('class') is not None:
first_tag = self._get_children(max_tag)[0]
for i, tag in enumerate(self._get_text_child_tags(first_tag)):
if len(first_tag.cssselect(f'{tag.tag}')) == 1:
fields.append({
'name': f'field{i + 1}',
'type': 'css',
'extract_type': 'text',
'query': f'{tag.tag}',
})
elif tag.get('class') is not None:
cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
# print(tag.tag + '.' + cls_str)
if len(tag.cssselect(f'{tag.tag}.{cls_str}')) == 1:
@@ -650,7 +674,7 @@ class SpiderApi(BaseApi):
'query': f'{tag.tag}.{cls_str}',
})
for i, tag in enumerate(self._get_a_child_tags(max_tag[0])):
for i, tag in enumerate(self._get_a_child_tags(self._get_children(max_tag)[0])):
# if the tag is <a...></a>, extract its href
if tag.get('class') is not None:
cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
@@ -662,9 +686,19 @@ class SpiderApi(BaseApi):
'query': f'{tag.tag}.{cls_str}',
})
# get pagination tag
pagination_selector = None
pagination_tag = self._get_next_page_tag(sel)
if pagination_tag is not None:
if pagination_tag.get('id') is not None:
pagination_selector = f'#{pagination_tag.get("id")}'
elif pagination_tag.get('class') is not None and len(sel.cssselect(f'.{pagination_tag.get("id")}')) == 1:
pagination_selector = f'.{pagination_tag.get("id")}'
return {
'status': 'ok',
'item_selector': item_selector,
'pagination_selector': pagination_selector,
'fields': fields
}
@@ -801,3 +835,24 @@ class SpiderManageApi(Resource):
'status': 'ok',
'message': 'success'
}
def upload(self):
    """Handle an uploaded spider zip archive.

    Saves the uploaded file into the project temp folder and unzips it
    into a sibling directory (archive name minus ``.zip``), replacing any
    previous extraction of the same name.

    NOTE(review): ``request.files[0]`` — Flask's ``request.files`` is a
    MultiDict keyed by form-field name, not position; confirm this works
    with the client's upload field.
    NOTE(review): the rejection branch returns ``'status': 'ok'`` together
    with an ``error`` message and HTTP 400 — looks inconsistent with a
    failure response; confirm the intended API contract.
    """
    # parsed args are currently unused; kept as-is (side effects unknown)
    args = self.parser.parse_args()
    f = request.files[0]
    # only zip archives are accepted
    if get_file_suffix(f.filename) != 'zip':
        return {
            'status': 'ok',
            'error': 'file type mismatch'
        }, 400
    # save zip file on temp folder
    file_path = '%s/%s' % (PROJECT_TMP_FOLDER, f.filename)
    with open(file_path, 'wb') as fw:
        fw.write(f.stream.read())
    # unzip zip file
    dir_path = file_path.replace('.zip', '')
    if os.path.exists(dir_path):
        # drop any stale extraction before unzipping the fresh archive
        shutil.rmtree(dir_path)
    unzip_file(file_path, dir_path)

View File

@@ -1,8 +1,6 @@
import axios from 'axios'
let baseUrl = process.env.VUE_APP_BASE_URL ? process.env.VUE_APP_BASE_URL : 'http://localhost:8000/api'
// console.log(process.env)
// const baseUrl = process.env.API_BASE_URL || 'http://localhost:8000/api'
const request = (method, path, params, data) => {
return new Promise((resolve, reject) => {

View File

@@ -58,6 +58,8 @@ export default {
'Extract Fields': '提取字段',
'Download': '下载',
'Download CSV': '下载CSV',
'Upload Zip File': '上传Zip文件',
'Upload': '上传',
// 主页
'Total Tasks': '总任务数',

View File

@@ -81,9 +81,16 @@
width="40%"
:visible.sync="addCustomizedDialogVisible"
:before-close="onAddCustomizedDialogClose">
<p>
{{$t('Please go to the source folder of your spiders, create a sub-folder and add your spider codes into it')}}
</p>
<el-form :model="spiderForm" ref="addConfigurableForm" inline-message>
<el-form-item :label="$t('Upload Zip File')" label-width="120px" name="site">
<el-upload
:action="$request.baseUrl + '/spiders/manage/upload'"
:on-success="onUploadSuccess"
:file-list="fileList">
<el-button size="small" type="primary">{{$t('Upload')}}</el-button>
</el-upload>
</el-form-item>
</el-form>
</el-dialog>
<!--./customized spider dialog-->
@@ -213,6 +220,7 @@
import {
mapState
} from 'vuex'
import ElUploadDrag from 'element-ui/packages/upload/src/upload-dragger'
export default {
name: 'SpiderList',
@@ -244,7 +252,8 @@ export default {
],
spiderFormRules: {
name: [{ required: true, message: 'Required Field', trigger: 'change' }]
}
},
fileList: []
}
},
computed: {
@@ -480,6 +489,8 @@ export default {
})
}
})
},
onUploadSuccess () {
}
},
created () {