Merge pull request #502 from crawlab-team/develop

Develop
This commit is contained in:
Marvin Zhang
2020-02-03 10:32:03 +08:00
committed by GitHub
6 changed files with 56 additions and 7 deletions

View File

@@ -3,6 +3,7 @@
 - **Interactive Tutorial**. Guide users through the main functionalities of Crawlab.
 - **Global Environment Variables**. Allow users to set global environment variables, which will be passed into all spider programs. [#177](https://github.com/crawlab-team/crawlab/issues/177)
 - **Project**. Allow users to link spiders to projects. [#316](https://github.com/crawlab-team/crawlab/issues/316)
+- **Demo Spiders**. Automatically add demo spiders when Crawlab is initialized. [#379](https://github.com/crawlab-team/crawlab/issues/379)
 - **User Admin Optimization**. Restrict privileges of admin users. [#456](https://github.com/crawlab-team/crawlab/issues/456)
 - **Settings Page Optimization**.

View File

@@ -3,6 +3,7 @@
 - **Interactive Tutorial**. Guide users through the main functionalities of Crawlab.
 - **Global Environment Variables**. Allow users to set global environment variables, which will be passed into all spider programs (see the sketch after this list). [#177](https://github.com/crawlab-team/crawlab/issues/177)
 - **Project**. Allow users to link spiders to projects. [#316](https://github.com/crawlab-team/crawlab/issues/316)
+- **Demo Spiders**. Added demo spiders when Crawlab is initialized. [#379](https://github.com/crawlab-team/crawlab/issues/379)
 - **User Admin Optimization**. Restrict privileges of admin users. [#456](https://github.com/crawlab-team/crawlab/issues/456)
 - **Setting Page Optimization**.
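The global environment variables from [#177] reach each spider as ordinary process environment variables, so a spider script can read them with the standard library. A minimal sketch, assuming a variable named CRAWLAB_KEYWORD has been configured (the variable name is illustrative, not part of Crawlab):

import os

# Read a hypothetical globally-configured environment variable inside a spider.
# 'CRAWLAB_KEYWORD' is an assumed name used here for illustration only.
keyword = os.environ.get('CRAWLAB_KEYWORD', 'crawlab')
print(f'search keyword: {keyword}')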

View File

@@ -317,7 +317,7 @@ func InitSpiderService() error {
 // Add this spider to the database
 spider = model.Spider{
 	Id:          bson.NewObjectId(),
-	Name:        configData.Name,
+	Name:        spiderName,
 	DisplayName: configData.DisplayName,
 	Type:        constants.Customized,
 	Col:         configData.Col,

View File

@@ -9,16 +9,16 @@ start_stage: list
 stages:
 - name: list
   is_list: true
-  list_css: ""
-  list_xpath: //body
-  page_css: ""
-  page_xpath: //body
+  list_css: ".result.c-container"
+  list_xpath: ""
+  page_css: "a.n"
+  page_xpath: ""
   page_attr: href
   fields:
   - name: title
     css: ""
     xpath: .//h3/a
-    attr: href
+    attr: ""
     next_stage: ""
     remark: ""
   - name: url
@@ -30,7 +30,7 @@ stages:
   - name: abstract
     css: ""
     xpath: .//*[@class="c-abstract"]
-    attr: href
+    attr: ""
     next_stage: ""
     remark: ""
 settings:
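For context, this Spiderfile drives Crawlab's configurable spider: list_css picks out each search-result container on a Baidu results page, and each entry under fields extracts one value from it. Below is a rough stand-alone sketch of what the updated title field expresses, with the XPath .//h3/a translated to CSS for illustration; the request URL and headers are assumptions, and the real engine lives in the Go backend:

import requests
from bs4 import BeautifulSoup

# Emulate list_css ".result.c-container" plus the title field (xpath .//h3/a).
html = requests.get('https://www.baidu.com/s?wd=crawlab',
                    headers={'User-Agent': 'Mozilla/5.0'}).text
soup = BeautifulSoup(html, 'html.parser')
for item in soup.select('.result.c-container'):  # one node per search result
    link = item.select_one('h3 a')
    if link:
        print(link.get_text(strip=True), link.get('href'))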

View File

@@ -0,0 +1,6 @@
name: "bing_general"
display_name: "Bing Search (General)"
remark: "Bing search for crawlab, list + pagination"
col: "results_bing_general"
type: "customized"
cmd: "python bing_spider.py"

View File

@@ -0,0 +1,41 @@
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse
import re
from crawlab import save_item

s = requests.Session()


def get_real_url(response, url):
    # Normalize absolute, protocol-relative and relative URLs.
    if re.search(r'^https?', url):
        return url
    elif re.search(r'^//', url):
        u = urlparse(response.url)
        return u.scheme + ':' + url
    return urljoin(response.url, url)


def start_requests():
    # Fetch the first 9 result pages (10 results per page, 1-based offsets).
    for i in range(0, 9):
        fr = 'PERE' if not i else 'MORE'
        url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}'
        request_page(url)


def request_page(url):
    print(f'requesting {url}')
    r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'})
    parse_list(r)


def parse_list(response):
    soup = bs(response.content.decode('utf-8'), 'html.parser')
    for el in soup.select('#b_results > li'):
        try:
            save_item({
                'title': el.select_one('h2').text,
                'url': el.select_one('h2 a').attrs.get('href'),
                'abstract': el.select_one('.b_caption p').text,
            })
        except AttributeError:
            # Skip entries (ads, widgets) that lack the expected result markup.
            continue


if __name__ == '__main__':
    start_requests()
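save_item comes from the Crawlab Python SDK and normally persists items into the collection configured above; it expects to run inside a Crawlab task. For a quick dry run outside Crawlab, a hypothetical local stand-in can be substituted (purely illustrative, not part of the SDK):

# Hypothetical stand-in for crawlab.save_item when testing the script locally;
# it only prints the item instead of persisting it to the results collection.
def save_item(item):
    print('item:', item)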