added demo for general spiders

2026-01-22 17:31:03 +01:00 · 2020-02-03 10:30:04 +08:00
parent 6db457b038
commit be3235cefa
4 changed files with 54 additions and 7 deletions
--- a/backend/services/spider.go
+++ b/backend/services/spider.go
@@ -317,7 +317,7 @@ func InitSpiderService() error {
 				// 添加该爬虫到数据库
 				spider = model.Spider{
 					Id:          bson.NewObjectId(),
-					Name:        configData.Name,
+					Name:        spiderName,
 					DisplayName: configData.DisplayName,
 					Type:        constants.Customized,
 					Col:         configData.Col,
--- a/spiders/baidu_config/Spiderfile
+++ b/spiders/baidu_config/Spiderfile
@@ -9,16 +9,16 @@ start_stage: list
 stages:
 - name: list
  is_list: true
-  list_css: ""
-  list_xpath: //body
-  page_css: ""
-  page_xpath: //body
+  list_css: ".result.c-container"
+  list_xpath: ""
+  page_css: "a.n"
+  page_xpath: ""
  page_attr: href
  fields:
  - name: title
    css: ""
    xpath: .//h3/a
-    attr: href
+    attr: ""
    next_stage: ""
    remark: ""
  - name: url
@@ -30,7 +30,7 @@ stages:
  - name: abstract
    css: ""
    xpath: .//*[@class="c-abstract"]
-    attr: href
+    attr: ""
    next_stage: ""
    remark: ""
 settings:
--- a/spiders/bing_general/Spiderfile
+++ b/spiders/bing_general/Spiderfile
@@ -0,0 +1,6 @@
+name: "bing_general"
+display_name: "必应搜索 (通用)"
+remark: "必应搜索 Crawlab，列表+分页"
+col: "results_bing_general"
+type: "customized"
+cmd: "python bing_spider.py"
--- a/spiders/bing_general/bing_spider.py
+++ b/spiders/bing_general/bing_spider.py
@@ -0,0 +1,41 @@
+import requests
+from bs4 import BeautifulSoup as bs
+from urllib.parse import urljoin, urlparse
+import re
+from crawlab import save_item
+
+s = requests.Session()  # one session shared by every request_page call
+
+def get_real_url(response, url):
+    """Return url as an absolute URL, resolving it against response.url when needed."""
+    if re.match(r'https?://', url):
+        return url
+    if url.startswith('//'):  # protocol-relative: borrow the page's scheme, i.e. 'https' + ':' + '//host/x'
+        return urlparse(response.url).scheme + ':' + url
+    return urljoin(response.url, url)
+
+def start_requests():
+	"""Request nine Bing result pages for the query "crawlab" (first=1, 11, ..., 81)."""
+	for page in range(9):
+		marker = 'MORE' if page else 'PERE'  # only the first page is tagged PERE
+		request_page(f'https://cn.bing.com/search?q=crawlab&first={10 * page + 1}&FROM={marker}')
+
+def request_page(url):
+	"""Fetch url with the shared session (30 s timeout so a stalled request cannot hang the run) and parse it."""
+	print(f'requesting {url}')
+	parse_list(s.get(url, timeout=30, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}))
+
+def parse_list(response):
+	"""Save the title, url and abstract of each '#b_results > li' item found in response."""
+	soup = bs(response.content.decode('utf-8'), 'html.parser')  # explicit parser: no dependence on installed extras
+	for el in soup.select('#b_results > li'):
+		title, link, summary = el.select_one('h2'), el.select_one('h2 a'), el.select_one('.b_caption p')
+		if title is None or link is None or summary is None:
+			continue  # select_one found no match: item lacks a title link or caption, skip it as before
+		try:
+			save_item({'title': title.text, 'url': link.attrs.get('href'), 'abstract': summary.text})
+		except Exception as err:  # stay best-effort, but report the failure instead of hiding it
+			print(f'failed to save item: {err!r}')
+
+if __name__ == '__main__':
+	start_requests()  # run the crawl only when executed as a script, not on import