added demo for general spiders
@@ -317,7 +317,7 @@ func InitSpiderService() error {
     // Add this spider to the database
     spider = model.Spider{
         Id:          bson.NewObjectId(),
-        Name:        configData.Name,
+        Name:        spiderName,
         DisplayName: configData.DisplayName,
         Type:        constants.Customized,
         Col:         configData.Col,
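This one-line change registers each spider under `spiderName` (likely the on-disk directory name that `InitSpiderService` is iterating over, though the surrounding code is not shown in this hunk) rather than the `name` field parsed from its config file, so database records stay keyed to the spider folders, including the two demo spiders added below.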
@@ -9,16 +9,16 @@ start_stage: list
 stages:
 - name: list
   is_list: true
-  list_css: ""
-  list_xpath: //body
-  page_css: ""
-  page_xpath: //body
+  list_css: ".result.c-container"
+  list_xpath: ""
+  page_css: "a.n"
+  page_xpath: ""
   page_attr: href
   fields:
   - name: title
     css: ""
     xpath: .//h3/a
-    attr: href
+    attr: ""
     next_stage: ""
     remark: ""
   - name: url
@@ -30,7 +30,7 @@ stages:
   - name: abstract
     css: ""
     xpath: .//*[@class="c-abstract"]
-    attr: href
+    attr: ""
     next_stage: ""
     remark: ""
 settings:
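The two hunks above replace the demo configurable spider's empty placeholder selectors with real ones: `list_css: ".result.c-container"` selects one search-result block per list item, the `fields` extract values from within each block, and `page_css: "a.n"` with `page_attr: href` names the next-page link. A rough Python sketch of the crawl loop these settings describe (illustrative only: `run_list_stage` and the fetch loop are hypothetical, not Crawlab's actual engine; only the selectors come from the diff):

# Illustrative sketch of how a configurable spider could interpret the stage
# above. Only the selectors are taken from the Spiderfile diff.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def run_list_stage(start_url):
    url = start_url
    while url:
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        for item in soup.select('.result.c-container'):   # list_css
            title_el = item.select_one('h3 a')            # title field xpath .//h3/a, as CSS
            abstract_el = item.select_one('.c-abstract')  # abstract field xpath
            print({
                'title': title_el.text if title_el else None,
                'url': title_el.get('href') if title_el else None,
                'abstract': abstract_el.text if abstract_el else None,
            })
        next_el = soup.select_one('a.n')                  # page_css, follow page_attr: href
        url = urljoin(url, next_el['href']) if next_el else None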
spiders/bing_general/Spiderfile (new file)
@@ -0,0 +1,6 @@
+name: "bing_general"
+display_name: "Bing Search (General)"
+remark: "Bing search for Crawlab, list + pagination"
+col: "results_bing_general"
+type: "customized"
+cmd: "python bing_spider.py"
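A note on how the commit wires these fields together: `cmd` is the command Crawlab runs for each task (here launching the script below), `col` names the result collection that `save_item` writes into, and `type: "customized"` matches the `constants.Customized` value used in the Go hunk above.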
spiders/bing_general/bing_spider.py (new file)
@@ -0,0 +1,50 @@
+import re
+from urllib.parse import urljoin, urlparse
+
+import requests
+from bs4 import BeautifulSoup as bs
+from crawlab import save_item
+
+# One shared session so all page fetches reuse the same connection and cookies
+s = requests.Session()
+
+def get_real_url(response, url):
+    # Normalize an extracted href to an absolute URL (helper, unused below)
+    if re.search(r'^https?', url):
+        return url
+    # Protocol-relative (//host/path): prepend the scheme of the source page
+    elif re.search(r'^//', url):
+        u = urlparse(response.url)
+        return u.scheme + ':' + url
+    # Otherwise resolve relative to the source page
+    return urljoin(response.url, url)
+
+def start_requests():
+    # Bing paginates with first=1, 11, 21, ...: fetch the first 9 result pages
+    for i in range(0, 9):
+        fr = 'PERE' if not i else 'MORE'
+        url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}'
+        request_page(url)
+
+def request_page(url):
+    print(f'requesting {url}')
+    # A desktop browser User-Agent keeps Bing from serving a bot page
+    r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'})
+    parse_list(r)
+
+def parse_list(response):
+    soup = bs(response.content.decode('utf-8'), 'html.parser')
+    # Each organic result is an <li> under #b_results
+    for el in soup.select('#b_results > li'):
+        try:
+            save_item({
+                'title': el.select_one('h2').text,
+                'url': el.select_one('h2 a').attrs.get('href'),
+                'abstract': el.select_one('.b_caption p').text,
+            })
+        except AttributeError:
+            # Ads and widgets lack the expected markup: skip them
+            pass
+
+if __name__ == '__main__':
+    start_requests()
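The script depends on `save_item` from the Crawlab Python SDK, which persists each dict to the spider's result collection (per the `col` setting above) when run as a Crawlab task. For a quick smoke test outside Crawlab, one could swap in a stub (hypothetical shim, not part of the commit):

# Hypothetical local-run shim, not part of the commit: if the crawlab SDK
# is unavailable, print items instead of saving them to the collection.
try:
    from crawlab import save_item
except ImportError:
    def save_item(item):
        print('item:', item)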