Merge pull request #502 from crawlab-team/develop

Develop
This commit is contained in:
Marvin Zhang
2020-02-03 10:32:03 +08:00
committed by GitHub
6 changed files with 56 additions and 7 deletions

View File

@@ -3,6 +3,7 @@
 - **Interactive Tutorial**. Guide users through the main functionalities of Crawlab.
 - **Global Environment Variables**. Allow users to set global environment variables, which will be passed into all spider programs. [#177](https://github.com/crawlab-team/crawlab/issues/177)
 - **Project**. Allow users to link spiders to projects. [#316](https://github.com/crawlab-team/crawlab/issues/316)
+- **Demo Spiders**. Automatically add demo spiders when Crawlab is initialized. [#379](https://github.com/crawlab-team/crawlab/issues/379)
 - **User Admin Optimization**. Restrict privileges of admin users. [#456](https://github.com/crawlab-team/crawlab/issues/456)
 - **Settings Page Optimization**.

View File

@@ -3,6 +3,7 @@
 - **Interactive Tutorial**. Guide users through the main functionalities of Crawlab.
 - **Global Environment Variables**. Allow users to set global environment variables, which will be passed into all spider programs (see the sketch after this list). [#177](https://github.com/crawlab-team/crawlab/issues/177)
 - **Project**. Allow users to link spiders to projects. [#316](https://github.com/crawlab-team/crawlab/issues/316)
+- **Demo Spiders**. Added demo spiders when Crawlab is initialized. [#379](https://github.com/crawlab-team/crawlab/issues/379)
 - **User Admin Optimization**. Restrict privileges of admin users. [#456](https://github.com/crawlab-team/crawlab/issues/456)
 - **Setting Page Optimization**.
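The global environment variables from [#177] reach each spider as ordinary process environment variables, so a spider script can read them with the standard library. A minimal sketch, assuming a variable named CRAWLAB_KEYWORD has been configured (the variable name is illustrative, not part of Crawlab):

import os

# Read a hypothetical globally-configured environment variable inside a spider.
# 'CRAWLAB_KEYWORD' is an assumed name used here for illustration only.
keyword = os.environ.get('CRAWLAB_KEYWORD', 'crawlab')
print(f'search keyword: {keyword}')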

View File

@@ -317,7 +317,7 @@ func InitSpiderService() error {
 // Add this spider to the database
 spider = model.Spider{
 	Id:          bson.NewObjectId(),
-	Name:        configData.Name,
+	Name:        spiderName,
 	DisplayName: configData.DisplayName,
 	Type:        constants.Customized,
 	Col:         configData.Col,

View File

@@ -9,16 +9,16 @@ start_stage: list
 stages:
 - name: list
   is_list: true
-  list_css: ""
-  list_xpath: //body
-  page_css: ""
-  page_xpath: //body
+  list_css: ".result.c-container"
+  list_xpath: ""
+  page_css: "a.n"
+  page_xpath: ""
   page_attr: href
   fields:
   - name: title
     css: ""
     xpath: .//h3/a
-    attr: href
+    attr: ""
     next_stage: ""
     remark: ""
   - name: url
@@ -30,7 +30,7 @@ stages:
   - name: abstract
     css: ""
     xpath: .//*[@class="c-abstract"]
-    attr: href
+    attr: ""
     next_stage: ""
     remark: ""
 settings:
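For context, this Spiderfile drives Crawlab's configurable spider: list_css picks out each search-result container on a Baidu results page, and each entry under fields extracts one value from it. Below is a rough stand-alone sketch of what the updated title field expresses, with the XPath .//h3/a translated to CSS for illustration; the request URL and headers are assumptions, and the real engine lives in the Go backend:

import requests
from bs4 import BeautifulSoup

# Emulate list_css ".result.c-container" plus the title field (xpath .//h3/a).
html = requests.get('https://www.baidu.com/s?wd=crawlab',
                    headers={'User-Agent': 'Mozilla/5.0'}).text
soup = BeautifulSoup(html, 'html.parser')
for item in soup.select('.result.c-container'):  # one node per search result
    link = item.select_one('h3 a')
    if link:
        print(link.get_text(strip=True), link.get('href'))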

View File

@@ -0,0 +1,6 @@
name: "bing_general"
display_name: "Bing Search (General)"
remark: "Bing search for crawlab, list + pagination"
col: "results_bing_general"
type: "customized"
cmd: "python bing_spider.py"

View File

@@ -0,0 +1,41 @@
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse
import re
from crawlab import save_item

s = requests.Session()


def get_real_url(response, url):
    # Normalize absolute, protocol-relative and relative URLs.
    if re.search(r'^https?', url):
        return url
    elif re.search(r'^//', url):
        u = urlparse(response.url)
        return u.scheme + ':' + url
    return urljoin(response.url, url)


def start_requests():
    # Fetch the first 9 result pages (10 results per page, 1-based offsets).
    for i in range(0, 9):
        fr = 'PERE' if not i else 'MORE'
        url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}'
        request_page(url)


def request_page(url):
    print(f'requesting {url}')
    r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'})
    parse_list(r)


def parse_list(response):
    soup = bs(response.content.decode('utf-8'), 'html.parser')
    for el in soup.select('#b_results > li'):
        try:
            save_item({
                'title': el.select_one('h2').text,
                'url': el.select_one('h2 a').attrs.get('href'),
                'abstract': el.select_one('.b_caption p').text,
            })
        except AttributeError:
            # Skip entries (ads, widgets) that lack the expected result markup.
            continue


if __name__ == '__main__':
    start_requests()
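save_item comes from the Crawlab Python SDK and normally persists items into the collection configured above; it expects to run inside a Crawlab task. For a quick dry run outside Crawlab, a hypothetical local stand-in can be substituted (purely illustrative, not part of the SDK):

# Hypothetical stand-in for crawlab.save_item when testing the script locally;
# it only prints the item instead of persisting it to the results collection.
def save_item(item):
    print('item:', item)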