diff --git a/backend/services/spider.go b/backend/services/spider.go index 48777042..fc0a8d2f 100644 --- a/backend/services/spider.go +++ b/backend/services/spider.go @@ -317,7 +317,7 @@ func InitSpiderService() error { // 添加该爬虫到数据库 spider = model.Spider{ Id: bson.NewObjectId(), - Name: configData.Name, + Name: spiderName, DisplayName: configData.DisplayName, Type: constants.Customized, Col: configData.Col, diff --git a/spiders/baidu_config/Spiderfile b/spiders/baidu_config/Spiderfile index 5266b85b..a29d4acb 100644 --- a/spiders/baidu_config/Spiderfile +++ b/spiders/baidu_config/Spiderfile @@ -9,16 +9,16 @@ start_stage: list stages: - name: list is_list: true - list_css: "" - list_xpath: //body - page_css: "" - page_xpath: //body + list_css: ".result.c-container" + list_xpath: "" + page_css: "a.n" + page_xpath: "" page_attr: href fields: - name: title css: "" xpath: .//h3/a - attr: href + attr: "" next_stage: "" remark: "" - name: url @@ -30,7 +30,7 @@ stages: - name: abstract css: "" xpath: .//*[@class="c-abstract"] - attr: href + attr: "" next_stage: "" remark: "" settings: diff --git a/spiders/bing_general/Spiderfile b/spiders/bing_general/Spiderfile new file mode 100644 index 00000000..614c135e --- /dev/null +++ b/spiders/bing_general/Spiderfile @@ -0,0 +1,6 @@ +name: "bing_general" +display_name: "必应搜索 (通用)" +remark: "必应搜索 Crawlab,列表+分页" +col: "results_bing_general" +type: "customized" +cmd: "python bing_spider.py" \ No newline at end of file diff --git a/spiders/bing_general/bing_spider.py b/spiders/bing_general/bing_spider.py new file mode 100644 index 00000000..e982e4ee --- /dev/null +++ b/spiders/bing_general/bing_spider.py @@ -0,0 +1,41 @@ +import requests +from bs4 import BeautifulSoup as bs +from urllib.parse import urljoin, urlparse +import re +from crawlab import save_item + +s = requests.Session() + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return 
u.scheme + ':' + url + return urljoin(response.url, url) + +def start_requests(): + for i in range(0, 9): + fr = 'PERE' if not i else 'MORE' + url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}' + request_page(url) + +def request_page(url): + print(f'requesting {url}') + r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}) + parse_list(r) + +def parse_list(response): + soup = bs(response.content.decode('utf-8')) + for el in list(soup.select('#b_results > li')): + try: + save_item({ + 'title': el.select_one('h2').text, + 'url': el.select_one('h2 a').attrs.get('href'), + 'abstract': el.select_one('.b_caption p').text, + }) + except Exception: + pass + +if __name__ == '__main__': + start_requests()