Files
crawlab/backend/template/spiderfile/Spiderfile.baidu
2019-12-13 12:55:53 +08:00

23 lines
592 B
Plaintext

---
# Configurable-spider definition that scrapes a Baidu search results page
# (query "crawlab") and follows pagination.

version: '0.4.0'
# NOTE(review): the name says "toscrapy_books" but start_url targets Baidu
# search -- looks copied from another template; confirm before renaming,
# since consumers may reference this identifier.
name: toscrapy_books
start_url: 'http://www.baidu.com/s?wd=crawlab'
# Must match the `name` of one of the entries under `stages`.
start_stage: list
engine: scrapy

stages:
  - name: list
    is_list: true
    # Each matched node is one result item; the field XPaths below are
    # relative to it (they start with ".//").
    list_xpath: '//*[contains(@class, "c-container")]'
    # Last anchor with class "n" inside the element with id "page";
    # its href is taken as the pagination link.
    page_xpath: '//*[@id="page"]//a[@class="n"][last()]'
    page_attr: href
    fields:
      # No `attr` given: presumably the node text is extracted -- TODO confirm.
      - name: title
        xpath: './/h3/a'
      # Same node as `title`, but reads the href attribute.
      - name: url
        xpath: './/h3/a'
        attr: href
      - name: abstract
        xpath: './/*[@class="c-abstract"]'

# Keys here use Scrapy setting names (engine is scrapy).
settings:
  ROBOTSTXT_OBEY: false
  USER_AGENT: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'