mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-23 17:31:11 +01:00
620 lines
22 KiB
HTML
620 lines
22 KiB
HTML
|
|
<!DOCTYPE HTML>
|
|
<html lang="zh-Hans">
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
|
|
<title>可配置爬虫 · GitBook</title>
|
|
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
|
|
<meta name="description" content="">
|
|
<meta name="generator" content="GitBook 3.2.3">
|
|
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../../gitbook/style.css">
|
|
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../../gitbook/gitbook-plugin-highlight/website.css">
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../../gitbook/gitbook-plugin-search/search.css">
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../../gitbook/gitbook-plugin-fontsettings/website.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<meta name="HandheldFriendly" content="true"/>
|
|
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
|
|
<meta name="apple-mobile-web-app-capable" content="yes">
|
|
<meta name="apple-mobile-web-app-status-bar-style" content="black">
|
|
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="../../gitbook/images/apple-touch-icon-precomposed-152.png">
|
|
<link rel="shortcut icon" href="../../gitbook/images/favicon.ico" type="image/x-icon">
|
|
|
|
|
|
<link rel="next" href="Deploy.html" />
|
|
|
|
|
|
<link rel="prev" href="CustomizedSpider.html" />
|
|
|
|
|
|
</head>
|
|
<body>
|
|
|
|
<div class="book">
|
|
<div class="book-summary">
|
|
|
|
|
|
<div id="book-search-input" role="search">
|
|
<input type="text" placeholder="Type to search" />
|
|
</div>
|
|
|
|
|
|
<nav role="navigation">
|
|
|
|
|
|
|
|
<ul class="summary">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="chapter " data-level="1.1" data-path="../../">
|
|
|
|
<a href="../../">
|
|
|
|
|
|
Crawlab简介
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.2" data-path="../../Installation/">
|
|
|
|
<a href="../../Installation/">
|
|
|
|
|
|
安装Crawlab
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.2.1" data-path="../../Installation/Docker.html">
|
|
|
|
<a href="../../Installation/Docker.html">
|
|
|
|
|
|
Docker
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.2.2" data-path="../../Installation/Direct.html">
|
|
|
|
<a href="../../Installation/Direct.html">
|
|
|
|
|
|
直接部署
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.2.3" data-path="../../Installation/Preview.html">
|
|
|
|
<a href="../../Installation/Preview.html">
|
|
|
|
|
|
预览模式
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3" data-path="../">
|
|
|
|
<a href="../">
|
|
|
|
|
|
使用Crawlab
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.3.1" data-path="../Node/">
|
|
|
|
<a href="../Node/">
|
|
|
|
|
|
节点
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.3.1.1" data-path="../Node/View.html">
|
|
|
|
<a href="../Node/View.html">
|
|
|
|
|
|
查看节点列表
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.1.2" data-path="../Node/Edit.html">
|
|
|
|
<a href="../Node/Edit.html">
|
|
|
|
|
|
修改节点信息
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.2" data-path="./">
|
|
|
|
<a href="./">
|
|
|
|
|
|
爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.3.2.1" data-path="Create.html">
|
|
|
|
<a href="Create.html">
|
|
|
|
|
|
创建爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.3.2.1.1" data-path="CustomizedSpider.html">
|
|
|
|
<a href="CustomizedSpider.html">
|
|
|
|
|
|
自定义爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter active" data-level="1.3.2.1.2" data-path="ConfigurableSpider.html">
|
|
|
|
<a href="ConfigurableSpider.html">
|
|
|
|
|
|
可配置爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.2.2" data-path="Deploy.html">
|
|
|
|
<a href="Deploy.html">
|
|
|
|
|
|
部署爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.2.3" data-path="Run.html">
|
|
|
|
<a href="Run.html">
|
|
|
|
|
|
运行爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.2.4" data-path="Analytics.html">
|
|
|
|
<a href="Analytics.html">
|
|
|
|
|
|
统计数据
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.3" data-path="../Task/">
|
|
|
|
<a href="../Task/">
|
|
|
|
|
|
任务
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.3.3.1" data-path="../Task/View.html">
|
|
|
|
<a href="../Task/View.html">
|
|
|
|
|
|
查看任务
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.3.2" data-path="../Task/Action.html">
|
|
|
|
<a href="../Task/Action.html">
|
|
|
|
|
|
操作任务
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.3.3" data-path="../Task/DownloadResults.html">
|
|
|
|
<a href="../Task/DownloadResults.html">
|
|
|
|
|
|
下载结果
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.4" data-path="../Schedule/">
|
|
|
|
<a href="../Schedule/">
|
|
|
|
|
|
定时任务
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.5" data-path="../Site/">
|
|
|
|
<a href="../Site/">
|
|
|
|
|
|
网站
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.4" data-path="../../Architecture/">
|
|
|
|
<a href="../../Architecture/">
|
|
|
|
|
|
架构
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.5" data-path="../../Examples/">
|
|
|
|
<a href="../../Examples/">
|
|
|
|
|
|
样例
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.5.1" data-path="../../Examples/ScrapyIntegration.html">
|
|
|
|
<a href="../../Examples/ScrapyIntegration.html">
|
|
|
|
|
|
与Scrapy集成
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
<li class="divider"></li>
|
|
|
|
<li>
|
|
<a href="https://www.gitbook.com" target="_blank" rel="noopener noreferrer" class="gitbook-link">
|
|
Published with GitBook
|
|
</a>
|
|
</li>
|
|
</ul>
|
|
|
|
|
|
</nav>
|
|
|
|
|
|
</div>
|
|
|
|
<div class="book-body">
|
|
|
|
<div class="body-inner">
|
|
|
|
|
|
|
|
<div class="book-header" role="navigation">
|
|
|
|
|
|
<!-- Title -->
|
|
<h1>
|
|
<i class="fa fa-circle-o-notch fa-spin"></i>
|
|
<a href="../.." >可配置爬虫</a>
|
|
</h1>
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<div class="page-wrapper" tabindex="-1" role="main">
|
|
<div class="page-inner">
|
|
|
|
<div id="book-search-results">
|
|
<div class="search-noresults">
|
|
|
|
<section class="normal markdown-section">
|
|
|
|
<h2 id="可配置爬虫">可配置爬虫</h2>
|
|
<p>可配置爬虫是版本<a href="https://github.com/tikazyq/crawlab/releases/tag/v0.2.1" target="_blank">v0.2.1</a>开发的功能。目的是将具有相似网站结构的爬虫项目可配置化,将开发爬虫的过程流程化,大大提高爬虫开发效率。</p>
|
|
<p>Crawlab的可配置爬虫是基于Scrapy的,因此天生支持并发。而且,可配置爬虫完全支持<a href="CustomizedSpider.html">自定义爬虫</a>的一般功能,因此也支持任务调度、任务监控、日志监控、数据分析。</p>
|
|
<h3 id="添加爬虫">添加爬虫</h3>
|
|
<p>在<code>侧边栏</code>点击<code>爬虫</code>导航至<code>爬虫列表</code>,点击<strong>添加爬虫</strong>按钮。</p>
|
|
<p><img src="https://user-gold-cdn.xitu.io/2019/5/27/16af74ec408111a7?w=1662&h=702&f=png&s=98898" alt="爬虫列表"></p>
|
|
<p>点击<strong>可配置爬虫</strong>。</p>
|
|
<p><img src="https://user-gold-cdn.xitu.io/2019/5/27/16af74f4c75346da?w=1667&h=703&f=png&s=92067" alt="爬虫列表-添加爬虫"></p>
|
|
<p>输入完基本信息,点击<strong>添加</strong>。</p>
|
|
<p><img src="https://user-gold-cdn.xitu.io/2019/5/27/16af751c5d8d984d?w=1666&h=688&f=png&s=90926" alt="爬虫列表-爬虫信息"></p>
|
|
<h3 id="配置爬虫">配置爬虫</h3>
|
|
<p>添加完成后,可以看到刚刚添加的可配置爬虫出现在了最下方,点击<strong>查看</strong>进入到<strong>爬虫详情</strong>。</p>
|
|
<p><img src="https://user-gold-cdn.xitu.io/2019/5/27/16af754c6f000698?w=1645&h=739&f=png&s=103908" alt="爬虫列表-查看爬虫"></p>
|
|
<p>点击<strong>配置</strong>标签进入到配置页面。接下来,我们需要对爬虫规则进行配置。</p>
|
|
<p><img src="https://user-gold-cdn.xitu.io/2019/5/27/16af756d003eae66?w=1659&h=726&f=png&s=92224" alt="爬虫详情-配置页面"></p>
|
|
<p>这里已经有一些配置好的初始输入项。我们简单介绍一下各自的含义。</p>
|
|
<h4 id="抓取类别">抓取类别</h4>
|
|
<p>这是爬虫抓取采用的策略,也就是爬虫遍历网页是如何进行的。作为第一个版本,我们有<strong>仅列表页</strong>、<strong>仅详情页</strong>、<strong>列表+详情页</strong>。</p>
|
|
<ul>
|
|
<li>仅列表页。这也是最简单的形式,爬虫遍历列表上的列表项,将数据抓取下来。</li>
|
|
<li>仅详情页。爬虫只抓取详情页。</li>
|
|
<li>列表+详情页。爬虫先遍历列表页,将列表项中的详情页地址提取出来并跟进抓取详情页。</li>
|
|
</ul>
|
|
<p>这里我们选择<strong>列表+详情页</strong>。</p>
|
|
<h4 id="列表项选择器--分页选择器">列表项选择器 & 分页选择器</h4>
|
|
<p>列表项和分页按钮的匹配查询,由CSS选择器或XPath来进行匹配。</p>
|
|
<h4 id="开始url">开始URL</h4>
|
|
<p>爬虫最开始遍历的网址。</p>
|
|
<h4 id="遵守robots协议">遵守Robots协议</h4>
|
|
<p>这个默认是开启的。如果开启,爬虫将先抓取网站的robots.txt并判断页面是否可抓;否则,不会对此进行验证。用户可以选择将其关闭。请注意,任何无视Robots协议的行为都有法律风险。</p>
|
|
<h4 id="列表页字段--详情页字段">列表页字段 & 详情页字段</h4>
|
|
<p>这些都是在列表页或详情页中需要提取的字段。字段由CSS选择器或者XPath来匹配提取。可以选择文本或者属性。</p>
|
|
<p>在检查完目标网页的元素CSS选择器之后,我们输入列表项选择器、开始URL、列表页/详情页字段等信息。注意勾选url为详情页URL。</p>
|
|
<p><img src="https://user-gold-cdn.xitu.io/2019/5/27/16af7685423c7d57?w=1653&h=873&f=png&s=117230" alt="可配置爬虫-配置示例"></p>
|
|
<p>点击保存、预览,查看预览内容。</p>
|
|
<p><img src="https://user-gold-cdn.xitu.io/2019/5/27/16af769811d7bd0c?w=1720&h=663&f=png&s=123762" alt="可配置爬虫-预览结果"></p>
|
|
|
|
|
|
</section>
|
|
|
|
</div>
|
|
<div class="search-results">
|
|
<div class="has-results">
|
|
|
|
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
|
|
<ul class="search-results-list"></ul>
|
|
|
|
</div>
|
|
<div class="no-results">
|
|
|
|
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
|
|
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
<a href="CustomizedSpider.html" class="navigation navigation-prev " aria-label="Previous page: 自定义爬虫">
|
|
<i class="fa fa-angle-left"></i>
|
|
</a>
|
|
|
|
|
|
<a href="Deploy.html" class="navigation navigation-next " aria-label="Next page: 部署爬虫">
|
|
<i class="fa fa-angle-right"></i>
|
|
</a>
|
|
|
|
|
|
|
|
</div>
|
|
|
|
<script>
|
|
// Reuse the global `gitbook` value if it already exists; otherwise start with an empty array
// so callbacks can be queued before the theme scripts are available.
var gitbook = gitbook || [];
|
|
// Queue a callback that hands this page's metadata to gitbook.page.hasChanged.
// NOTE(review): presumably the queue is drained by gitbook.js, which is loaded further down the page — confirm.
gitbook.push(function() {
|
|
// Single object literal with `page`, `config`, `file`, `gitbook`, `basePath` and `book` entries
// describing the current page (title, level, next/previous pages) and the book configuration.
gitbook.page.hasChanged({"page":{"title":"可配置爬虫","level":"1.3.2.1.2","depth":4,"next":{"title":"部署爬虫","level":"1.3.2.2","depth":3,"path":"Usage/Spider/Deploy.md","ref":"Usage/Spider/Deploy.md","articles":[]},"previous":{"title":"自定义爬虫","level":"1.3.2.1.1","depth":4,"path":"Usage/Spider/CustomizedSpider.md","ref":"Usage/Spider/CustomizedSpider.md","articles":[]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":[],"pluginsConfig":{"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"Usage/Spider/ConfigurableSpider.md","mtime":"2019-06-16T04:28:54.000Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2019-06-16T14:03:57.361Z"},"basePath":"../..","book":{"language":""}});
|
|
});
|
|
</script>
|
|
</div>
|
|
|
|
|
|
<script src="../../gitbook/gitbook.js"></script>
|
|
<script src="../../gitbook/theme.js"></script>
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-search/search-engine.js"></script>
|
|
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-search/search.js"></script>
|
|
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
|
|
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
|
|
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-sharing/buttons.js"></script>
|
|
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
|
|
|
|
|
|
|
|
</body>
|
|
</html>
|
|
|