mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-21 17:21:09 +01:00
603 lines
19 KiB
HTML
603 lines
19 KiB
HTML
|
|
<!DOCTYPE HTML>
|
|
<html lang="zh-Hans">
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
|
|
<title>自定义爬虫 · GitBook</title>
|
|
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
|
|
<meta name="description" content="">
|
|
<meta name="generator" content="GitBook 3.2.3">
|
|
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../../gitbook/style.css">
|
|
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../../gitbook/gitbook-plugin-highlight/website.css">
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../../gitbook/gitbook-plugin-search/search.css">
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../../gitbook/gitbook-plugin-fontsettings/website.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<meta name="HandheldFriendly" content="true"/>
|
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
<meta name="apple-mobile-web-app-capable" content="yes">
|
|
<meta name="apple-mobile-web-app-status-bar-style" content="black">
|
|
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="../../gitbook/images/apple-touch-icon-precomposed-152.png">
|
|
<link rel="shortcut icon" href="../../gitbook/images/favicon.ico" type="image/x-icon">
|
|
|
|
|
|
<link rel="next" href="ConfigurableSpider.html" />
|
|
|
|
|
|
<link rel="prev" href="Create.html" />
|
|
|
|
|
|
</head>
|
|
<body>
|
|
|
|
<div class="book">
|
|
<div class="book-summary">
|
|
|
|
|
|
<div id="book-search-input" role="search">
|
|
<input type="text" placeholder="Type to search" aria-label="Type to search" />
|
|
</div>
|
|
|
|
|
|
<nav role="navigation">
|
|
|
|
|
|
|
|
<ul class="summary">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="chapter " data-level="1.1" data-path="../../">
|
|
|
|
<a href="../../">
|
|
|
|
|
|
Crawlab简介
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.2" data-path="../../Installation/">
|
|
|
|
<a href="../../Installation/">
|
|
|
|
|
|
安装Crawlab
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.2.1" data-path="../../Installation/Docker.html">
|
|
|
|
<a href="../../Installation/Docker.html">
|
|
|
|
|
|
Docker
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.2.2" data-path="../../Installation/Direct.html">
|
|
|
|
<a href="../../Installation/Direct.html">
|
|
|
|
|
|
直接部署
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.2.3" data-path="../../Installation/Preview.html">
|
|
|
|
<a href="../../Installation/Preview.html">
|
|
|
|
|
|
预览模式
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3" data-path="../">
|
|
|
|
<a href="../">
|
|
|
|
|
|
使用Crawlab
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.3.1" data-path="../Node/">
|
|
|
|
<a href="../Node/">
|
|
|
|
|
|
节点
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.3.1.1" data-path="../Node/View.html">
|
|
|
|
<a href="../Node/View.html">
|
|
|
|
|
|
查看节点列表
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.1.2" data-path="../Node/Edit.html">
|
|
|
|
<a href="../Node/Edit.html">
|
|
|
|
|
|
修改节点信息
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.2" data-path="./">
|
|
|
|
<a href="./">
|
|
|
|
|
|
爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.3.2.1" data-path="Create.html">
|
|
|
|
<a href="Create.html">
|
|
|
|
|
|
创建爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter active" data-level="1.3.2.1.1" data-path="CustomizedSpider.html">
|
|
|
|
<a href="CustomizedSpider.html">
|
|
|
|
|
|
自定义爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.2.1.2" data-path="ConfigurableSpider.html">
|
|
|
|
<a href="ConfigurableSpider.html">
|
|
|
|
|
|
可配置爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.2.2" data-path="Deploy.html">
|
|
|
|
<a href="Deploy.html">
|
|
|
|
|
|
部署爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.2.3" data-path="Run.html">
|
|
|
|
<a href="Run.html">
|
|
|
|
|
|
运行爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.2.4" data-path="Analytics.html">
|
|
|
|
<a href="Analytics.html">
|
|
|
|
|
|
统计数据
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.3" data-path="../Task/">
|
|
|
|
<a href="../Task/">
|
|
|
|
|
|
任务
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.3.3.1" data-path="../Task/View.html">
|
|
|
|
<a href="../Task/View.html">
|
|
|
|
|
|
查看任务
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.3.2" data-path="../Task/Action.html">
|
|
|
|
<a href="../Task/Action.html">
|
|
|
|
|
|
操作任务
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.3.3" data-path="../Task/DownloadResults.html">
|
|
|
|
<a href="../Task/DownloadResults.html">
|
|
|
|
|
|
下载结果
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.4" data-path="../Schedule/">
|
|
|
|
<a href="../Schedule/">
|
|
|
|
|
|
定时任务
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.5" data-path="../Site/">
|
|
|
|
<a href="../Site/">
|
|
|
|
|
|
网站
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.4" data-path="../../Architecture/">
|
|
|
|
<a href="../../Architecture/">
|
|
|
|
|
|
架构
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.5" data-path="../../Examples/">
|
|
|
|
<a href="../../Examples/">
|
|
|
|
|
|
样例
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.5.1" data-path="../../Examples/ScrapyIntegration.html">
|
|
|
|
<a href="../../Examples/ScrapyIntegration.html">
|
|
|
|
|
|
与Scrapy集成
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
<li class="divider"></li>
|
|
|
|
<li>
|
|
<a href="https://www.gitbook.com" target="_blank" rel="noopener noreferrer" class="gitbook-link">
|
|
Published with GitBook
|
|
</a>
|
|
</li>
|
|
</ul>
|
|
|
|
|
|
</nav>
|
|
|
|
|
|
</div>
|
|
|
|
<div class="book-body">
|
|
|
|
<div class="body-inner">
|
|
|
|
|
|
|
|
<div class="book-header" role="navigation">
|
|
|
|
|
|
<!-- Title -->
|
|
<h1>
|
|
<i class="fa fa-circle-o-notch fa-spin"></i>
|
|
<a href="../.." >自定义爬虫</a>
|
|
</h1>
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<div class="page-wrapper" tabindex="-1" role="main">
|
|
<div class="page-inner">
|
|
|
|
<div id="book-search-results">
|
|
<div class="search-noresults">
|
|
|
|
<section class="normal markdown-section">
|
|
|
|
<h2 id="自定义爬虫">自定义爬虫</h2>
|
|
<p>自定义爬虫是指用户可以添加的任何语言任何框架的爬虫,高度自定义化。当用户添加好自定义爬虫之后,Crawlab就可以将其集成到爬虫管理的系统中来。</p>
|
|
<p>自定义爬虫的添加有两种方式:</p>
|
|
<ol>
|
|
<li>通过Web界面上传爬虫</li>
|
|
<li>通过添加项目目录</li>
|
|
</ol>
|
|
<h3 id="通过web界面上传">通过Web界面上传</h3>
|
|
<p>在通过Web界面上传之前,需要将爬虫项目文件打包成<code>zip</code>格式。</p>
|
|
<p><img src="https://crawlab.oss-cn-hangzhou.aliyuncs.com/gitbook/spider-list.png" alt="Crawlab 爬虫列表界面截图"></p>
|
|
<p>然后,在<code>侧边栏</code>点击<code>爬虫</code>导航至<code>爬虫列表</code>,点击<code>添加爬虫</code>按钮,选择<code>自定义爬虫</code>,点击<code>上传</code>按钮,选择刚刚打包好的<code>zip</code>文件。上传成功后,<code>爬虫列表</code>中会出现新添加的自定义爬虫。这样就算添加好了。</p>
|
|
<p>这种方式稍微有些繁琐,但是在无法轻松获取服务器读写权限的情况下非常有用,适合在生产环境中使用。</p>
|
|
<h3 id="通过添加项目目录">通过添加项目目录</h3>
|
|
<p>Crawlab会自动发现<code>PROJECT_SOURCE_FILE_FOLDER</code>目录下的所有爬虫目录,将它们生成为自定义爬虫并集成到Crawlab中。因此,只需将爬虫项目目录拷贝到<code>PROJECT_SOURCE_FILE_FOLDER</code>目录下,就可以添加一个爬虫了。</p>
|
|
<p>这种方式非常方便,但是需要获得主机服务器的读写权限,因而比较适合在开发环境上采用。</p>
|
|
<h3 id="配置爬虫">配置爬虫</h3>
|
|
<p>在自定义爬虫中,我们需要配置一下<code>执行命令</code>(运行爬虫时后台执行的<code>shell</code>命令)和<code>结果集</code>(通过<code>CRAWLAB_COLLECTION</code>传递给爬虫程序,即爬虫程序存储结果的地方),然后点击<code>保存</code>按钮保存爬虫信息。</p>
|
|
<p><img src="https://crawlab.oss-cn-hangzhou.aliyuncs.com/gitbook/spider-detail-overview.png" alt="Crawlab 爬虫详情概览界面截图"></p>
|
|
<p>接下来,我们就可以部署、运行自定义爬虫了。</p>
|
|
|
|
|
|
</section>
|
|
|
|
</div>
|
|
<div class="search-results">
|
|
<div class="has-results">
|
|
|
|
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
|
|
<ul class="search-results-list"></ul>
|
|
|
|
</div>
|
|
<div class="no-results">
|
|
|
|
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
|
|
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
<a href="Create.html" class="navigation navigation-prev " aria-label="Previous page: 创建爬虫">
|
|
<i class="fa fa-angle-left"></i>
|
|
</a>
|
|
|
|
|
|
<a href="ConfigurableSpider.html" class="navigation navigation-next " aria-label="Next page: 可配置爬虫">
|
|
<i class="fa fa-angle-right"></i>
|
|
</a>
|
|
|
|
|
|
|
|
</div>
|
|
|
|
<script>
|
|
var gitbook = gitbook || [];
|
|
gitbook.push(function() {
|
|
gitbook.page.hasChanged({"page":{"title":"自定义爬虫","level":"1.3.2.1.1","depth":4,"next":{"title":"可配置爬虫","level":"1.3.2.1.2","depth":4,"path":"Usage/Spider/ConfigurableSpider.md","ref":"Usage/Spider/ConfigurableSpider.md","articles":[]},"previous":{"title":"创建爬虫","level":"1.3.2.1","depth":3,"path":"Usage/Spider/Create.md","ref":"Usage/Spider/Create.md","articles":[{"title":"自定义爬虫","level":"1.3.2.1.1","depth":4,"path":"Usage/Spider/CustomizedSpider.md","ref":"Usage/Spider/CustomizedSpider.md","articles":[]},{"title":"可配置爬虫","level":"1.3.2.1.2","depth":4,"path":"Usage/Spider/ConfigurableSpider.md","ref":"Usage/Spider/ConfigurableSpider.md","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":[],"pluginsConfig":{"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"Usage/Spider/CustomizedSpider.md","mtime":"2019-06-16T04:40:31.000Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2019-06-16T14:03:57.361Z"},"basePath":"../..","book":{"language":""}});
|
|
});
|
|
</script>
|
|
</div>
|
|
|
|
|
|
<script src="../../gitbook/gitbook.js"></script>
|
|
<script src="../../gitbook/theme.js"></script>
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-search/search-engine.js"></script>
|
|
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-search/search.js"></script>
|
|
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
|
|
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
|
|
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-sharing/buttons.js"></script>
|
|
|
|
|
|
|
|
<script src="../../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
|
|
|
|
|
|
|
|
</body>
|
|
</html>
|
|
|