mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
588 lines
24 KiB
HTML
588 lines
24 KiB
HTML
|
|
<!DOCTYPE HTML>
|
|
<html lang="zh-Hans">
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
|
|
<title>简介 · GitBook</title>
|
|
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
|
|
<meta name="description" content="">
|
|
<meta name="generator" content="GitBook 3.2.3">
|
|
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="gitbook/style.css">
|
|
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="gitbook/gitbook-plugin-highlight/website.css">
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="gitbook/gitbook-plugin-search/search.css">
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="gitbook/gitbook-plugin-fontsettings/website.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<meta name="HandheldFriendly" content="true"/>
|
|
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
|
|
<meta name="apple-mobile-web-app-capable" content="yes">
|
|
<meta name="apple-mobile-web-app-status-bar-style" content="black">
|
|
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="gitbook/images/apple-touch-icon-precomposed-152.png">
|
|
<link rel="shortcut icon" href="gitbook/images/favicon.ico" type="image/x-icon">
|
|
|
|
|
|
<link rel="next" href="QuickStart/" />
|
|
|
|
|
|
|
|
</head>
|
|
<body>
|
|
|
|
<div class="book">
|
|
<div class="book-summary">
|
|
|
|
|
|
<div id="book-search-input" role="search">
|
|
<input type="text" placeholder="Type to search" />
|
|
</div>
|
|
|
|
|
|
<nav role="navigation">
|
|
|
|
|
|
|
|
<ul class="summary">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="chapter active" data-level="1.1" data-path="./">
|
|
|
|
<a href="./">
|
|
|
|
|
|
简介
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.2" data-path="QuickStart/">
|
|
|
|
<a href="QuickStart/">
|
|
|
|
|
|
快速开始
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.2.1" data-path="QuickStart/Installation.html">
|
|
|
|
<a href="QuickStart/Installation.html">
|
|
|
|
|
|
安装
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.2.2" data-path="QuickStart/Run.html">
|
|
|
|
<a href="QuickStart/Run.html">
|
|
|
|
|
|
运行
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3" data-path="Concept/">
|
|
|
|
<a href="Concept/">
|
|
|
|
|
|
概念
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.3.1" data-path="Concept/Node.html">
|
|
|
|
<a href="Concept/Node.html">
|
|
|
|
|
|
节点
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.2" data-path="Concept/Spider.html">
|
|
|
|
<a href="Concept/Spider.html">
|
|
|
|
|
|
爬虫
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.3" data-path="Concept/Task.html">
|
|
|
|
<a href="Concept/Task.html">
|
|
|
|
|
|
任务
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.4" data-path="Concept/Deploy.html">
|
|
|
|
<a href="Concept/Deploy.html">
|
|
|
|
|
|
部署
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.4" data-path="Architecture/">
|
|
|
|
<a href="Architecture/">
|
|
|
|
|
|
架构
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.4.1" data-path="Architecture/Celery.html">
|
|
|
|
<a href="Architecture/Celery.html">
|
|
|
|
|
|
Celery
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.4.2" data-path="Architecture/App.html">
|
|
|
|
<a href="Architecture/App.html">
|
|
|
|
|
|
App
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.5" data-path="Examples/">
|
|
|
|
<a href="Examples/">
|
|
|
|
|
|
Examples
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.5.1" data-path="Examples/">
|
|
|
|
<a href="Examples/">
|
|
|
|
|
|
与Scrapy集成
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.5.2" data-path="Examples/">
|
|
|
|
<a href="Examples/">
|
|
|
|
|
|
与Puppeteer集成
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
<li class="divider"></li>
|
|
|
|
<li>
|
|
<a href="https://www.gitbook.com" target="_blank" rel="noopener" class="gitbook-link">
|
|
Published with GitBook
|
|
</a>
|
|
</li>
|
|
</ul>
|
|
|
|
|
|
</nav>
|
|
|
|
|
|
</div>
|
|
|
|
<div class="book-body">
|
|
|
|
<div class="body-inner">
|
|
|
|
|
|
|
|
<div class="book-header" role="navigation">
|
|
|
|
|
|
<!-- Title -->
|
|
<h1>
|
|
<i class="fa fa-circle-o-notch fa-spin"></i>
|
|
<a href="." >简介</a>
|
|
</h1>
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<div class="page-wrapper" tabindex="-1" role="main">
|
|
<div class="page-inner">
|
|
|
|
<div id="book-search-results">
|
|
<div class="search-noresults">
|
|
|
|
<section class="normal markdown-section">
|
|
|
|
<h1 id="crawlab">Crawlab</h1>
|
|
<p>基于Celery的分布式爬虫管理平台,支持多种编程语言以及多种爬虫框架。</p>
|
|
<p><a href="http://139.129.230.98:8080" target="_blank">查看演示 Demo</a></p>
|
|
<p><a href="https://github.com/tikazyq/crawlab/blob/master/README.md" target="_blank">English Documentation</a></p>
|
|
<h2 id="要求">要求</h2>
|
|
<ul>
|
|
<li>Python3</li>
|
|
<li>MongoDB</li>
|
|
<li>Redis</li>
|
|
</ul>
|
|
<h2 id="安装">安装</h2>
|
|
<pre><code class="lang-bash"><span class="hljs-comment"># 安装后台类库</span>
|
|
pip install -r requirements.txt
|
|
</code></pre>
|
|
<pre><code class="lang-bash"><span class="hljs-comment"># 安装前台类库</span>
|
|
<span class="hljs-built_in">cd</span> frontend
|
|
npm install
|
|
</code></pre>
|
|
<h2 id="配置">配置</h2>
|
|
<p>请更改配置文件<code>config.py</code>,配置API和数据库连接。</p>
|
|
<h2 id="快速开始">快速开始</h2>
|
|
<pre><code class="lang-bash"><span class="hljs-comment"># 启动后端API</span>
|
|
python app.py
|
|
|
|
<span class="hljs-comment"># 启动Flower服务</span>
|
|
python ./bin/run_flower.py
|
|
|
|
<span class="hljs-comment"># 启动worker</span>
|
|
python ./bin/run_worker.py
|
|
</code></pre>
|
|
<pre><code class="lang-bash"><span class="hljs-comment"># 运行前端</span>
|
|
<span class="hljs-built_in">cd</span> frontend
|
|
npm run serve
|
|
</code></pre>
|
|
<h2 id="截图">截图</h2>
|
|
<h4 id="首页">首页</h4>
|
|
<p><img src="img/screenshot-home.png" alt="home"></p>
|
|
<h4 id="爬虫列表">爬虫列表</h4>
|
|
<p><img src="img/screenshot-spiders.png" alt="spider-list"></p>
|
|
<h4 id="爬虫详情---概览">爬虫详情 - 概览</h4>
|
|
<p><img src="img/screenshot-spider-detail-overview.png" alt="spider-detail-overview"></p>
|
|
<h4 id="任务详情---抓取结果">任务详情 - 抓取结果</h4>
|
|
<p><img src="img/screenshot-task-detail-results.png" alt="task-detail-results"></p>
|
|
<h2 id="架构">架构</h2>
|
|
<p>Crawlab的架构跟Celery非常相似,但是加入了包括前端、爬虫、Flower在内的额外模块,以支持爬虫管理的功能。</p>
|
|
<p><img src="img/crawlab-architecture.png" alt="crawlab-architecture"></p>
|
|
<h3 id="节点">节点</h3>
|
|
<p>节点其实就是Celery中的Worker。一个节点运行时会连接到一个任务队列(例如Redis)来接收和运行任务。所有爬虫需要在运行时被部署到节点上,用户在部署前需要定义节点的IP地址和端口。</p>
|
|
<h3 id="爬虫">爬虫</h3>
|
|
<h5 id="自动发现">自动发现</h5>
|
|
<p>在<code>config.py</code>文件中,修改变量<code>PROJECT_SOURCE_FILE_FOLDER</code>作为爬虫项目所在的目录。Crawlab后台程序会自动发现这些爬虫项目并储存到数据库中。是不是很方便?</p>
|
|
<h5 id="部署爬虫">部署爬虫</h5>
|
|
<p>所有爬虫需要在抓取前被部署到相应的节点中。在"爬虫详情"页面点击"Deploy"按钮,爬虫将被部署到所有有效的节点中。</p>
|
|
<h5 id="运行爬虫">运行爬虫</h5>
|
|
<p>部署爬虫之后,你可以在"爬虫详情"页面点击"Run"按钮来启动爬虫。一个爬虫任务将被触发,你可以在任务列表页面中看到这个任务。</p>
|
|
<h3 id="任务">任务</h3>
|
|
<p>任务被触发并被节点执行。用户可以在任务详情页面中看到任务的状态、日志和抓取结果。</p>
|
|
<h3 id="后台应用">后台应用</h3>
|
|
<p>这是一个Flask应用,提供了必要的API来支持常规操作,例如CRUD、爬虫部署以及任务运行。每一个节点需要启动Flask应用来支持爬虫部署。运行<code>python manage.py app</code>或<code>python ./bin/run_app.py</code>来启动应用。</p>
|
|
<h3 id="中间者">中间者</h3>
|
|
<p>中间者跟Celery中定义的一样,作为运行异步任务的队列。</p>
|
|
<h3 id="前端">前端</h3>
|
|
<p>前端其实就是一个基于<a href="https://github.com/PanJiaChen/vue-element-admin" target="_blank">Vue-Element-Admin</a>的单页应用。其中重用了很多Element-UI的控件来支持相应的展示。</p>
|
|
<h2 id="与其他框架的集成">与其他框架的集成</h2>
|
|
<p>任务是利用python的<code>subprocess</code>模块中的<code>Popen</code>来实现的。任务ID将以环境变量<code>CRAWLAB_TASK_ID</code>的形式存在于爬虫任务运行的进程中,并以此来关联抓取数据。</p>
|
|
<p>在你的爬虫程序中,你需要将<code>CRAWLAB_TASK_ID</code>的值以<code>task_id</code>作为键(key)存入数据库中。这样Crawlab就知道如何将爬虫任务与抓取数据关联起来了。当前,Crawlab只支持MongoDB。</p>
|
|
<h3 id="scrapy">Scrapy</h3>
|
|
<p>以下是Crawlab跟Scrapy集成的例子,利用了Crawlab传过来的task_id和collection_name。</p>
|
|
<pre><code class="lang-python"><span class="hljs-keyword">import</span> os
|
|
<span class="hljs-keyword">from</span> pymongo <span class="hljs-keyword">import</span> MongoClient
|
|
|
|
MONGO_HOST = <span class="hljs-string">'192.168.99.100'</span>
|
|
MONGO_PORT = <span class="hljs-number">27017</span>
|
|
MONGO_DB = <span class="hljs-string">'crawlab_test'</span>
|
|
|
|
<span class="hljs-comment"># scrapy example in the pipeline</span>
|
|
<span class="hljs-class"><span class="hljs-keyword">class</span> <span class="hljs-title">JuejinPipeline</span><span class="hljs-params">(object)</span>:</span>
|
|
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
|
|
db = mongo[MONGO_DB]
|
|
col_name = os.environ.get(<span class="hljs-string">'CRAWLAB_COLLECTION'</span>)
|
|
<span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> col_name:
|
|
col_name = <span class="hljs-string">'test'</span>
|
|
col = db[col_name]
|
|
|
|
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">process_item</span><span class="hljs-params">(self, item, spider)</span>:</span>
|
|
item[<span class="hljs-string">'task_id'</span>] = os.environ.get(<span class="hljs-string">'CRAWLAB_TASK_ID'</span>)
|
|
self.col.save(item)
|
|
<span class="hljs-keyword">return</span> item
|
|
</code></pre>
|
|
<h2 id="与其他框架比较">与其他框架比较</h2>
|
|
<p>现在已经有一些爬虫管理框架了,因此为啥还要用Crawlab?</p>
|
|
<p>因为很多现有的平台都依赖于Scrapyd,限制了爬虫的编程语言以及框架,爬虫工程师只能用scrapy和python。当然,scrapy是非常优秀的爬虫框架,但是它不能做一切事情。</p>
|
|
<p>Crawlab使用起来很方便,也很通用,可以适用于几乎任何主流语言和框架。它还有一个精美的前端界面,让用户可以方便的管理和运行爬虫。</p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th style="text-align:center">框架</th>
|
|
<th style="text-align:center">类型</th>
|
|
<th style="text-align:center">分布式</th>
|
|
<th style="text-align:center">前端</th>
|
|
<th style="text-align:center">依赖于Scrapyd</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td style="text-align:center"><a href="https://github.com/tikazyq/crawlab" target="_blank">Crawlab</a></td>
|
|
<td style="text-align:center">管理平台</td>
|
|
<td style="text-align:center">Y</td>
|
|
<td style="text-align:center">Y</td>
|
|
<td style="text-align:center">N</td>
|
|
</tr>
|
|
<tr>
|
|
<td style="text-align:center"><a href="https://github.com/Gerapy/Gerapy" target="_blank">Gerapy</a></td>
|
|
<td style="text-align:center">管理平台</td>
|
|
<td style="text-align:center">Y</td>
|
|
<td style="text-align:center">Y</td>
|
|
<td style="text-align:center">Y</td>
|
|
</tr>
|
|
<tr>
|
|
<td style="text-align:center"><a href="https://github.com/DormyMo/SpiderKeeper" target="_blank">SpiderKeeper</a></td>
|
|
<td style="text-align:center">管理平台</td>
|
|
<td style="text-align:center">Y</td>
|
|
<td style="text-align:center">Y</td>
|
|
<td style="text-align:center">Y</td>
|
|
</tr>
|
|
<tr>
|
|
<td style="text-align:center"><a href="https://github.com/my8100/scrapydweb" target="_blank">ScrapydWeb</a></td>
|
|
<td style="text-align:center">管理平台</td>
|
|
<td style="text-align:center">Y</td>
|
|
<td style="text-align:center">Y</td>
|
|
<td style="text-align:center">Y</td>
|
|
</tr>
|
|
<tr>
|
|
<td style="text-align:center"><a href="https://github.com/scrapy/scrapyd" target="_blank">Scrapyd</a></td>
|
|
<td style="text-align:center">网络服务</td>
|
|
<td style="text-align:center">Y</td>
|
|
<td style="text-align:center">N</td>
|
|
<td style="text-align:center">N/A</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<h2 id="todos">TODOs</h2>
|
|
<h5 id="后端">后端</h5>
|
|
<ul>
|
|
<li>[ ] 文件管理</li>
|
|
<li>[ ] MySQL数据库支持</li>
|
|
<li>[ ] 重跑任务</li>
|
|
<li>[ ] 节点监控</li>
|
|
<li>[ ] 更多爬虫例子</li>
|
|
</ul>
|
|
<h5 id="前端-1">前端</h5>
|
|
<ul>
|
|
<li>[ ] 任务数据统计</li>
|
|
<li>[ ] 表格过滤</li>
|
|
<li>[x] 多语言支持 (中文)</li>
|
|
<li>[ ] 登录和用户管理</li>
|
|
<li>[ ] 全局搜索</li>
|
|
</ul>
|
|
|
|
|
|
</section>
|
|
|
|
</div>
|
|
<div class="search-results">
|
|
<div class="has-results">
|
|
|
|
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
|
|
<ul class="search-results-list"></ul>
|
|
|
|
</div>
|
|
<div class="no-results">
|
|
|
|
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
|
|
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<a href="QuickStart/" class="navigation navigation-next navigation-unique" aria-label="Next page: 快速开始">
|
|
<i class="fa fa-angle-right"></i>
|
|
</a>
|
|
|
|
|
|
|
|
</div>
|
|
|
|
<script>
|
|
var gitbook = gitbook || [];
|
|
gitbook.push(function() {
|
|
gitbook.page.hasChanged({"page":{"title":"简介","level":"1.1","depth":1,"next":{"title":"快速开始","level":"1.2","depth":1,"path":"QuickStart/README.md","ref":"QuickStart/README.md","articles":[{"title":"安装","level":"1.2.1","depth":2,"path":"QuickStart/Installation.md","ref":"QuickStart/Installation.md","articles":[]},{"title":"运行","level":"1.2.2","depth":2,"path":"QuickStart/Run.md","ref":"QuickStart/Run.md","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":[],"pluginsConfig":{"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"README.md","mtime":"2019-03-28T11:44:15.000Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2019-06-04T15:47:26.876Z"},"basePath":".","book":{"language":""}});
|
|
});
|
|
</script>
|
|
</div>
|
|
|
|
|
|
<script src="gitbook/gitbook.js"></script>
|
|
<script src="gitbook/theme.js"></script>
|
|
|
|
|
|
<script src="gitbook/gitbook-plugin-search/search-engine.js"></script>
|
|
|
|
|
|
|
|
<script src="gitbook/gitbook-plugin-search/search.js"></script>
|
|
|
|
|
|
|
|
<script src="gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
|
|
|
|
|
|
|
|
<script src="gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
|
|
|
|
|
|
|
|
<script src="gitbook/gitbook-plugin-sharing/buttons.js"></script>
|
|
|
|
|
|
|
|
<script src="gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
|
|
|
|
|
|
|
|
</body>
|
|
</html>
|
|
|