Mirror of https://github.com/crawlab-team/crawlab.git (synced 2026-01-21 17:21:09 +01:00)
@@ -9,6 +9,7 @@ from flask import Flask
 from flask_cors import CORS
 from flask_restful import Api
 # from flask_restplus import Api
+from routes.sites import SiteApi
 from utils.log import other
 from constants.node import NodeStatus
 from db.manager import db_manager

@@ -68,6 +69,9 @@ api.add_resource(StatsApi,
 api.add_resource(ScheduleApi,
                  '/api/schedules',
                  '/api/schedules/<string:id>')
+api.add_resource(SiteApi,
+                 '/api/sites',
+                 '/api/sites/<string:id>')


 def monitor_nodes_status(celery_app):
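
For reference, a minimal sketch of how a client could exercise the newly registered /api/sites endpoints. The base URL and the availability of the shared filter/page_num/page_size parser arguments are assumptions, not part of this diff.

import json

import requests

BASE_URL = 'http://localhost:8000'  # assumed backend address; adjust to your deployment

# List sites, filtered by keyword and category, first page of 10.
resp = requests.get(BASE_URL + '/api/sites', params={
    'keyword': 'chinaz',
    'filter': json.dumps({'category': '综合'}),
    'page_num': 1,
    'page_size': 10,
})
data = resp.json()
print(data['total_count'], [s['domain'] for s in data['items']])

# Fetch a single site by its id.
site_id = data['items'][0]['_id']
print(requests.get(BASE_URL + '/api/sites/%s' % site_id).json())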

@@ -13,7 +13,7 @@ class DbManager(object):
     """

     def __init__(self):
-        self.mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
+        self.mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT, connect=False)
         self.db = self.mongo[MONGO_DB]

     def save(self, col_name: str, item: dict, **kwargs) -> None:
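
connect=False defers the actual connection until the first operation. That matters because this client is created at import time and the process may fork afterwards (e.g. Celery prefork workers); a MongoClient that has already connected is not fork-safe. A small sketch of the pattern outside crawlab's code, with assumed env var names:

import os

from pymongo import MongoClient

# No sockets or monitor threads are started here, so constructing the client
# at import time is safe even if the process forks later.
client = MongoClient(host=os.environ.get('MONGO_HOST', 'localhost'),
                     port=int(os.environ.get('MONGO_PORT', '27017')),
                     connect=False)


def save_item(item):
    # The first real operation in the (child) process triggers the connection.
    db = client[os.environ.get('MONGO_DB', 'crawlab_test')]
    db['items'].insert_one(item)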

crawlab/routes/sites.py (new file, 72 lines)
@@ -0,0 +1,72 @@
import json

from bson import ObjectId
from pymongo import ASCENDING

from db.manager import db_manager
from routes.base import BaseApi
from utils import jsonify


class SiteApi(BaseApi):
    col_name = 'sites'

    arguments = (
        ('keyword', str),
        ('category', str),
    )

    def get(self, id: str = None, action: str = None):
        # action by id
        if action is not None:
            if not hasattr(self, action):
                return {
                    'status': 'ok',
                    'code': 400,
                    'error': 'action "%s" invalid' % action
                }, 400
            return getattr(self, action)(id)

        elif id is not None:
            site = db_manager.get(col_name=self.col_name, id=id)
            return jsonify(site)

        # list sites
        args = self.parser.parse_args()
        page_size = args.get('page_size') or 10
        page_num = args.get('page_num') or 1
        filter_str = args.get('filter')
        keyword = args.get('keyword')
        filter_ = {}
        if filter_str is not None:
            filter_ = json.loads(filter_str)
        if keyword is not None:
            filter_['$or'] = [
                {'description': {'$regex': keyword}},
                {'name': {'$regex': keyword}},
                {'domain': {'$regex': keyword}}
            ]

        items = db_manager.list(
            col_name=self.col_name,
            cond=filter_,
            limit=page_size,
            skip=page_size * (page_num - 1),
            sort_key='rank',
            sort_direction=ASCENDING
        )

        sites = []
        for site in items:
            # get spider count
            site['spider_count'] = db_manager.count('spiders', {'site': site['_id']})

            sites.append(site)

        return {
            'status': 'ok',
            'total_count': db_manager.count(self.col_name, filter_),
            'page_num': page_num,
            'page_size': page_size,
            'items': jsonify(sites)
        }
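
db_manager.list and db_manager.count are not shown in this diff; the sketch below is a rough, assumed equivalent of such helpers on top of pymongo, matching the call signature used above (it is not crawlab's actual db/manager.py).

from pymongo import ASCENDING, MongoClient


class DbManagerSketch(object):
    """Hypothetical minimal version of the list/count helpers used by SiteApi."""

    def __init__(self, host='localhost', port=27017, db='crawlab_test'):
        self.mongo = MongoClient(host=host, port=port, connect=False)
        self.db = self.mongo[db]

    def list(self, col_name, cond=None, skip=0, limit=10,
             sort_key=None, sort_direction=ASCENDING):
        cursor = self.db[col_name].find(cond or {}).skip(skip).limit(limit)
        if sort_key is not None:
            cursor = cursor.sort(sort_key, sort_direction)
        return list(cursor)

    def count(self, col_name, cond=None):
        return self.db[col_name].count_documents(cond or {})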

@@ -61,6 +61,9 @@ class SpiderApi(BaseApi):

         # spider schedule cron enabled
         ('envs', str),
+
+        # spider site
+        ('site', str),
     )

     def get(self, id=None, action=None):

@@ -125,6 +128,12 @@ class SpiderApi(BaseApi):
             if last_task is not None:
                 spider['task_ts'] = last_task['create_ts']
+
+            # get site
+            if spider.get('site') is not None:
+                site = db_manager.get('sites', spider['site'])
+                if site is not None:
+                    spider['site_name'] = site['name']

             # file stats
             stats = get_file_suffix_stats(dir_path)
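
With this enrichment each spider item in the list response can carry a denormalised site_name next to the stored site id. Purely illustrative shape (field values invented for the example):

# One element of the spider list after this change, values illustrative only.
spider = {
    '_id': '5ca1ab1ec0ffee0123456789',  # spider ObjectId as a string
    'name': 'chinaz',
    'type': 'scrapy',
    'lang': 'python',
    'site': 'top.chinaz.com',           # _id of the linked document in 'sites'
    'site_name': '站长之家',             # copied from the site document's 'name'
    'task_ts': '2019-03-27 10:00:00',   # create_ts of the spider's latest task
}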

@@ -36,7 +36,6 @@ class TaskApi(BaseApi):
                     'code': 400,
                     'error': 'action "%s" invalid' % action
                 }, 400
-            # other.info(f"到这了{action},{id}")
             return getattr(self, action)(id)

         elif id is not None:

@@ -78,9 +77,6 @@ class TaskApi(BaseApi):
                                 sort_key='create_ts')
         items = []
         for task in tasks:
-            # celery tasks
-            # _task = db_manager.get('tasks_celery', id=task['_id'])
-
             # get spider
             _spider = db_manager.get(col_name='spiders', id=str(task['spider_id']))

@@ -9,7 +9,7 @@ from db.manager import db_manager


 class Scheduler(object):
-    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
+    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT, connect=False)
     task_col = 'apscheduler_jobs'

     # scheduler jobstore

@@ -1,6 +1,6 @@
 {
   "name": "crawlab",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "private": true,
   "scripts": {
     "serve": "cross-env NODE_ENV=development vue-cli-service serve --ip=0.0.0.0",

@@ -23,6 +23,14 @@
         <el-input v-model="spiderForm.col" :placeholder="$t('Results Collection')"
                   :disabled="isView"></el-input>
       </el-form-item>
+      <el-form-item :label="$t('Site')">
+        <el-autocomplete v-model="spiderForm.site"
+                         :placeholder="$t('Site')"
+                         :fetch-suggestions="fetchSiteSuggestions"
+                         clearable
+                         @select="onSiteSelect">
+        </el-autocomplete>
+      </el-form-item>
       <el-form-item :label="$t('Spider Type')">
         <el-select v-model="spiderForm.type" :placeholder="$t('Spider Type')" :disabled="isView" clearable>
           <el-option value="scrapy" label="Scrapy"></el-option>

@@ -38,26 +46,6 @@
           <el-option value="go" label="Go"></el-option>
         </el-select>
       </el-form-item>
-      <!--<el-form-item :label="$t('Schedule Enabled')">-->
-        <!--<el-switch v-model="spiderForm.cron_enabled" :disabled="isView">-->
-        <!--</el-switch>-->
-      <!--</el-form-item>-->
-      <!--<el-form-item :label="$t('Schedule Cron')" v-if="spiderForm.cron_enabled"-->
-                    <!--prop="cron"-->
-                    <!--:rules="cronRules"-->
-                    <!--:inline-message="true">-->
-        <!--<template slot="label">-->
-          <!--<el-tooltip :content="$t('Cron Format: [second] [minute] [hour] [day of month] [month] [day of week]')"-->
-                      <!--placement="top">-->
-            <!--<span>-->
-              <!--{{$t('Schedule Cron')}}-->
-              <!--<i class="fa fa-exclamation-circle"></i>-->
-            <!--</span>-->
-          <!--</el-tooltip>-->
-        <!--</template>-->
-        <!--<el-input v-model="spiderForm.cron" :placeholder="$t('Schedule Cron')"-->
-                  <!--:disabled="isView"></el-input>-->
-      <!--</el-form-item>-->
     </el-form>
   </el-row>
   <el-row class="button-container" v-if="!isView">

@@ -172,6 +160,22 @@ export default {
          })
        }
      })
    },
+    fetchSiteSuggestions (keyword, callback) {
+      this.$request.get('/sites', {
+        keyword: keyword,
+        page_num: 1,
+        page_size: 100
+      }).then(response => {
+        const data = response.data.items.map(d => {
+          d.value = `${d.name} | ${d.domain}`
+          return d
+        })
+        callback(data)
+      })
+    },
+    onSiteSelect (item) {
+      this.spiderForm.site = item._id
+    }
   }
 }

@@ -187,4 +191,8 @@ export default {
   width: 100%;
   text-align: right;
 }
+
+.el-autocomplete {
+  width: 100%;
+}
 </style>

@@ -10,6 +10,7 @@ export default {
   'Task Detail': '任务详情',
   'Schedules': '定时任务',
   'Deploys': '部署',
+  'Sites': '网站',

   // 标签
   Overview: '概览',

@@ -70,7 +71,7 @@ export default {

   // 节点状态
   Online: '在线',
-  Offline: '在线',
+  Offline: '离线',
   Unavailable: '未知',

   // 爬虫

@@ -130,6 +131,15 @@ export default {
   'Parameters': '参数',
   'Add Schedule': '添加定时任务',

+  // 网站
+  'Site': '网站',
+  'Rank': '排名',
+  'Domain': '域名',
+  'Category': '类别',
+  'Select': '请选择',
+  'Select Category': '请选择类别',
+  'Spider Count': '爬虫数',
+
   // 文件
   'Choose Folder': '选择文件',

@@ -183,6 +183,26 @@ export const constantRouterMap = [
       }
     ]
   },
+  {
+    name: 'Site',
+    path: '/sites',
+    component: Layout,
+    meta: {
+      title: 'Site',
+      icon: 'fa fa-sitemap'
+    },
+    children: [
+      {
+        path: '',
+        name: 'SiteList',
+        component: () => import('../views/site/SiteList'),
+        meta: {
+          title: 'Sites',
+          icon: 'fa fa-sitemap'
+        }
+      }
+    ]
+  },

   { path: '*', redirect: '/404', hidden: true }
 ]

@@ -11,6 +11,7 @@ import task from './modules/task'
 import file from './modules/file'
 import schedule from './modules/schedule'
 import lang from './modules/lang'
+import site from './modules/site'
 import getters from './getters'

 Vue.use(Vuex)

@@ -27,7 +28,8 @@ const store = new Vuex.Store({
     task,
     file,
     schedule,
-    lang
+    lang,
+    site
   },
   getters
 })

frontend/src/store/modules/site.js (new file, 67 lines)
@@ -0,0 +1,67 @@
import request from '../../api/request'

const state = {
  siteList: [],

  // filter
  filter: {
    category: undefined
  },
  keyword: '',

  // pagination
  pageNum: 1,
  pageSize: 10,
  totalCount: 0
}

const getters = {}

const mutations = {
  SET_KEYWORD (state, value) {
    state.keyword = value
  },
  SET_SITE_LIST (state, value) {
    state.siteList = value
  },
  SET_PAGE_NUM (state, value) {
    state.pageNum = value
  },
  SET_PAGE_SIZE (state, value) {
    state.pageSize = value
  },
  SET_TOTAL_COUNT (state, value) {
    state.totalCount = value
  }
}

const actions = {
  editSite ({ state, dispatch }, payload) {
    const { id, category } = payload
    return request.post(`/sites/${id}`, {
      category
    })
  },
  getSiteList ({ state, commit }) {
    return request.get('/sites', {
      page_num: state.pageNum,
      page_size: state.pageSize,
      keyword: state.keyword || undefined,
      filter: {
        category: state.filter.category || undefined
      }
    })
      .then(response => {
        commit('SET_SITE_LIST', response.data.items)
        commit('SET_TOTAL_COUNT', response.data.total_count)
      })
  }
}

export default {
  namespaced: true,
  state,
  getters,
  mutations,
  actions
}

@@ -55,7 +55,7 @@ const mutations = {
   },
   SET_NODE_STATS (state, value) {
     state.nodeStats = value
-  },
+  }
 }

 const actions = {

@@ -74,7 +74,8 @@ const actions = {
       lang: state.spiderForm.lang,
       col: state.spiderForm.col,
       cron: state.spiderForm.cron,
-      cron_enabled: state.spiderForm.cron_enabled ? 1 : 0
+      cron_enabled: state.spiderForm.cron_enabled ? 1 : 0,
+      site: state.spiderForm.site
     })
       .then(() => {
         dispatch('getSpiderList')

@@ -89,7 +90,8 @@ const actions = {
       lang: state.spiderForm.lang,
       col: state.spiderForm.col,
       cron: state.spiderForm.cron,
-      cron_enabled: state.spiderForm.cron_enabled ? 1 : 0
+      cron_enabled: state.spiderForm.cron_enabled ? 1 : 0,
+      site: state.spiderForm.site
     })
       .then(() => {
         dispatch('getSpiderList')

frontend/src/views/site/SiteList.vue (new file, 205 lines)
@@ -0,0 +1,205 @@
<template>
  <div class="app-container">
    <!--filter-->
    <div class="filter">
      <el-input prefix-icon="el-icon-search"
                :placeholder="$t('Search')"
                class="filter-search"
                v-model="keyword">
      </el-input>
      <el-select v-model="filter.category" class="filter-category" :placeholder="$t('Select Category')" clearable>
        <el-option v-for="op in categoryList" :key="op" :value="op" :label="op"></el-option>
      </el-select>
      <el-button type="success"
                 icon="el-icon-refresh"
                 class="btn refresh"
                 @click="onSearch">
        {{$t('Search')}}
      </el-button>
    </div>

    <!--table list-->
    <el-table :data="siteList"
              class="table"
              :header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
              border>
      <template v-for="col in columns">
        <el-table-column v-if="col.name === 'category'"
                         :key="col.name"
                         :label="$t(col.label)"
                         :width="col.width"
                         :align="col.align">
          <template slot-scope="scope">
            <el-select v-model="scope.row[col.name]"
                       :placeholder="$t('Select')"
                       @change="onRowChange(scope.row)">
              <el-option v-for="op in categoryList"
                         :key="op"
                         :value="op"
                         :label="op">
              </el-option>
            </el-select>
          </template>
        </el-table-column>
        <el-table-column v-else-if="col.name === 'domain'"
                         :key="col.name"
                         :label="$t(col.label)"
                         :width="col.width"
                         :align="col.align">
          <template slot-scope="scope">
            <a class="domain" :href="'http://' + scope.row[col.name]" target="_blank">
              {{scope.row[col.name]}}
            </a>
          </template>
        </el-table-column>
        <el-table-column v-else
                         :key="col.name"
                         :property="col.name"
                         :label="$t(col.label)"
                         :sortable="col.sortable"
                         :align="col.align || 'center'"
                         :width="col.width">
        </el-table-column>
      </template>
      <el-table-column :label="$t('Action')" align="left" width="120">
        <template slot-scope="scope">
          <el-tooltip :content="$t('View')" placement="top">
            <el-button type="primary" icon="el-icon-search" size="mini" @click="onView(scope.row)"></el-button>
          </el-tooltip>
          <!--<el-tooltip :content="$t('Remove')" placement="top">-->
            <!--<el-button type="danger" icon="el-icon-delete" size="mini" @click="onRemove(scope.row)"></el-button>-->
          <!--</el-tooltip>-->
        </template>
      </el-table-column>
    </el-table>
    <div class="pagination">
      <el-pagination
        @current-change="onPageChange"
        @size-change="onPageChange"
        :current-page.sync="pageNum"
        :page-sizes="[10, 20, 50, 100]"
        :page-size.sync="pageSize"
        layout="sizes, prev, pager, next"
        :total="totalCount">
      </el-pagination>
    </div>
  </div>
</template>

<script>
import {
  mapState
} from 'vuex'

export default {
  name: 'SiteList',
  data () {
    return {
      categoryList: [
        '新闻',
        '搜索引擎',
        '综合',
        '金融',
        '购物',
        '社交',
        '视频',
        '音乐',
        '资讯',
        '政企官网',
        '其他'
      ],
      columns: [
        { name: 'rank', label: 'Rank', align: 'center', width: '80' },
        { name: 'name', label: 'Name', align: 'left', width: '120' },
        { name: 'domain', label: 'Domain', align: 'left', width: '150' },
        { name: 'description', label: 'Description', align: 'left' },
        { name: 'category', label: 'Category', align: 'center', width: '180' },
        { name: 'spider_count', label: 'Spider Count', align: 'center', width: '60' }
      ]
    }
  },
  computed: {
    ...mapState('site', [
      'filter',
      'siteList',
      'totalCount'
    ]),
    keyword: {
      get () {
        return this.$store.state.site.keyword
      },
      set (value) {
        this.$store.commit('site/SET_KEYWORD', value)
      }
    },
    pageNum: {
      get () {
        return this.$store.state.site.pageNum
      },
      set (value) {
        this.$store.commit('site/SET_PAGE_NUM', value)
      }
    },
    pageSize: {
      get () {
        return this.$store.state.site.pageSize
      },
      set (value) {
        this.$store.commit('site/SET_PAGE_SIZE', value)
      }
    }
  },
  methods: {
    onSearch () {
      this.$store.dispatch('site/getSiteList')
    },
    onPageChange () {
      this.$store.dispatch('site/getSiteList')
    },
    onRowChange (row) {
      this.$store.dispatch('site/editSite', {
        id: row.domain,
        category: row.category
      })
    }
  },
  created () {
    this.$store.dispatch('site/getSiteList')
  }
}
</script>

<style scoped>
.filter {
  display: flex;
}

.filter .filter-search {
  width: 180px;
}

.filter .filter-category {
  width: 180px;
  margin-left: 20px;
}

.filter .btn {
  margin-left: 20px;
}

.table {
  margin-top: 20px;
}

.table >>> .el-select .el-input__inner {
  height: 32px;
}

.table >>> .el-select .el-select__caret {
  line-height: 32px;
}

.table >>> .domain {
  text-decoration: underline;
}
</style>

@@ -160,8 +160,9 @@ export default {
       // tableData,
       columns: [
         { name: 'name', label: 'Name', width: 'auto' },
-        { name: 'type', label: 'Spider Type', width: '160', sortable: true },
-        { name: 'lang', label: 'Language', width: '160', sortable: true },
+        { name: 'site_name', label: 'Site', width: '120' },
+        { name: 'type', label: 'Spider Type', width: '120', sortable: true },
+        { name: 'lang', label: 'Language', width: '120', sortable: true },
         { name: 'task_ts', label: 'Last Run', width: '160' },
         { name: 'last_7d_tasks', label: 'Last 7-Day Tasks', width: '80' },
         { name: 'last_5_errors', label: 'Last 5-Run Errors', width: '80' }

spiders/chinaz/chinaz/__init__.py (new file, empty)

spiders/chinaz/chinaz/items.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ChinazItem(scrapy.Item):
    # define the fields for your item here like:
    _id = scrapy.Field()
    task_id = scrapy.Field()
    name = scrapy.Field()
    domain = scrapy.Field()
    description = scrapy.Field()
    rank = scrapy.Field()

spiders/chinaz/chinaz/middlewares.py (new file, 103 lines)
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ChinazSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ChinazDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

spiders/chinaz/chinaz/pipelines.py (new file, 28 lines)
@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os

from pymongo import MongoClient

MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'


class MongoPipeline(object):
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    db = mongo[MONGO_DB]
    col_name = os.environ.get('CRAWLAB_COLLECTION') or 'sites'
    col = db[col_name]

    def process_item(self, item, spider):
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        item['_id'] = item['domain']
        if self.col.find_one({'_id': item['_id']}) is None:
            self.col.save(item)
        return item
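
Collection.save() used above is deprecated in pymongo 3 and was removed in pymongo 4. If this pipeline ever runs against a newer driver, the same "insert only if the domain is not yet known" behaviour can be expressed as an upsert; a sketch, not part of the commit:

import os

from pymongo import MongoClient

MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'


class MongoUpsertPipeline(object):
    """Sketch of the same pipeline using update_one(..., upsert=True)."""
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT, connect=False)
    db = mongo[MONGO_DB]
    col = db[os.environ.get('CRAWLAB_COLLECTION') or 'sites']

    def process_item(self, item, spider):
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        item['_id'] = item['domain']
        # $setOnInsert writes the fields only when the _id is new,
        # so existing site documents are left untouched.
        self.col.update_one({'_id': item['_id']},
                            {'$setOnInsert': dict(item)},
                            upsert=True)
        return item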

spiders/chinaz/chinaz/settings.py (new file, 90 lines)
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for chinaz project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'chinaz'

SPIDER_MODULES = ['chinaz.spiders']
NEWSPIDER_MODULE = 'chinaz.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'chinaz (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'chinaz.middlewares.ChinazSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'chinaz.middlewares.ChinazDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'chinaz.pipelines.MongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

spiders/chinaz/chinaz/spiders/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

spiders/chinaz/chinaz/spiders/chinaz_spider.py (new file, 29 lines)
@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
import scrapy
from chinaz.items import ChinazItem


class ChinazSpiderSpider(scrapy.Spider):
    name = 'chinaz_spider'
    allowed_domains = ['chinaz.com']
    start_urls = ['http://top.chinaz.com/hangye/']

    def parse(self, response):
        for item in response.css('.listCentent > li'):
            name = item.css('h3.rightTxtHead > a::text').extract_first()
            domain = item.css('h3.rightTxtHead > span::text').extract_first()
            description = item.css('p.RtCInfo::text').extract_first()
            rank = item.css('.RtCRateCent > strong::text').extract_first()
            rank = int(rank)
            yield ChinazItem(
                _id=domain,
                name=name,
                domain=domain,
                description=description,
                rank=rank,
            )

        # pagination
        a_list = response.css('.ListPageWrap > a::attr("href")').extract()
        url = 'http://top.chinaz.com/hangye/' + a_list[-1]
        yield scrapy.Request(url=url)
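
To try the spider outside crawlab, it can be run from the spiders/chinaz directory (where scrapy.cfg lives) with the environment variables the pipeline reads. A minimal, assumed local run, not how crawlab itself schedules tasks:

import os

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Values MongoPipeline reads; all optional, defaulting to
# localhost / crawlab_test / 'sites' when unset.
os.environ.setdefault('MONGO_HOST', 'localhost')
os.environ.setdefault('MONGO_PORT', '27017')
os.environ.setdefault('MONGO_DB', 'crawlab_test')
os.environ.setdefault('CRAWLAB_COLLECTION', 'sites')

# Run from spiders/chinaz so scrapy.cfg and the chinaz package are found.
process = CrawlerProcess(get_project_settings())
process.crawl('chinaz_spider')
process.start()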

spiders/chinaz/scrapy.cfg (new file, 11 lines)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = chinaz.settings

[deploy]
#url = http://localhost:6800/
project = chinaz

spiders/jd/jd/__init__.py (new file, empty)

spiders/jd/jd/items.py (new file, 14 lines)
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class JdItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    price = scrapy.Field()

spiders/jd/jd/middlewares.py (new file, 103 lines)
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class JdSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class JdDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

spiders/jd/jd/pipelines.py (new file, 17 lines)
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os

from pymongo import MongoClient

MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'


class JdPipeline(object):
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    db = mongo[MONGO_DB]
    col_name = os.environ.get('CRAWLAB_COLLECTION') or 'jd_products'
    col = db[col_name]

    def process_item(self, item, spider):
        return item

spiders/jd/jd/settings.py (new file, 90 lines)
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for jd project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jd'

SPIDER_MODULES = ['jd.spiders']
NEWSPIDER_MODULE = 'jd.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jd (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'jd.middlewares.JdSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'jd.middlewares.JdDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'jd.pipelines.JdPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

spiders/jd/jd/spiders/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

spiders/jd/jd/spiders/jd_spider.py (new file, 11 lines)
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
import scrapy


class JdSpiderSpider(scrapy.Spider):
    name = 'jd_spider'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

    def parse(self, response):
        pass

spiders/jd/scrapy.cfg (new file, 11 lines)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = jd.settings

[deploy]
#url = http://localhost:6800/
project = jd