Merge pull request #25 from tikazyq/develop

Develop
Marvin Zhang
2019-05-10 21:27:10 +08:00
committed by GitHub
31 changed files with 963 additions and 34 deletions

View File

@@ -9,6 +9,7 @@ from flask import Flask
from flask_cors import CORS
from flask_restful import Api
# from flask_restplus import Api
from routes.sites import SiteApi
from utils.log import other
from constants.node import NodeStatus
from db.manager import db_manager
@@ -68,6 +69,9 @@ api.add_resource(StatsApi,
api.add_resource(ScheduleApi,
'/api/schedules',
'/api/schedules/<string:id>')
api.add_resource(SiteApi,
'/api/sites',
'/api/sites/<string:id>')
def monitor_nodes_status(celery_app):

View File

@@ -13,7 +13,7 @@ class DbManager(object):
"""
def __init__(self):
self.mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
self.mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT, connect=False)
self.db = self.mongo[MONGO_DB]
def save(self, col_name: str, item: dict, **kwargs) -> None:
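A brief note on the connect=False change above: MongoClient connects eagerly by default, and that connection is not fork-safe; deferring it until the first operation lets each forked process (Celery workers, the Flask debug reloader) open its own connection. A minimal sketch, assuming a local MongoDB and the same env-var defaults used elsewhere in this PR:

import os
from pymongo import MongoClient

MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'

# connect=False defers the TCP connection until the first actual operation,
# so a client created before a fork does not share sockets with the parent.
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT, connect=False)
db = mongo[MONGO_DB]
site = db.sites.find_one()  # the connection is established lazily here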

crawlab/routes/sites.py Normal file
View File

@@ -0,0 +1,72 @@
import json
from bson import ObjectId
from pymongo import ASCENDING
from db.manager import db_manager
from routes.base import BaseApi
from utils import jsonify
class SiteApi(BaseApi):
col_name = 'sites'
arguments = (
('keyword', str),
('category', str),
)
def get(self, id: str = None, action: str = None):
# action by id
if action is not None:
if not hasattr(self, action):
return {
'status': 'ok',
'code': 400,
'error': 'action "%s" invalid' % action
}, 400
return getattr(self, action)(id)
elif id is not None:
site = db_manager.get(col_name=self.col_name, id=id)
return jsonify(site)
# list sites
args = self.parser.parse_args()
page_size = args.get('page_size') or 10
page_num = args.get('page_num') or 1
filter_str = args.get('filter')
keyword = args.get('keyword')
filter_ = {}
if filter_str is not None:
filter_ = json.loads(filter_str)
if keyword is not None:
filter_['$or'] = [
{'description': {'$regex': keyword}},
{'name': {'$regex': keyword}},
{'domain': {'$regex': keyword}}
]
items = db_manager.list(
col_name=self.col_name,
cond=filter_,
limit=page_size,
skip=page_size * (page_num - 1),
sort_key='rank',
sort_direction=ASCENDING
)
sites = []
for site in items:
# get spider count
site['spider_count'] = db_manager.count('spiders', {'site': site['_id']})
sites.append(site)
return {
'status': 'ok',
'total_count': db_manager.count(self.col_name, filter_),
'page_num': page_num,
'page_size': page_size,
'items': jsonify(sites)
}
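For reference, a minimal sketch of calling the new endpoint from a client; the base URL and the use of the requests library are assumptions, while the query parameters (keyword, page_num, page_size) and the response fields (status, total_count, items) follow the handler above:

import requests

BASE_URL = 'http://localhost:8000/api'  # assumed dev address of the Flask API

resp = requests.get(BASE_URL + '/sites', params={
    'keyword': 'news',   # matched against name, domain and description via $regex
    'page_num': 1,
    'page_size': 10,
})
data = resp.json()
print(data['status'], data['total_count'])
for site in data['items']:
    # spider_count is computed per site from the spiders collection
    print(site.get('rank'), site.get('name'), site.get('domain'), site.get('spider_count'))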

View File

@@ -61,6 +61,9 @@ class SpiderApi(BaseApi):
# spider schedule cron enabled
('envs', str),
# spider site
('site', str),
)
def get(self, id=None, action=None):
@@ -125,6 +128,12 @@ class SpiderApi(BaseApi):
if last_task is not None:
spider['task_ts'] = last_task['create_ts']
# get site
if spider.get('site') is not None:
site = db_manager.get('sites', spider['site'])
if site is not None:
spider['site_name'] = site['name']
# file stats
stats = get_file_suffix_stats(dir_path)
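The new site field stores a reference to a document in the sites collection, which SpiderApi resolves into site_name as shown above. A hedged pymongo sketch of writing and resolving that reference (the client setup, spider name, and domain are placeholder assumptions):

from pymongo import MongoClient

db = MongoClient('localhost', 27017, connect=False)['crawlab_test']

site = db.sites.find_one({'domain': 'example.com'})    # assumed existing site document
db.spiders.update_one({'name': 'example_spider'},      # assumed spider name
                      {'$set': {'site': site['_id']}})

# resolving the reference, mirroring the lookup in SpiderApi.get:
spider = db.spiders.find_one({'name': 'example_spider'})
if spider and spider.get('site') is not None:
    linked = db.sites.find_one({'_id': spider['site']})
    if linked is not None:
        print(linked['name'])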

View File

@@ -36,7 +36,6 @@ class TaskApi(BaseApi):
'code': 400,
'error': 'action "%s" invalid' % action
}, 400
# other.info(f"到这了{action},{id}")
return getattr(self, action)(id)
elif id is not None:
@@ -78,9 +77,6 @@ class TaskApi(BaseApi):
sort_key='create_ts')
items = []
for task in tasks:
# celery tasks
# _task = db_manager.get('tasks_celery', id=task['_id'])
# get spider
_spider = db_manager.get(col_name='spiders', id=str(task['spider_id']))

View File

@@ -9,7 +9,7 @@ from db.manager import db_manager
class Scheduler(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT, connect=False)
task_col = 'apscheduler_jobs'
# scheduler jobstore

View File

@@ -1,6 +1,6 @@
{
"name": "crawlab",
"version": "0.1.0",
"version": "0.2.0",
"private": true,
"scripts": {
"serve": "cross-env NODE_ENV=development vue-cli-service serve --ip=0.0.0.0",

View File

@@ -23,6 +23,14 @@
<el-input v-model="spiderForm.col" :placeholder="$t('Results Collection')"
:disabled="isView"></el-input>
</el-form-item>
<el-form-item :label="$t('Site')">
<el-autocomplete v-model="spiderForm.site"
:placeholder="$t('Site')"
:fetch-suggestions="fetchSiteSuggestions"
clearable
@select="onSiteSelect">
</el-autocomplete>
</el-form-item>
<el-form-item :label="$t('Spider Type')">
<el-select v-model="spiderForm.type" :placeholder="$t('Spider Type')" :disabled="isView" clearable>
<el-option value="scrapy" label="Scrapy"></el-option>
@@ -38,26 +46,6 @@
<el-option value="go" label="Go"></el-option>
</el-select>
</el-form-item>
<!--<el-form-item :label="$t('Schedule Enabled')">-->
<!--<el-switch v-model="spiderForm.cron_enabled" :disabled="isView">-->
<!--</el-switch>-->
<!--</el-form-item>-->
<!--<el-form-item :label="$t('Schedule Cron')" v-if="spiderForm.cron_enabled"-->
<!--prop="cron"-->
<!--:rules="cronRules"-->
<!--:inline-message="true">-->
<!--<template slot="label">-->
<!--<el-tooltip :content="$t('Cron Format: [second] [minute] [hour] [day of month] [month] [day of week]')"-->
<!--placement="top">-->
<!--<span>-->
<!--{{$t('Schedule Cron')}}-->
<!--<i class="fa fa-exclamation-circle"></i>-->
<!--</span>-->
<!--</el-tooltip>-->
<!--</template>-->
<!--<el-input v-model="spiderForm.cron" :placeholder="$t('Schedule Cron')"-->
<!--:disabled="isView"></el-input>-->
<!--</el-form-item>-->
</el-form>
</el-row>
<el-row class="button-container" v-if="!isView">
@@ -172,6 +160,22 @@ export default {
})
}
})
},
fetchSiteSuggestions (keyword, callback) {
this.$request.get('/sites', {
keyword: keyword,
page_num: 1,
page_size: 100
}).then(response => {
const data = response.data.items.map(d => {
d.value = `${d.name} | ${d.domain}`
return d
})
callback(data)
})
},
onSiteSelect (item) {
this.spiderForm.site = item._id
}
}
}
@@ -187,4 +191,8 @@ export default {
width: 100%;
text-align: right;
}
.el-autocomplete {
width: 100%;
}
</style>

View File

@@ -10,6 +10,7 @@ export default {
'Task Detail': '任务详情',
'Schedules': '定时任务',
'Deploys': '部署',
'Sites': '网站',
// 标签
Overview: '概览',
@@ -70,7 +71,7 @@ export default {
// 节点状态
Online: '在线',
Offline: '离线',
Unavailable: '未知',
// 爬虫
@@ -130,6 +131,15 @@ export default {
'Parameters': '参数',
'Add Schedule': '添加定时任务',
// 网站
'Site': '网站',
'Rank': '排名',
'Domain': '域名',
'Category': '类别',
'Select': '请选择',
'Select Category': '请选择类别',
'Spider Count': '爬虫数',
// 文件
'Choose Folder': '选择文件',

View File

@@ -183,6 +183,26 @@ export const constantRouterMap = [
}
]
},
{
name: 'Site',
path: '/sites',
component: Layout,
meta: {
title: 'Site',
icon: 'fa fa-sitemap'
},
children: [
{
path: '',
name: 'SiteList',
component: () => import('../views/site/SiteList'),
meta: {
title: 'Sites',
icon: 'fa fa-sitemap'
}
}
]
},
{ path: '*', redirect: '/404', hidden: true }
]

View File

@@ -11,6 +11,7 @@ import task from './modules/task'
import file from './modules/file'
import schedule from './modules/schedule'
import lang from './modules/lang'
import site from './modules/site'
import getters from './getters'
Vue.use(Vuex)
@@ -27,7 +28,8 @@ const store = new Vuex.Store({
task,
file,
schedule,
lang
lang,
site
},
getters
})

View File

@@ -0,0 +1,67 @@
import request from '../../api/request'
const state = {
siteList: [],
// filter
filter: {
category: undefined
},
keyword: '',
// pagination
pageNum: 1,
pageSize: 10,
totalCount: 0
}
const getters = {}
const mutations = {
SET_KEYWORD (state, value) {
state.keyword = value
},
SET_SITE_LIST (state, value) {
state.siteList = value
},
SET_PAGE_NUM (state, value) {
state.pageNum = value
},
SET_PAGE_SIZE (state, value) {
state.pageSize = value
},
SET_TOTAL_COUNT (state, value) {
state.totalCount = value
}
}
const actions = {
editSite ({ state, dispatch }, payload) {
const { id, category } = payload
return request.post(`/sites/${id}`, {
category
})
},
getSiteList ({ state, commit }) {
return request.get('/sites', {
page_num: state.pageNum,
page_size: state.pageSize,
keyword: state.keyword || undefined,
filter: {
category: state.filter.category || undefined
}
})
.then(response => {
commit('SET_SITE_LIST', response.data.items)
commit('SET_TOTAL_COUNT', response.data.total_count)
})
}
}
export default {
namespaced: true,
state,
getters,
mutations,
actions
}

View File

@@ -55,7 +55,7 @@ const mutations = {
},
SET_NODE_STATS (state, value) {
state.nodeStats = value
},
}
}
const actions = {
@@ -74,7 +74,8 @@ const actions = {
lang: state.spiderForm.lang,
col: state.spiderForm.col,
cron: state.spiderForm.cron,
cron_enabled: state.spiderForm.cron_enabled ? 1 : 0
cron_enabled: state.spiderForm.cron_enabled ? 1 : 0,
site: state.spiderForm.site
})
.then(() => {
dispatch('getSpiderList')
@@ -89,7 +90,8 @@ const actions = {
lang: state.spiderForm.lang,
col: state.spiderForm.col,
cron: state.spiderForm.cron,
cron_enabled: state.spiderForm.cron_enabled ? 1 : 0
cron_enabled: state.spiderForm.cron_enabled ? 1 : 0,
site: state.spiderForm.site
})
.then(() => {
dispatch('getSpiderList')

View File

@@ -0,0 +1,205 @@
<template>
<div class="app-container">
<!--filter-->
<div class="filter">
<el-input prefix-icon="el-icon-search"
:placeholder="$t('Search')"
class="filter-search"
v-model="keyword">
</el-input>
<el-select v-model="filter.category" class="filter-category" :placeholder="$t('Select Category')" clearable>
<el-option v-for="op in categoryList" :key="op" :value="op" :label="op"></el-option>
</el-select>
<el-button type="success"
icon="el-icon-refresh"
class="btn refresh"
@click="onSearch">
{{$t('Search')}}
</el-button>
</div>
<!--table list-->
<el-table :data="siteList"
class="table"
:header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
border>
<template v-for="col in columns">
<el-table-column v-if="col.name === 'category'"
:key="col.name"
:label="$t(col.label)"
:width="col.width"
:align="col.align">
<template slot-scope="scope">
<el-select v-model="scope.row[col.name]"
:placeholder="$t('Select')"
@change="onRowChange(scope.row)">
<el-option v-for="op in categoryList"
:key="op"
:value="op"
:label="op">
</el-option>
</el-select>
</template>
</el-table-column>
<el-table-column v-else-if="col.name === 'domain'"
:key="col.name"
:label="$t(col.label)"
:width="col.width"
:align="col.align">
<template slot-scope="scope">
<a class="domain" :href="'http://' + scope.row[col.name]" target="_blank">
{{scope.row[col.name]}}
</a>
</template>
</el-table-column>
<el-table-column v-else
:key="col.name"
:property="col.name"
:label="$t(col.label)"
:sortable="col.sortable"
:align="col.align || 'center'"
:width="col.width">
</el-table-column>
</template>
<el-table-column :label="$t('Action')" align="left" width="120">
<template slot-scope="scope">
<el-tooltip :content="$t('View')" placement="top">
<el-button type="primary" icon="el-icon-search" size="mini" @click="onView(scope.row)"></el-button>
</el-tooltip>
<!--<el-tooltip :content="$t('Remove')" placement="top">-->
<!--<el-button type="danger" icon="el-icon-delete" size="mini" @click="onRemove(scope.row)"></el-button>-->
<!--</el-tooltip>-->
</template>
</el-table-column>
</el-table>
<div class="pagination">
<el-pagination
@current-change="onPageChange"
@size-change="onPageChange"
:current-page.sync="pageNum"
:page-sizes="[10, 20, 50, 100]"
:page-size.sync="pageSize"
layout="sizes, prev, pager, next"
:total="totalCount">
</el-pagination>
</div>
</div>
</template>
<script>
import {
mapState
} from 'vuex'
export default {
name: 'SiteList',
data () {
return {
categoryList: [
'新闻',
'搜索引擎',
'综合',
'金融',
'购物',
'社交',
'视频',
'音乐',
'资讯',
'政企官网',
'其他'
],
columns: [
{ name: 'rank', label: 'Rank', align: 'center', width: '80' },
{ name: 'name', label: 'Name', align: 'left', width: '120' },
{ name: 'domain', label: 'Domain', align: 'left', width: '150' },
{ name: 'description', label: 'Description', align: 'left' },
{ name: 'category', label: 'Category', align: 'center', width: '180' },
{ name: 'spider_count', label: 'Spider Count', align: 'center', width: '60' }
]
}
},
computed: {
...mapState('site', [
'filter',
'siteList',
'totalCount'
]),
keyword: {
get () {
return this.$store.state.site.keyword
},
set (value) {
this.$store.commit('site/SET_KEYWORD', value)
}
},
pageNum: {
get () {
return this.$store.state.site.pageNum
},
set (value) {
this.$store.commit('site/SET_PAGE_NUM', value)
}
},
pageSize: {
get () {
return this.$store.state.site.pageSize
},
set (value) {
this.$store.commit('site/SET_PAGE_SIZE', value)
}
}
},
methods: {
onSearch () {
this.$store.dispatch('site/getSiteList')
},
onPageChange () {
this.$store.dispatch('site/getSiteList')
},
onRowChange (row) {
this.$store.dispatch('site/editSite', {
id: row.domain,
category: row.category
})
}
},
created () {
this.$store.dispatch('site/getSiteList')
}
}
</script>
<style scoped>
.filter {
display: flex;
}
.filter .filter-search {
width: 180px;
}
.filter .filter-category {
width: 180px;
margin-left: 20px;
}
.filter .btn {
margin-left: 20px;
}
.table {
margin-top: 20px;
}
.table >>> .el-select .el-input__inner {
height: 32px;
}
.table >>> .el-select .el-select__caret {
line-height: 32px;
}
.table >>> .domain {
text-decoration: underline;
}
</style>

View File

@@ -160,8 +160,9 @@ export default {
// tableData,
columns: [
{ name: 'name', label: 'Name', width: 'auto' },
{ name: 'type', label: 'Spider Type', width: '160', sortable: true },
{ name: 'lang', label: 'Language', width: '160', sortable: true },
{ name: 'site_name', label: 'Site', width: '120' },
{ name: 'type', label: 'Spider Type', width: '120', sortable: true },
{ name: 'lang', label: 'Language', width: '120', sortable: true },
{ name: 'task_ts', label: 'Last Run', width: '160' },
{ name: 'last_7d_tasks', label: 'Last 7-Day Tasks', width: '80' },
{ name: 'last_5_errors', label: 'Last 5-Run Errors', width: '80' }

View File

View File

@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ChinazItem(scrapy.Item):
# define the fields for your item here like:
_id = scrapy.Field()
task_id = scrapy.Field()
name = scrapy.Field()
domain = scrapy.Field()
description = scrapy.Field()
rank = scrapy.Field()

View File

@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ChinazSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class ChinazDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)

View File

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'
class MongoPipeline(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
db = mongo[MONGO_DB]
col_name = os.environ.get('CRAWLAB_COLLECTION') or 'sites'
col = db[col_name]
def process_item(self, item, spider):
item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
item['_id'] = item['domain']
if self.col.find_one({'_id': item['_id']}) is None:
self.col.save(item)
return item
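A note on the pipeline above: documents are keyed by domain (_id = domain) and only inserted when absent, so re-running the spider does not duplicate sites. Collection.save is deprecated in recent pymongo releases; below is a sketch of the same insert-if-missing behaviour using update_one with $setOnInsert (a substitute for the deprecated call, not what the pipeline above uses):

import os
from pymongo import MongoClient

MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'


class MongoUpsertPipeline(object):
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT, connect=False)
    db = mongo[MONGO_DB]
    col = db[os.environ.get('CRAWLAB_COLLECTION') or 'sites']

    def process_item(self, item, spider):
        doc = dict(item)
        doc.pop('_id', None)  # _id is supplied by the filter below (the domain)
        doc['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        # $setOnInsert writes the document only when no site with this _id
        # exists yet; an existing document is left untouched.
        self.col.update_one({'_id': item['domain']},
                            {'$setOnInsert': doc},
                            upsert=True)
        return item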

View File

@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
# Scrapy settings for chinaz project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'chinaz'
SPIDER_MODULES = ['chinaz.spiders']
NEWSPIDER_MODULE = 'chinaz.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'chinaz (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'chinaz.middlewares.ChinazSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'chinaz.middlewares.ChinazDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'chinaz.pipelines.MongoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
import scrapy
from chinaz.items import ChinazItem
class ChinazSpiderSpider(scrapy.Spider):
name = 'chinaz_spider'
allowed_domains = ['chinaz.com']
start_urls = ['http://top.chinaz.com/hangye/']
def parse(self, response):
for item in response.css('.listCentent > li'):
name = item.css('h3.rightTxtHead > a::text').extract_first()
domain = item.css('h3.rightTxtHead > span::text').extract_first()
description = item.css('p.RtCInfo::text').extract_first()
rank = item.css('.RtCRateCent > strong::text').extract_first()
rank = int(rank)
yield ChinazItem(
_id=domain,
name=name,
domain=domain,
description=description,
rank=rank,
)
# pagination
a_list = response.css('.ListPageWrap > a::attr("href")').extract()
url = 'http://top.chinaz.com/hangye/' + a_list[-1]
yield scrapy.Request(url=url)
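Crawlab normally launches this spider as a task; for local testing it can also be driven from Python with Scrapy's CrawlerProcess. A sketch, assuming it runs from the spiders/chinaz project directory so get_project_settings() picks up chinaz.settings, with example values for the env vars the pipeline reads:

import os
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Read by chinaz.pipelines.MongoPipeline; the values here are examples.
os.environ.setdefault('MONGO_HOST', 'localhost')
os.environ.setdefault('CRAWLAB_COLLECTION', 'sites')
os.environ.setdefault('CRAWLAB_TASK_ID', 'local-test')

process = CrawlerProcess(get_project_settings())
process.crawl('chinaz_spider')  # the name defined on ChinazSpiderSpider
process.start()                 # blocks until the crawl finishes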

spiders/chinaz/scrapy.cfg Normal file
View File

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = chinaz.settings
[deploy]
#url = http://localhost:6800/
project = chinaz

View File

spiders/jd/jd/items.py Normal file
View File

@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class JdItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field()
price = scrapy.Field()

View File

@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class JdSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class JdDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)

View File

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'
class JdPipeline(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
db = mongo[MONGO_DB]
col_name = os.environ.get('CRAWLAB_COLLECTION') or 'jd_products'
col = db[col_name]
def process_item(self, item, spider):
return item

spiders/jd/jd/settings.py Normal file
View File

@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
# Scrapy settings for jd project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'jd'
SPIDER_MODULES = ['jd.spiders']
NEWSPIDER_MODULE = 'jd.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jd (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'jd.middlewares.JdSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'jd.middlewares.JdDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'jd.pipelines.JdPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
import scrapy
class JdSpiderSpider(scrapy.Spider):
name = 'jd_spider'
allowed_domains = ['jd.com']
start_urls = ['http://jd.com/']
def parse(self, response):
pass
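The jd spider is still the generated skeleton (its parse method only contains pass). A hedged sketch of a parse method that fills JdItem's name and price fields; the CSS selectors are placeholders, not actual jd.com markup:

# -*- coding: utf-8 -*-
import scrapy
from jd.items import JdItem


class JdSpiderSketch(scrapy.Spider):
    """Illustrative only: the selectors below are hypothetical placeholders."""
    name = 'jd_spider_sketch'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

    def parse(self, response):
        # A real implementation would target jd.com's actual product-list markup.
        for product in response.css('.product-item'):
            yield JdItem(
                name=product.css('.product-name::text').get(),
                price=product.css('.product-price::text').get(),
            )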

spiders/jd/scrapy.cfg Normal file
View File

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = jd.settings
[deploy]
#url = http://localhost:6800/
project = jd