Mirror of https://github.com/crawlab-team/crawlab.git (synced 2026-01-24 17:41:03 +01:00).
Commit message: "updated sites". This commit is contained in:
@@ -71,7 +71,8 @@ api.add_resource(ScheduleApi,
|
||||
'/api/schedules/<string:id>')
|
||||
api.add_resource(SiteApi,
|
||||
'/api/sites',
|
||||
'/api/sites/<string:id>')
|
||||
'/api/sites/<string:id>',
|
||||
'/api/sites/get/<string:action>')
|
||||
|
||||
|
||||
def monitor_nodes_status(celery_app):
|
||||
|
||||
@@ -179,5 +179,9 @@ class DbManager(object):
|
||||
col = self.db[col_name]
|
||||
col.create_index(keys=keys, **kwargs)
|
||||
|
||||
def distinct(self, col_name: str, key: str, filter: dict):
    """Return the distinct values of *key* in collection *col_name*, sorted.

    ``filter`` is the MongoDB query document passed straight through to
    ``Collection.distinct`` (name kept for interface compatibility even
    though it shadows the builtin).
    """
    values = self.db[col_name].distinct(key, filter)
    return sorted(values)
|
||||
|
||||
|
||||
db_manager = DbManager()
|
||||
|
||||
@@ -13,6 +13,7 @@ class SiteApi(BaseApi):
|
||||
|
||||
# Query-string parameters accepted by this resource, as (name, type) pairs.
arguments = (
    ('keyword', str), ('main_category', str), ('category', str),
)
|
||||
|
||||
@@ -70,3 +71,20 @@ class SiteApi(BaseApi):
|
||||
'page_size': page_size,
|
||||
'items': jsonify(sites)
|
||||
}
|
||||
|
||||
def get_main_category_list(self, id):
    """Return every distinct ``main_category`` value across all sites.

    ``id`` is part of the routed-method signature and is unused here.
    """
    categories = db_manager.distinct(col_name=self.col_name, key='main_category', filter={})
    return {
        'status': 'ok',
        'items': categories,
    }
|
||||
|
||||
def get_category_list(self, id):
    """Return distinct ``category`` values, optionally restricted to one
    ``main_category`` taken from the parsed request arguments.

    ``id`` is part of the routed-method signature and is unused here.
    """
    args = self.parser.parse_args()
    main_category = args.get('main_category')
    # Narrow the query only when the caller actually supplied a main category.
    filter_ = {} if main_category is None else {'main_category': main_category}
    return {
        'status': 'ok',
        'items': db_manager.distinct(col_name=self.col_name, key='category',
                                     filter=filter_),
    }
|
||||
|
||||
@@ -16,3 +16,6 @@ class ChinazItem(scrapy.Item):
|
||||
domain = scrapy.Field()
|
||||
description = scrapy.Field()
|
||||
rank = scrapy.Field()
|
||||
main_category = scrapy.Field()
|
||||
category = scrapy.Field()
|
||||
location = scrapy.Field()
|
||||
|
||||
@@ -11,19 +11,50 @@ class ChinazSpiderSpider(scrapy.Spider):
|
||||
def parse(self, response):
    """Parse a chinaz.com ranking list page.

    For each listed site, builds a partially-filled ChinazItem and yields a
    request for the site's detail page (the item rides along in ``meta`` and
    is completed by ``parse_item``). Also yields a request for the next list
    page found in the pager.
    """
    for entry in response.css('.listCentent > li'):
        name = entry.css('h3.rightTxtHead > a::text').extract_first()
        href = entry.css('h3.rightTxtHead > a::attr("href")').extract_first()
        domain = entry.css('h3.rightTxtHead > span::text').extract_first()
        description = entry.css('p.RtCInfo::text').extract_first()
        rank = entry.css('.RtCRateCent > strong::text').extract_first()
        # NOTE(review): extract_first() returns None when the selector matches
        # nothing, which would make int() raise — assumes every row has a
        # rank cell; confirm against the live markup.
        rank = int(rank)
        # Renamed from `item` in the original: rebinding the loop variable
        # inside its own loop shadowed the selector object confusingly.
        site_item = ChinazItem(
            _id=domain,
            name=name,
            domain=domain,
            description=description,
            rank=rank,
        )
        yield scrapy.Request(
            url='http://top.chinaz.com' + href,
            callback=self.parse_item,
            meta={
                'item': site_item
            }
        )

    # pagination: follow the last anchor in the pager (the "next" link)
    a_list = response.css('.ListPageWrap > a::attr("href")').extract()
    if a_list:  # guard: the last page may have no pager links (was IndexError)
        url = 'http://top.chinaz.com/hangye/' + a_list[-1]
        yield scrapy.Request(url=url, callback=self.parse)
|
||||
|
||||
def parse_item(self, response):
    """Complete the ChinazItem carried in ``response.meta`` with the
    category and location read from the site detail page, then yield it.
    """
    item = response.meta['item']

    tag_blocks = response.css('.TopMainTag-show .SimSun')

    # category info extraction: first block holds main category (+ optional
    # sub-category; '其他' = "other" when absent)
    category_texts = tag_blocks[0].css('a::text').extract()
    main_category = category_texts[0]
    category = category_texts[1] if len(category_texts) > 1 else '其他'

    # location info extraction: second block's first anchor
    location = tag_blocks[1].css('a::text').extract()[0]

    # assign values to item
    item['main_category'] = main_category
    item['category'] = category
    item['location'] = location

    yield item
|
||||
|
||||
Reference in New Issue
Block a user