updated sites

This commit is contained in:
Marvin Zhang
2019-05-22 22:33:28 +08:00
parent 2e01c6a08f
commit 967981d3e3
5 changed files with 60 additions and 3 deletions

View File

@@ -71,7 +71,8 @@ api.add_resource(ScheduleApi,
'/api/schedules/<string:id>')
# Site endpoints: the collection, a single site by id, and named
# actions addressed as /api/sites/get/<action>.
api.add_resource(SiteApi,
                 '/api/sites',
                 '/api/sites/<string:id>',
                 '/api/sites/get/<string:action>')
def monitor_nodes_status(celery_app):

View File

@@ -179,5 +179,9 @@ class DbManager(object):
col = self.db[col_name]
col.create_index(keys=keys, **kwargs)
def distinct(self, col_name: str, key: str, filter: dict = None):
    """
    Return the sorted distinct values of a field in a collection.

    Values are sorted in ascending order. A None value (for example a
    field stored as null) is placed last instead of making the sort
    raise TypeError.
    :param col_name: collection name
    :param key: field whose distinct values are collected
    :param filter: optional query restricting the documents considered;
        omitted or None means no restriction
    :return: sorted list of distinct values
    """
    col = self.db[col_name]
    values = col.distinct(key, {} if filter is None else filter)
    # None cannot be ordered against str/int in Python 3, so rank on
    # "is None" first; the values themselves are only compared with each
    # other when neither of them is None.
    return sorted(values, key=lambda value: (value is None, value))
# Shared DbManager instance; the API code refers to it by the name `db_manager`.
db_manager = DbManager()

View File

@@ -13,6 +13,7 @@ class SiteApi(BaseApi):
# Query arguments accepted by this API, as (name, type) pairs.
# NOTE(review): presumably consumed by BaseApi to build self.parser - confirm.
arguments = (
('keyword', str),
('main_category', str),
('category', str),
)
@@ -70,3 +71,20 @@ class SiteApi(BaseApi):
'page_size': page_size,
'items': jsonify(sites)
}
def get_main_category_list(self, id):
    """
    List every distinct main category found in this API's collection.
    :param id: not used by this method
    :return: dict with 'status' set to 'ok' and the values under 'items'
    """
    main_categories = db_manager.distinct(
        col_name=self.col_name,
        key='main_category',
        filter={},
    )
    return {'status': 'ok', 'items': main_categories}
def get_category_list(self, id):
    """
    List the distinct categories, optionally limited to one main category.

    The 'main_category' request argument, when supplied, restricts the
    lookup to documents with that main category.
    :param id: not used by this method
    :return: dict with 'status' set to 'ok' and the values under 'items'
    """
    main_category = self.parser.parse_args().get('main_category')
    query = {} if main_category is None else {'main_category': main_category}
    categories = db_manager.distinct(
        col_name=self.col_name,
        key='category',
        filter=query,
    )
    return {'status': 'ok', 'items': categories}

View File

@@ -16,3 +16,6 @@ class ChinazItem(scrapy.Item):
domain = scrapy.Field()
description = scrapy.Field()
rank = scrapy.Field()
# The three fields below are assigned by the spider's parse_item callback.
main_category = scrapy.Field()
category = scrapy.Field()
location = scrapy.Field()

View File

@@ -11,19 +11,50 @@ class ChinazSpiderSpider(scrapy.Spider):
def parse(self, response):
    """
    Parse one ranking list page.

    For every listed site a partially filled ChinazItem is built and a
    request for the site's detail page is issued with that item in the
    request meta, so that parse_item can complete it. Afterwards the last
    pagination link of the page is followed with this method as callback.
    :param response: response of a list page
    :return: generator of requests (and of items that have no detail link)
    """
    for entry in response.css('.listCentent > li'):
        name = entry.css('h3.rightTxtHead > a::text').extract_first()
        href = entry.css('h3.rightTxtHead > a::attr("href")').extract_first()
        domain = entry.css('h3.rightTxtHead > span::text').extract_first()
        description = entry.css('p.RtCInfo::text').extract_first()
        rank_text = entry.css('.RtCRateCent > strong::text').extract_first()
        try:
            rank = int(rank_text)
        except (TypeError, ValueError):
            # A missing or non-numeric rank must not abort the whole page
            # (and with it the pagination request below).
            rank = None

        site = ChinazItem(
            _id=domain,
            name=name,
            domain=domain,
            description=description,
            rank=rank,
        )

        if href is None:
            # No detail link to follow: emit what was scraped so far.
            yield site
            continue

        yield scrapy.Request(
            url='http://top.chinaz.com' + href,
            callback=self.parse_item,
            meta={
                'item': site
            }
        )

    # pagination
    a_list = response.css('.ListPageWrap > a::attr("href")').extract()
    if a_list:
        url = 'http://top.chinaz.com/hangye/' + a_list[-1]
        yield scrapy.Request(url=url, callback=self.parse)
def parse_item(self, response):
    """
    Complete the item carried in the request meta with detail-page data.

    Reads the tag groups matched by '.TopMainTag-show .SimSun': the link
    texts of the first group give the main category and the category,
    the first link text of the second group gives the location. Fields
    whose data is not present on the page are left unset, so the item is
    still emitted instead of being lost to an IndexError.
    :param response: response of a site detail page; response.meta['item']
        holds the item created for that site
    :return: generator yielding the completed item
    """
    item = response.meta['item']
    tags = response.css('.TopMainTag-show .SimSun')

    # category info extraction
    categories = tags[0].css('a::text').extract() if len(tags) > 0 else []
    if categories:
        item['main_category'] = categories[0]
        # '其他' ("other") is stored when the page names no sub-category
        item['category'] = categories[1] if len(categories) > 1 else '其他'

    # location info extraction
    locations = tags[1].css('a::text').extract() if len(tags) > 1 else []
    if locations:
        item['location'] = locations[0]

    yield item