From 967981d3e3b2ade3f4368d242540a9eb38651858 Mon Sep 17 00:00:00 2001 From: Marvin Zhang Date: Wed, 22 May 2019 22:33:28 +0800 Subject: [PATCH] updated sites --- crawlab/app.py | 3 +- crawlab/db/manager.py | 4 +++ crawlab/routes/sites.py | 18 ++++++++++ spiders/chinaz/chinaz/items.py | 3 ++ .../chinaz/chinaz/spiders/chinaz_spider.py | 35 +++++++++++++++++-- 5 files changed, 60 insertions(+), 3 deletions(-) diff --git a/crawlab/app.py b/crawlab/app.py index c984f140..3a60c19e 100644 --- a/crawlab/app.py +++ b/crawlab/app.py @@ -71,7 +71,8 @@ api.add_resource(ScheduleApi, '/api/schedules/') api.add_resource(SiteApi, '/api/sites', - '/api/sites/') + '/api/sites/', + '/api/sites/get/') def monitor_nodes_status(celery_app): diff --git a/crawlab/db/manager.py b/crawlab/db/manager.py index 4c5535e7..e80aada6 100644 --- a/crawlab/db/manager.py +++ b/crawlab/db/manager.py @@ -179,5 +179,9 @@ class DbManager(object): col = self.db[col_name] col.create_index(keys=keys, **kwargs) + def distinct(self, col_name: str, key: str, filter: dict): + col = self.db[col_name] + return sorted(col.distinct(key, filter)) + db_manager = DbManager() diff --git a/crawlab/routes/sites.py b/crawlab/routes/sites.py index 6874af2b..e49dbe37 100644 --- a/crawlab/routes/sites.py +++ b/crawlab/routes/sites.py @@ -13,6 +13,7 @@ class SiteApi(BaseApi): arguments = ( ('keyword', str), + ('main_category', str), ('category', str), ) @@ -70,3 +71,20 @@ class SiteApi(BaseApi): 'page_size': page_size, 'items': jsonify(sites) } + + def get_main_category_list(self, id): + return { + 'status': 'ok', + 'items': db_manager.distinct(col_name=self.col_name, key='main_category', filter={}) + } + + def get_category_list(self, id): + args = self.parser.parse_args() + filter_ = {} + if args.get('main_category') is not None: + filter_['main_category'] = args.get('main_category') + return { + 'status': 'ok', + 'items': db_manager.distinct(col_name=self.col_name, key='category', + filter=filter_) + } diff --git a/spiders/chinaz/chinaz/items.py b/spiders/chinaz/chinaz/items.py index dbec9f33..1fdcac1b 100644 --- a/spiders/chinaz/chinaz/items.py +++ b/spiders/chinaz/chinaz/items.py @@ -16,3 +16,6 @@ class ChinazItem(scrapy.Item): domain = scrapy.Field() description = scrapy.Field() rank = scrapy.Field() + main_category = scrapy.Field() + category = scrapy.Field() + location = scrapy.Field() diff --git a/spiders/chinaz/chinaz/spiders/chinaz_spider.py b/spiders/chinaz/chinaz/spiders/chinaz_spider.py index 2359daa9..b2d0e24c 100644 --- a/spiders/chinaz/chinaz/spiders/chinaz_spider.py +++ b/spiders/chinaz/chinaz/spiders/chinaz_spider.py @@ -11,19 +11,50 @@ class ChinazSpiderSpider(scrapy.Spider): def parse(self, response): for item in response.css('.listCentent > li'): name = item.css('h3.rightTxtHead > a::text').extract_first() + href = item.css('h3.rightTxtHead > a::attr("href")').extract_first() domain = item.css('h3.rightTxtHead > span::text').extract_first() description = item.css('p.RtCInfo::text').extract_first() rank = item.css('.RtCRateCent > strong::text').extract_first() rank = int(rank) - yield ChinazItem( + item = ChinazItem( _id=domain, name=name, domain=domain, description=description, rank=rank, ) + yield scrapy.Request( + url='http://top.chinaz.com' + href, + callback=self.parse_item, + meta={ + 'item': item + } + ) # pagination a_list = response.css('.ListPageWrap > a::attr("href")').extract() url = 'http://top.chinaz.com/hangye/' + a_list[-1] - yield scrapy.Request(url=url) + yield scrapy.Request(url=url, callback=self.parse) + + def parse_item(self, response): + item = response.meta['item'] + + # category info extraction + arr = response.css('.TopMainTag-show .SimSun') + res1 = arr[0].css('a::text').extract() + main_category = res1[0] + if len(res1) == 1: + category = '其他' + else: + category = res1[1] + + # location info extraction + res2 = arr[1].css('a::text').extract() + location = res2[0] + + # assign values to item + item['main_category'] = main_category + item['category'] = category + item['location'] = location + + yield item