From 8de15565fda4b88dc84c43b54feaab9457ab58fe Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Tue, 23 Jul 2019 21:10:52 +0800
Subject: [PATCH] code cleanup

---
 .../juejin_spider 2.js                        | 82 ------------------
 .../5ca468298ffaed82c65f0f90/juejin_spider.js | 82 ------------------
 .../juejin/__init__.py                        |  0
 .../5ca46b508ffaed82c65f0f9c/juejin/items.py  | 17 ----
 .../juejin/middlewares.py                     | 56 -------------
 .../juejin/pipelines.py                       | 25 ------
 .../juejin/settings.py                        | 89 -------------------
 .../juejin/spiders/__init__ 2.py              |  4 -
 .../juejin/spiders/__init__.py                |  4 -
 .../juejin/spiders/juejin_spider.py           | 17 ----
 .../5ca46b508ffaed82c65f0f9c/scrapy.cfg       | 11 ---
 deployfile/5ca46b508ffaed82c65f0f9c/start.py  |  2 -
 .../juejin_spider 2.js                        | 82 ------------------
 .../5ca46b508ffaed82c65f0fa0/juejin_spider.js | 82 ------------------
 14 files changed, 553 deletions(-)
 delete mode 100644 deployfile/5ca468298ffaed82c65f0f90/juejin_spider 2.js
 delete mode 100644 deployfile/5ca468298ffaed82c65f0f90/juejin_spider.js
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0f9c/juejin/__init__.py
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0f9c/juejin/items.py
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0f9c/juejin/middlewares.py
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0f9c/juejin/pipelines.py
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0f9c/juejin/settings.py
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__ 2.py
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__.py
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/juejin_spider.py
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0f9c/scrapy.cfg
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0f9c/start.py
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider 2.js
 delete mode 100644 deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider.js

diff --git a/deployfile/5ca468298ffaed82c65f0f90/juejin_spider 2.js b/deployfile/5ca468298ffaed82c65f0f90/juejin_spider 2.js
deleted file mode 100644
index 7a02f5a2..00000000
--- a/deployfile/5ca468298ffaed82c65f0f90/juejin_spider 2.js
+++ /dev/null
@@ -1,82 +0,0 @@
-const puppeteer = require('puppeteer');
-const MongoClient = require('mongodb').MongoClient;
-
-(async () => {
-    // browser
-    const browser = await (puppeteer.launch({
-        headless: true
-    }));
-
-    // define start url
-    const url = 'https://juejin.im';
-
-    // start a new page
-    const page = await browser.newPage();
-
-    // navigate to url
-    try {
-        await page.goto(url, {waitUntil: 'domcontentloaded'});
-        await page.waitFor(2000);
-    } catch (e) {
-        console.error(e);
-
-        // close browser
-        browser.close();
-
-        // exit code 1 indicating an error happened
-        code = 1;
-        process.emit("exit ");
-        process.reallyExit(code);
-
-        return
-    }
-
-    // scroll down to fetch more data
-    for (let i = 0; i < 100; i++) {
-        console.log('Pressing PageDown...');
-        await page.keyboard.press('PageDown', 200);
-        await page.waitFor(100);
-    }
-
-    // scrape data
-    const results = await page.evaluate(() => {
-        let results = [];
-        document.querySelectorAll('.entry-list > .item').forEach(el => {
-            if (!el.querySelector('.title')) return;
-            results.push({
-                url: 'https://juejin.com' + el.querySelector('.title').getAttribute('href'),
-                title: el.querySelector('.title').innerText
-            });
-        });
-        return results;
-    });
-
-    // open database connection
-    const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
-    let db = await client.db('crawlab_test');
-    const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
-    const taskId = process.env.CRAWLAB_TASK_ID;
-    const col = db.collection(colName);
-
-    // save to database
-    for (let i = 0; i < results.length; i++) {
-        // de-duplication
-        const r = await col.findOne({url: results[i]});
-        if (r) continue;
-
-        // assign taskID
-        results[i].task_id = taskId;
-        results[i].source = 'juejin';
-
-        // insert row
-        await col.insertOne(results[i]);
-    }
-
-    console.log(`results.length: ${results.length}`);
-
-    // close database connection
-    client.close();
-
-    // shutdown browser
-    browser.close();
-})();
\ No newline at end of file
diff --git a/deployfile/5ca468298ffaed82c65f0f90/juejin_spider.js b/deployfile/5ca468298ffaed82c65f0f90/juejin_spider.js
deleted file mode 100644
index 3cf2bcac..00000000
--- a/deployfile/5ca468298ffaed82c65f0f90/juejin_spider.js
+++ /dev/null
@@ -1,82 +0,0 @@
-const puppeteer = require('puppeteer');
-const MongoClient = require('mongodb').MongoClient;
-
-(async () => {
-    // browser
-    const browser = await (puppeteer.launch({
-        headless: true
-    }));
-
-    // define start url
-    const url = 'https://juejin.im';
-
-    // start a new page
-    const page = await browser.newPage();
-
-    // navigate to url
-    try {
-        await page.goto(url, {waitUntil: 'domcontentloaded'});
-        await page.waitFor(2000);
-    } catch (e) {
-        console.error(e);
-
-        // close browser
-        browser.close();
-
-        // exit code 1 indicating an error happened
-        code = 1;
-        process.emit("exit ");
-        process.reallyExit(code);
-
-        return
-    }
-
-    // scroll down to fetch more data
-    for (let i = 0; i < 100; i++) {
-        console.log('Pressing PageDown...');
-        await page.keyboard.press('PageDown', 200);
-        await page.waitFor(100);
-    }
-
-    // scrape data
-    const results = await page.evaluate(() => {
-        let results = [];
-        document.querySelectorAll('.entry-list > .item').forEach(el => {
-            if (!el.querySelector('.title')) return;
-            results.push({
-                url: 'https://juejin.com' + el.querySelector('.title').getAttribute('href'),
-                title: el.querySelector('.title').innerText
-            });
-        });
-        return results;
-    });
-
-    // open database connection
-    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
-    let db = await client.db('crawlab_test');
-    const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
-    const taskId = process.env.CRAWLAB_TASK_ID;
-    const col = db.collection(colName);
-
-    // save to database
-    for (let i = 0; i < results.length; i++) {
-        // de-duplication
-        const r = await col.findOne({url: results[i]});
-        if (r) continue;
-
-        // assign taskID
-        results[i].task_id = taskId;
-        results[i].source = 'juejin';
-
-        // insert row
-        await col.insertOne(results[i]);
-    }
-
-    console.log(`results.length: ${results.length}`);
-
-    // close database connection
-    client.close();
-
-    // shutdown browser
-    browser.close();
-})();
\ No newline at end of file
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/__init__.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/items.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/items.py
deleted file mode 100644
index 2c4717dd..00000000
--- a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/items.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your scraped items
-#
-# See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
-
-import scrapy
-
-
-class JuejinItem(scrapy.Item):
-    # define the fields for your item here like:
-    _id = scrapy.Field()
-    title = scrapy.Field()
-    link = scrapy.Field()
-    like = scrapy.Field()
-    task_id = scrapy.Field()
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/middlewares.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/middlewares.py
deleted file mode 100644
index 9d5225a2..00000000
--- a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/middlewares.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your spider middleware
-#
-# See documentation in:
-# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
-
-from scrapy import signals
-
-
-class JuejinSpiderMiddleware(object):
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the spider middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_spider_input(self, response, spider):
-        # Called for each response that goes through the spider
-        # middleware and into the spider.
-
-        # Should return None or raise an exception.
-        return None
-
-    def process_spider_output(self, response, result, spider):
-        # Called with the results returned from the Spider, after
-        # it has processed the response.
-
-        # Must return an iterable of Request, dict or Item objects.
-        for i in result:
-            yield i
-
-    def process_spider_exception(self, response, exception, spider):
-        # Called when a spider or process_spider_input() method
-        # (from other spider middleware) raises an exception.
-
-        # Should return either None or an iterable of Response, dict
-        # or Item objects.
-        pass
-
-    def process_start_requests(self, start_requests, spider):
-        # Called with the start requests of the spider, and works
-        # similarly to the process_spider_output() method, except
-        # that it doesn’t have a response associated.
-
-        # Must return only requests (not items).
-        for r in start_requests:
-            yield r
-
-    def spider_opened(self, spider):
-        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/pipelines.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/pipelines.py
deleted file mode 100644
index b34aac50..00000000
--- a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/pipelines.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-MONGO_HOST = '127.0.0.1'
-MONGO_PORT = 27017
-MONGO_DB = 'crawlab_test'
-
-
-class JuejinPipeline(object):
-    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
-    db = mongo[MONGO_DB]
-    col_name = os.environ.get('CRAWLAB_COLLECTION','test')
-    col = db[col_name]
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        self.col.save(item)
-        return item
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/settings.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/settings.py
deleted file mode 100644
index 44f8866c..00000000
--- a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/settings.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Scrapy settings for juejin project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-#     http://doc.scrapy.org/en/latest/topics/settings.html
-#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-
-BOT_NAME = 'juejin'
-
-SPIDER_MODULES = ['juejin.spiders']
-NEWSPIDER_MODULE = 'juejin.spiders'
-
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
-
-# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
-
-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-# CONCURRENT_REQUESTS = 32
-
-# Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-# DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-# CONCURRENT_REQUESTS_PER_DOMAIN = 16
-# CONCURRENT_REQUESTS_PER_IP = 16
-
-# Disable cookies (enabled by default)
-# COOKIES_ENABLED = False
-
-# Disable Telnet Console (enabled by default)
-# TELNETCONSOLE_ENABLED = False
-
-# Override the default request headers:
-# DEFAULT_REQUEST_HEADERS = {
-#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-#   'Accept-Language': 'en',
-# }
-
-# Enable or disable spider middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-# SPIDER_MIDDLEWARES = {
-#    'juejin.middlewares.JuejinSpiderMiddleware': 543,
-# }
-
-# Enable or disable downloader middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-# DOWNLOADER_MIDDLEWARES = {
-#    'juejin.middlewares.MyCustomDownloaderMiddleware': 543,
-# }
-
-# Enable or disable extensions
-# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-# EXTENSIONS = {
-#    'scrapy.extensions.telnet.TelnetConsole': None,
-# }
-
-# Configure item pipelines
-# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    'juejin.pipelines.JuejinPipeline': 300,
-}
-
-# Enable and configure the AutoThrottle extension (disabled by default)
-# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-# AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-# AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-# AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-# AUTOTHROTTLE_DEBUG = False
-
-# Enable and configure HTTP caching (disabled by default)
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-# HTTPCACHE_ENABLED = True
-# HTTPCACHE_EXPIRATION_SECS = 0
-# HTTPCACHE_DIR = 'httpcache'
-# HTTPCACHE_IGNORE_HTTP_CODES = []
-# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__ 2.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__ 2.py
deleted file mode 100644
index ebd689ac..00000000
--- a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__ 2.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__.py
deleted file mode 100644
index ebd689ac..00000000
--- a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/juejin_spider.py b/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/juejin_spider.py
deleted file mode 100644
index 28df5be7..00000000
--- a/deployfile/5ca46b508ffaed82c65f0f9c/juejin/spiders/juejin_spider.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# -*- coding: utf-8 -*-
-import scrapy
-from juejin.items import JuejinItem
-
-
-class JuejinSpiderSpider(scrapy.Spider):
-    name = 'juejin_spider'
-    allowed_domains = ['juejin.com']
-    start_urls = ['https://juejin.im/search?query=celery']
-
-    def parse(self, response):
-        for item in response.css('ul.main-list > li.item'):
-            yield JuejinItem(
-                title=item.css('.title span').extract_first(),
-                link=item.css('a::attr("href")').extract_first(),
-                like=item.css('.like .count::text').extract_first(),
-            )
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/scrapy.cfg b/deployfile/5ca46b508ffaed82c65f0f9c/scrapy.cfg
deleted file mode 100644
index 38ba44f1..00000000
--- a/deployfile/5ca46b508ffaed82c65f0f9c/scrapy.cfg
+++ /dev/null
@@ -1,11 +0,0 @@
-# Automatically created by: scrapy startproject
-#
-# For more information about the [deploy] section see:
-# https://scrapyd.readthedocs.org/en/latest/deploy.html
-
-[settings]
-default = juejin.settings
-
-[deploy]
-#url = http://localhost:6800/
-project = juejin
diff --git a/deployfile/5ca46b508ffaed82c65f0f9c/start.py b/deployfile/5ca46b508ffaed82c65f0f9c/start.py
deleted file mode 100644
index ec2f47dd..00000000
--- a/deployfile/5ca46b508ffaed82c65f0f9c/start.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from scrapy import cmdline
-cmdline.execute(["scrapy","crawl","juejin_spider"])
\ No newline at end of file
diff --git a/deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider 2.js b/deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider 2.js
deleted file mode 100644
index 3cf2bcac..00000000
--- a/deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider 2.js
+++ /dev/null
@@ -1,82 +0,0 @@
-const puppeteer = require('puppeteer');
-const MongoClient = require('mongodb').MongoClient;
-
-(async () => {
-    // browser
-    const browser = await (puppeteer.launch({
-        headless: true
-    }));
-
-    // define start url
-    const url = 'https://juejin.im';
-
-    // start a new page
-    const page = await browser.newPage();
-
-    // navigate to url
-    try {
-        await page.goto(url, {waitUntil: 'domcontentloaded'});
-        await page.waitFor(2000);
-    } catch (e) {
-        console.error(e);
-
-        // close browser
-        browser.close();
-
-        // exit code 1 indicating an error happened
-        code = 1;
-        process.emit("exit ");
-        process.reallyExit(code);
-
-        return
-    }
-
-    // scroll down to fetch more data
-    for (let i = 0; i < 100; i++) {
-        console.log('Pressing PageDown...');
-        await page.keyboard.press('PageDown', 200);
-        await page.waitFor(100);
-    }
-
-    // scrape data
-    const results = await page.evaluate(() => {
-        let results = [];
-        document.querySelectorAll('.entry-list > .item').forEach(el => {
-            if (!el.querySelector('.title')) return;
-            results.push({
-                url: 'https://juejin.com' + el.querySelector('.title').getAttribute('href'),
-                title: el.querySelector('.title').innerText
-            });
-        });
-        return results;
-    });
-
-    // open database connection
-    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
-    let db = await client.db('crawlab_test');
-    const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
-    const taskId = process.env.CRAWLAB_TASK_ID;
-    const col = db.collection(colName);
-
-    // save to database
-    for (let i = 0; i < results.length; i++) {
-        // de-duplication
-        const r = await col.findOne({url: results[i]});
-        if (r) continue;
-
-        // assign taskID
-        results[i].task_id = taskId;
-        results[i].source = 'juejin';
-
-        // insert row
-        await col.insertOne(results[i]);
-    }
-
-    console.log(`results.length: ${results.length}`);
-
-    // close database connection
-    client.close();
-
-    // shutdown browser
-    browser.close();
-})();
\ No newline at end of file
diff --git a/deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider.js b/deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider.js
deleted file mode 100644
index 3cf2bcac..00000000
--- a/deployfile/5ca46b508ffaed82c65f0fa0/juejin_spider.js
+++ /dev/null
@@ -1,82 +0,0 @@
-const puppeteer = require('puppeteer');
-const MongoClient = require('mongodb').MongoClient;
-
-(async () => {
-    // browser
-    const browser = await (puppeteer.launch({
-        headless: true
-    }));
-
-    // define start url
-    const url = 'https://juejin.im';
-
-    // start a new page
-    const page = await browser.newPage();
-
-    // navigate to url
-    try {
-        await page.goto(url, {waitUntil: 'domcontentloaded'});
-        await page.waitFor(2000);
-    } catch (e) {
-        console.error(e);
-
-        // close browser
-        browser.close();
-
-        // exit code 1 indicating an error happened
-        code = 1;
-        process.emit("exit ");
-        process.reallyExit(code);
-
-        return
-    }
-
-    // scroll down to fetch more data
-    for (let i = 0; i < 100; i++) {
-        console.log('Pressing PageDown...');
-        await page.keyboard.press('PageDown', 200);
-        await page.waitFor(100);
-    }
-
-    // scrape data
-    const results = await page.evaluate(() => {
-        let results = [];
-        document.querySelectorAll('.entry-list > .item').forEach(el => {
-            if (!el.querySelector('.title')) return;
-            results.push({
-                url: 'https://juejin.com' + el.querySelector('.title').getAttribute('href'),
-                title: el.querySelector('.title').innerText
-            });
-        });
-        return results;
-    });
-
-    // open database connection
-    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
-    let db = await client.db('crawlab_test');
-    const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
-    const taskId = process.env.CRAWLAB_TASK_ID;
-    const col = db.collection(colName);
-
-    // save to database
-    for (let i = 0; i < results.length; i++) {
-        // de-duplication
-        const r = await col.findOne({url: results[i]});
-        if (r) continue;
-
-        // assign taskID
-        results[i].task_id = taskId;
-        results[i].source = 'juejin';
-
-        // insert row
-        await col.insertOne(results[i]);
-    }
-
-    console.log(`results.length: ${results.length}`);
-
-    // close database connection
-    client.close();
-
-    // shutdown browser
-    browser.close();
-})();
\ No newline at end of file
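
Note (outside the diff): the four deleted Puppeteer scripts are copies of one spider (only the MongoDB host differs in the first), and they share a de-duplication bug: col.findOne({url: results[i]}) passes the whole result object where the URL string belongs, so the lookup never matches and every run re-inserts the same rows. The error path is also non-idiomatic: process.emit("exit "); process.reallyExit(code) (note the stray space in the event name, so the 'exit' event never fires) can simply be process.exit(1), and Puppeteer's keyboard.press takes an options object, i.e. press('PageDown', {delay: 200}) rather than a bare number. A minimal sketch of a corrected save loop, assuming the same col, results, and taskId bindings as in the deleted scripts (saveResults is a hypothetical helper, not part of the original code):

    // Hypothetical helper: col is a MongoDB collection, results the
    // scraped items, taskId the Crawlab task id.
    async function saveResults(col, results, taskId) {
        for (const result of results) {
            // de-duplicate on the url field; passing the whole object
            // to findOne() never matches an existing document
            const existing = await col.findOne({url: result.url});
            if (existing) continue;

            result.task_id = taskId;
            result.source = 'juejin';
            await col.insertOne(result);
        }
    }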