From 9aeff8181c8ed9daafafa4e4b15bf63303e50758 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Fri, 15 Mar 2019 20:22:09 +0800
Subject: [PATCH] added spiders

---
 spiders/csdn/csdn_spider.js                 | 82 +++++++++++++++++++++
 spiders/juejin_node/juejin_spider.js        | 81 ++++++++++++++++++++
 spiders/segmentfault/package.json           | 15 ----
 spiders/segmentfault/segmentfault_spider.js | 18 +++--
 4 files changed, 175 insertions(+), 21 deletions(-)
 create mode 100644 spiders/csdn/csdn_spider.js
 create mode 100644 spiders/juejin_node/juejin_spider.js
 delete mode 100644 spiders/segmentfault/package.json

diff --git a/spiders/csdn/csdn_spider.js b/spiders/csdn/csdn_spider.js
new file mode 100644
index 00000000..c93c46eb
--- /dev/null
+++ b/spiders/csdn/csdn_spider.js
@@ -0,0 +1,82 @@
+const puppeteer = require('puppeteer');
+const MongoClient = require('mongodb').MongoClient;
+
+(async () => {
+    // browser
+    const browser = await (puppeteer.launch({
+        headless: true
+    }));
+
+    // define start url
+    const url = 'https://www.csdn.net';
+
+    // start a new page
+    const page = await browser.newPage();
+
+    // navigate to url
+    try {
+        await page.goto(url, {waitUntil: 'domcontentloaded'});
+        await page.waitFor(2000);
+    } catch (e) {
+        console.error(e);
+
+        // close browser
+        await browser.close();
+
+        // exit code 1 indicating an error happened
+        const code = 1;
+        process.exitCode = code;
+        process.exit(code);
+
+        return;
+    }
+
+    // scroll down to fetch more data
+    for (let i = 0; i < 100; i++) {
+        console.log('Pressing PageDown...');
+        await page.keyboard.press('PageDown', {delay: 200});
+        await page.waitFor(100);
+    }
+
+    // scrape data
+    const results = await page.evaluate(() => {
+        let results = [];
+        document.querySelectorAll('#feedlist_id > li').forEach(el => {
+            const $a = el.querySelector('.title > h2 > a');
+            if (!$a) return;
+            results.push({
+                url: $a.getAttribute('href'),
+                title: $a.innerText
+            });
+        });
+        return results;
+    });
+
+    // open database connection
+    const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
+    const db = client.db('crawlab_test');
+    const colName = process.env.CRAWLAB_COLLECTION || 'results_csdn';
+    const taskId = process.env.CRAWLAB_TASK_ID;
+    const col = db.collection(colName);
+
+    // save to database
+    for (let i = 0; i < results.length; i++) {
+        // de-duplication by url
+        const r = await col.findOne({url: results[i].url});
+        if (r) continue;
+
+        // assign taskId
+        results[i].task_id = taskId;
+
+        // insert row
+        await col.insertOne(results[i]);
+    }
+
+    console.log(`results.length: ${results.length}`);
+
+    // close database connection
+    await client.close();
+
+    // shutdown browser
+    await browser.close();
+})();
\ No newline at end of file
diff --git a/spiders/juejin_node/juejin_spider.js b/spiders/juejin_node/juejin_spider.js
new file mode 100644
index 00000000..f0a05900
--- /dev/null
+++ b/spiders/juejin_node/juejin_spider.js
@@ -0,0 +1,81 @@
+const puppeteer = require('puppeteer');
+const MongoClient = require('mongodb').MongoClient;
+
+(async () => {
+    // browser
+    const browser = await (puppeteer.launch({
+        headless: true
+    }));
+
+    // define start url
+    const url = 'https://juejin.im';
+
+    // start a new page
+    const page = await browser.newPage();
+
+    // navigate to url
+    try {
+        await page.goto(url, {waitUntil: 'domcontentloaded'});
+        await page.waitFor(2000);
+    } catch (e) {
+        console.error(e);
+
+        // close browser
+        await browser.close();
+
+        // exit code 1 indicating an error happened
+        const code = 1;
+        process.exitCode = code;
+        process.exit(code);
+
+        return;
+    }
+
+    // scroll down to fetch more data
+    for (let i = 0; i < 100; i++) {
+        console.log('Pressing PageDown...');
+        await page.keyboard.press('PageDown', {delay: 200});
+        await page.waitFor(100);
+    }
+
+    // scrape data
+    const results = await page.evaluate(() => {
+        let results = [];
+        document.querySelectorAll('.entry-list > .item').forEach(el => {
+            if (!el.querySelector('.title')) return;
+            results.push({
+                url: 'https://juejin.com' + el.querySelector('.title').getAttribute('href'),
+                title: el.querySelector('.title').innerText
+            });
+        });
+        return results;
+    });
+
+    // open database connection
+    const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
+    const db = client.db('crawlab_test');
+    const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
+    const taskId = process.env.CRAWLAB_TASK_ID;
+    const col = db.collection(colName);
+
+    // save to database
+    for (let i = 0; i < results.length; i++) {
+        // de-duplication by url
+        const r = await col.findOne({url: results[i].url});
+        if (r) continue;
+
+        // assign taskId
+        results[i].task_id = taskId;
+
+        // insert row
+        await col.insertOne(results[i]);
+    }
+
+    console.log(`results.length: ${results.length}`);
+
+    // close database connection
+    await client.close();
+
+    // shutdown browser
+    await browser.close();
+})();
\ No newline at end of file
diff --git a/spiders/segmentfault/package.json b/spiders/segmentfault/package.json
deleted file mode 100644
index 65a0f774..00000000
--- a/spiders/segmentfault/package.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "name": "segmentfault",
-  "version": "1.0.0",
-  "description": "",
-  "main": "segmentfault_spider.js",
-  "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
-  },
-  "author": "",
-  "license": "ISC",
-  "dependencies": {
-    "mongodb": "^3.1.13",
-    "puppeteer": "^1.13.0"
-  }
-}
diff --git a/spiders/segmentfault/segmentfault_spider.js b/spiders/segmentfault/segmentfault_spider.js
index d71d0762..e7b57e8c 100644
--- a/spiders/segmentfault/segmentfault_spider.js
+++ b/spiders/segmentfault/segmentfault_spider.js
@@ -4,7 +4,7 @@ const MongoClient = require('mongodb').MongoClient;
 (async () => {
     // browser
     const browser = await (puppeteer.launch({
-        timeout: 10000
+        headless: true
     }));
 
     // define start url
@@ -58,11 +58,17 @@ const MongoClient = require('mongodb').MongoClient;
     const col = db.collection(colName);
 
     // save to database
-    await results.forEach(d => {
-        d.task_id = taskId;
-        console.log(d);
-        col.insertOne(d);
-    });
+    for (let i = 0; i < results.length; i++) {
+        // de-duplication by url
+        const r = await col.findOne({url: results[i].url});
+        if (r) continue;
+
+        // assign taskId
+        results[i].task_id = taskId;
+
+        // insert row
+        await col.insertOne(results[i]);
+    }
 
     console.log(`results.length: ${results.length}`);
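
For reference, the findOne + insertOne de-duplication loop used in all three spiders can also be collapsed into a single upsert per row. The snippet below is a minimal sketch and not part of the patch: it assumes the same MongoDB address (192.168.99.100:27017), database (crawlab_test), and CRAWLAB_* environment variables as the spiders, and the results array shown is a hypothetical stand-in for the rows returned by page.evaluate().

const MongoClient = require('mongodb').MongoClient;

// Hypothetical sample rows standing in for the output of page.evaluate().
const results = [{url: 'https://example.com/post/1', title: 'Example post'}];
const taskId = process.env.CRAWLAB_TASK_ID;

(async () => {
    // connect to the same MongoDB instance the spiders assume
    const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
    const col = client.db('crawlab_test').collection(process.env.CRAWLAB_COLLECTION || 'results');

    for (let i = 0; i < results.length; i++) {
        // one round trip per row: insert when the url is new, refresh the fields otherwise
        await col.updateOne(
            {url: results[i].url},
            {$set: {title: results[i].title, task_id: taskId}},
            {upsert: true}
        );
    }

    await client.close();
})();

Compared with the read-then-insert loop, this halves the round trips per row; adding a unique index on url would also keep the de-duplication correct when several tasks write to the same collection concurrently.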