added spiders

This commit is contained in:
Marvin Zhang
2019-03-15 20:22:09 +08:00
parent 53a80046d1
commit 9aeff8181c
4 changed files with 175 additions and 21 deletions

View File

@@ -1,15 +0,0 @@
{
"name": "segmentfault",
"version": "1.0.0",
"description": "",
"main": "segmentfault_spider.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"mongodb": "^3.1.13",
"puppeteer": "^1.13.0"
}
}

View File

@@ -4,7 +4,7 @@ const MongoClient = require('mongodb').MongoClient;
(async () => {
// browser
const browser = await (puppeteer.launch({
timeout: 10000
headless: true
}));
// define start url
@@ -58,11 +58,17 @@ const MongoClient = require('mongodb').MongoClient;
const col = db.collection(colName);
// save to database
await results.forEach(d => {
d.task_id = taskId;
console.log(d);
col.insertOne(d);
});
// Persist the scraped results one at a time, skipping rows already stored.
// The awaits are deliberately sequential: each insert finishes before the
// next lookup runs, so duplicates within the same batch are caught as well.
for (const item of results) {
  // de-duplication: look the row up by its url field. Passing the whole
  // result object as the value ({url: results[i]}) only matches documents
  // whose url field equals that entire object, so the check never fired.
  // NOTE(review): assumes every result carries a `url` property — confirm
  // against the code that builds `results`.
  const existing = await col.findOne({url: item.url});
  if (existing) continue;
  // assign taskID
  item.task_id = taskId;
  // insert row
  await col.insertOne(item);
}
console.log(`results.length: ${results.length}`);