mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-27 17:50:53 +01:00
added spiders
This commit is contained in:
@@ -1,15 +0,0 @@
|
||||
{
|
||||
"name": "segmentfault",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "segmentfault_spider.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"mongodb": "^3.1.13",
|
||||
"puppeteer": "^1.13.0"
|
||||
}
|
||||
}
|
||||
@@ -4,7 +4,7 @@ const MongoClient = require('mongodb').MongoClient;
|
||||
(async () => {
|
||||
// browser
|
||||
const browser = await (puppeteer.launch({
|
||||
timeout: 10000
|
||||
headless: true
|
||||
}));
|
||||
|
||||
// define start url
|
||||
@@ -58,11 +58,17 @@ const MongoClient = require('mongodb').MongoClient;
|
||||
const col = db.collection(colName);
|
||||
|
||||
// save to database
|
||||
await results.forEach(d => {
|
||||
d.task_id = taskId;
|
||||
console.log(d);
|
||||
col.insertOne(d);
|
||||
});
|
||||
for (let i = 0; i < results.length; i++) {
|
||||
// de-duplication
|
||||
const r = await col.findOne({url: results[i]});
|
||||
if (r) continue;
|
||||
|
||||
// assign taskID
|
||||
results[i].task_id = taskId;
|
||||
|
||||
// insert row
|
||||
await col.insertOne(results[i]);
|
||||
}
|
||||
|
||||
console.log(`results.length: ${results.length}`);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user