// crawlab/spiders/csdn/csdn_spider.js
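// This spider opens the CSDN homepage with Puppeteer, scrolls down to load
// more feed items, scrapes each item's url and title, and stores the results
// in MongoDB (de-duplicated by url).
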
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;
const MONGO_HOST = process.env.MONGO_HOST;
const MONGO_PORT = process.env.MONGO_PORT;
const MONGO_DB = process.env.MONGO_DB;
(async () => {
  // launch a headless browser
  const browser = await puppeteer.launch({
    headless: true
  });
  // define start url
  const url = 'https://www.csdn.net';
  // start a new page
  const page = await browser.newPage();
  // navigate to url
  try {
    await page.goto(url, {waitUntil: 'domcontentloaded'});
    // fixed delay to let the page settle (Puppeteer's waitFor; renamed
    // waitForTimeout in later versions)
    await page.waitFor(2000);
  } catch (e) {
    console.error(e);
    // close browser before exiting
    await browser.close();
    // exit code 1 indicating an error happened
    process.exit(1);
  }
  // scroll down to trigger lazy-loading of more feed items
  for (let i = 0; i < 100; i++) {
    console.log('Pressing PageDown...');
    await page.keyboard.press('PageDown', { delay: 200 });
    await page.waitFor(100);
  }
  // scrape data from the feed list
  const results = await page.evaluate(() => {
    let results = [];
    // note: these selectors match CSDN's homepage markup as of this file's
    // date and may need updating if the site changes
    document.querySelectorAll('#feedlist_id > li').forEach(el => {
      const $a = el.querySelector('.title > h2 > a');
      if (!$a) return;
      results.push({
        url: $a.getAttribute('href'),
        title: $a.innerText
      });
    });
    return results;
  });
  // open database connection
  const client = await MongoClient.connect(`mongodb://${MONGO_HOST}:${MONGO_PORT}`);
  const db = client.db(MONGO_DB);
  const colName = process.env.CRAWLAB_COLLECTION || 'results_csdn';
  const taskId = process.env.CRAWLAB_TASK_ID;
  const col = db.collection(colName);
  // save to database
  for (let i = 0; i < results.length; i++) {
    // de-duplication: skip items whose url is already stored
    const r = await col.findOne({url: results[i].url});
    if (r) continue;
    // assign task id and source
    results[i].task_id = taskId;
    results[i].source = 'csdn';
    // insert row
    await col.insertOne(results[i]);
  }
  console.log(`results.length: ${results.length}`);
  // close database connection
  await client.close();
  // shutdown browser
  await browser.close();
})();
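
// Usage sketch (assumes Node.js with the puppeteer and mongodb packages
// installed, e.g. `npm install puppeteer mongodb`; the host/port/db values
// below are illustrative):
//   MONGO_HOST=localhost MONGO_PORT=27017 MONGO_DB=crawlab node csdn_spider.js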