crawlab/spiders/segmentfault/segmentfault_spider.js

const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;
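
// CRAWLAB_COLLECTION and CRAWLAB_TASK_ID are read from the environment
// below; Crawlab sets both when it runs the task (export them by hand
// for a local run)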
(async () => {
    // launch a headless browser (timeout is the max ms to wait for it to start)
    const browser = await puppeteer.launch({
        timeout: 15000
    });
    // define start url
    const url = 'https://segmentfault.com/newest';
    // start a new page
    const page = await browser.newPage();
    // navigate to the url and give the page time to render
    await page.goto(url);
    // waitFor was current in 2019; newer Puppeteer versions renamed it waitForTimeout
    await page.waitFor(2000);
    // take a screenshot
    await page.screenshot({path: 'screenshot.png'});
    // scrape data
    const results = await page.evaluate(() => {
        let results = [];
        document.querySelectorAll('.news-list .news-item .news__item-title').forEach(el => {
            results.push({
                title: el.innerText
            });
        });
        return results;
    });
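    // note: these selectors match SegmentFault's list markup at the time of
    // writing; if the site renames its classes, results will simply be empty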
    // open database connection
    const client = await MongoClient.connect('mongodb://localhost/crawlab_test');
    // client.db() is synchronous; 'test' overrides the default database in the connection string
    const db = client.db('test');
    const colName = process.env.CRAWLAB_COLLECTION;
    const taskId = process.env.CRAWLAB_TASK_ID;
    const col = db.collection(colName);
// save to database
await results.forEach(d => {
d.task_id = taskId;
col.save(d);
});
    // close database connection (client.close(), not db.close(), in the 3.x driver)
    await client.close();
    console.log(results);
    // shutdown browser
    await browser.close();
})();
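
// a minimal sketch of a local run, assuming MongoDB is listening on
// localhost and the Crawlab variables are set by hand (the collection
// name here is illustrative):
//
//   CRAWLAB_COLLECTION=results \
//   CRAWLAB_TASK_ID=local-test \
//   node segmentfault_spider.js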