mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
80 lines
1.9 KiB
JavaScript
80 lines
1.9 KiB
JavaScript
const puppeteer = require('puppeteer');
|
|
const MongoClient = require('mongodb').MongoClient;
|
|
|
|
(async () => {
|
|
// browser
|
|
const browser = await (puppeteer.launch({
|
|
headless: true
|
|
}));
|
|
|
|
// define start url
|
|
const url = 'https://segmentfault.com/newest';
|
|
|
|
// start a new page
|
|
const page = await browser.newPage();
|
|
|
|
// navigate to url
|
|
try {
|
|
await page.goto(url);
|
|
await page.waitFor(2000);
|
|
} catch (e) {
|
|
console.error(e);
|
|
|
|
// close browser
|
|
browser.close();
|
|
|
|
// exit code 1 indicating an error happened
|
|
code = 1;
|
|
process.emit("exit ");
|
|
process.reallyExit(code);
|
|
|
|
return
|
|
}
|
|
|
|
// scroll down to fetch more data
|
|
for (let i = 0; i < 10; i++) {
|
|
console.log('Pressing PageDown...');
|
|
await page.keyboard.press('PageDown', 200);
|
|
await page.waitFor(500);
|
|
}
|
|
|
|
// scrape data
|
|
const results = await page.evaluate(() => {
|
|
let results = [];
|
|
document.querySelectorAll('.news-list .news-item').forEach(el => {
|
|
results.push({
|
|
url: 'https://segmentfault.com' + el.querySelector('.news__item-info > a').getAttribute('href'),
|
|
title: el.querySelector('.news__item-title').innerText
|
|
})
|
|
});
|
|
return results;
|
|
});
|
|
|
|
// open database connection
|
|
const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
|
|
let db = await client.db('crawlab_test');
|
|
const colName = process.env.CRAWLAB_COLLECTION || 'results_segmentfault';
|
|
const taskId = process.env.CRAWLAB_TASK_ID;
|
|
const col = db.collection(colName);
|
|
|
|
// save to database
|
|
for (let i = 0; i < results.length; i++) {
|
|
// de-duplication
|
|
const r = await col.findOne({url: results[i]});
|
|
if (r) continue;
|
|
|
|
// assign taskID
|
|
results[i].task_id = taskId;
|
|
|
|
// insert row
|
|
await col.insertOne(results[i]);
|
|
}
|
|
|
|
console.log(`results.length: ${results.length}`);
|
|
|
|
// close database connection
|
|
client.close();
|
|
|
|
// shutdown browser
|
|
browser.close();
|
|
})(); |