// Mirror of https://github.com/crawlab-team/crawlab.git
// Synced 2026-01-22 17:31:03 +01:00
// 61 lines, 1.5 KiB, JavaScript
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;

// CSS selector of the article body, keyed by the item's `source` field.
// Items whose source is not listed here are skipped.
const ARTICLE_ANCHORS = new Map([
  ['juejin', '.article-content'],
  ['segmentfault', '.article'],
  ['csdn', '#content_views'],
]);

// Fixed pause after navigation, before the article body is read.
const RENDER_DELAY_MS = 2000;

/**
 * Resolves after the given number of milliseconds.
 *
 * Used instead of Puppeteer's `page.waitFor`, which is deprecated and absent
 * from later Puppeteer releases.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Script entry point.
 *
 * Reads every document in the `results` collection of the `crawlab_test`
 * database that has no `content` field, opens the document's `url` in a
 * headless browser, copies the inner HTML of the source-specific article
 * element into `item.content`, and writes the item to the collection named by
 * the CRAWLAB_COLLECTION environment variable (default: `results`).
 *
 * A failure on a single article (navigation error or missing selector) is
 * logged and that article is skipped; the browser and the database connection
 * are always closed, even when the run fails.
 */
(async () => {
  const browser = await puppeteer.launch({ headless: true });
  let client;

  try {
    const page = await browser.newPage();

    // open database connection
    client = await MongoClient.connect('mongodb://127.0.0.1:27017');
    const db = client.db('crawlab_test');
    const colName = process.env.CRAWLAB_COLLECTION || 'results';
    const col = db.collection(colName);
    const colSrc = db.collection('results');

    const results = await colSrc.find({ content: { $exists: false } }).toArray();

    for (const item of results) {
      // define article anchor
      const anchor = ARTICLE_ANCHORS.get(item.source);
      if (anchor === undefined) {
        continue;
      }

      console.log(`anchor: ${anchor}`);

      // navigate to the article and scrape its content; $eval rejects when the
      // selector matches nothing, so it is guarded together with navigation
      try {
        await page.goto(item.url, { waitUntil: 'domcontentloaded' });
        await sleep(RENDER_DELAY_MS);
        item.content = await page.$eval(anchor, (el) => el.innerHTML);
      } catch (e) {
        console.error(e);
        continue;
      }

      // save to database: upsert by _id, which is what the removed
      // Collection.save did for a document that already carries an _id
      await col.replaceOne({ _id: item._id }, item, { upsert: true });
      console.log(`saved item: ${JSON.stringify(item)}`);
    }
  } finally {
    // close mongodb first, then the browser; the nested finally guarantees the
    // browser is closed even if closing the database connection throws
    try {
      if (client) {
        await client.close();
      }
    } finally {
      await browser.close();
    }
  }
})().catch((e) => {
  // Report fatal errors and signal failure to the caller without cutting off
  // pending output.
  console.error(e);
  process.exitCode = 1;
});