added article_spider.js

This commit is contained in:
Marvin Zhang
2019-03-20 20:58:27 +08:00
parent bbdec7c61e
commit cf17909482
5 changed files with 67 additions and 2 deletions

View File

@@ -45,7 +45,8 @@ class Scheduler(object):
day = cron_arr[3]
month = cron_arr[4]
day_of_week = cron_arr[5]
self.scheduler.add_job(func=self.execute_spider, trigger='cron', args=(str(spider['_id']),), jobstore='mongo',
self.scheduler.add_job(func=self.execute_spider, trigger='cron', args=(str(spider['_id']),),
jobstore='mongo',
day_of_week=day_of_week, month=month, day=day, hour=hour, minute=minute,
second=second)

View File

@@ -0,0 +1,61 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;

// CSS selector of the element whose innerHTML is stored as the article
// content, keyed by the item's `source` field. Items whose source is not
// listed here are skipped. A Map is used so that lookups never hit
// Object.prototype properties.
const ARTICLE_ANCHORS = new Map([
  ['juejin', '.article-content'],
  ['segmentfault', '.article'],
  ['csdn', '#content_views'],
]);

/**
 * Article spider.
 *
 * Reads every document in the `results` collection that has no `content`
 * field, opens its `url` in a headless browser, copies the innerHTML of the
 * source-specific anchor element into `item.content`, and writes the item to
 * the collection named by the CRAWLAB_COLLECTION environment variable
 * (falling back to `results`).
 *
 * A failure on a single article (navigation error, anchor not found) is
 * logged and that article is skipped; the crawl continues with the next one.
 */
(async () => {
  let browser;
  let client;
  try {
    // browser
    browser = await puppeteer.launch({
      headless: true,
    });

    // page
    const page = await browser.newPage();

    // open database connection
    client = await MongoClient.connect('mongodb://192.168.99.100:27017');
    const db = client.db('crawlab_test');
    const colName = process.env.CRAWLAB_COLLECTION || 'results';
    const col = db.collection(colName);
    const colSrc = db.collection('results');

    // only items that have not been given content yet
    const results = await colSrc.find({ content: { $exists: false } }).toArray();

    for (const item of results) {
      // define article anchor
      const anchor = ARTICLE_ANCHORS.get(item.source);
      if (anchor === undefined) {
        continue;
      }
      console.log(`anchor: ${anchor}`);

      // navigate to the article and scrape its content; $eval rejects when
      // the anchor is not present on the page, so it must be inside the try
      // or a single bad page would abort the whole run
      try {
        await page.goto(item.url, { waitUntil: 'domcontentloaded' });
        await page.waitFor(2000);
        item.content = await page.$eval(anchor, (el) => el.innerHTML);
      } catch (e) {
        console.error(e);
        continue;
      }

      // save to database: items come from find() and therefore carry an _id,
      // so an upserting replace keyed on _id stands in for the deprecated
      // Collection.save()
      await col.replaceOne({ _id: item._id }, item, { upsert: true });
      console.log(`saved item: ${JSON.stringify(item)}`);
    }
  } finally {
    // release resources even when the crawl fails part-way
    try {
      // close mongodb
      if (client) {
        await client.close();
      }
    } finally {
      // close browser
      if (browser) {
        await browser.close();
      }
    }
  }
})().catch((e) => {
  // surface fatal errors instead of leaving an unhandled rejection
  console.error(e);
  process.exitCode = 1;
});

View File

@@ -67,6 +67,7 @@ const MongoClient = require('mongodb').MongoClient;
// assign taskID
results[i].task_id = taskId;
results[i].source = 'csdn';
// insert row
await col.insertOne(results[i]);

View File

@@ -66,6 +66,7 @@ const MongoClient = require('mongodb').MongoClient;
// assign taskID
results[i].task_id = taskId;
results[i].source = 'juejin';
// insert row
await col.insertOne(results[i]);

View File

@@ -15,7 +15,7 @@ const MongoClient = require('mongodb').MongoClient;
// navigate to url
try {
await page.goto(url);
await page.goto(url, {waitUntil: 'domcontentloaded'});
await page.waitFor(2000);
} catch (e) {
console.error(e);
@@ -65,6 +65,7 @@ const MongoClient = require('mongodb').MongoClient;
// assign taskID
results[i].task_id = taskId;
results[i].source = 'segmentfault';
// insert row
await col.insertOne(results[i]);