const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;

// CSS selector for the article body, keyed by the `source` tag that the
// per-site spiders (csdn/juejin/segmentfault) stamp onto each result row.
const ANCHORS = Object.freeze({
  juejin: '.article-content',
  segmentfault: '.article',
  csdn: '#content_views',
});

// Backfill spider: for every result row that has no `content` yet, open its
// URL in headless Chrome, scrape the article HTML, and write it back.
(async () => {
  // browser
  const browser = await puppeteer.launch({
    headless: true,
  });

  // open database connection
  const client = await MongoClient.connect('mongodb://192.168.99.100:27017');

  try {
    // page
    const page = await browser.newPage();

    // client.db() is synchronous in the Node driver — no await needed
    const db = client.db('crawlab_test');
    const colName = process.env.CRAWLAB_COLLECTION || 'results';
    const col = db.collection(colName);
    const colSrc = db.collection('results');

    // only rows whose article body has not been fetched yet
    const results = await colSrc.find({ content: { $exists: false } }).toArray();
    for (const item of results) {
      // define article anchor; skip rows from unknown sources
      const anchor = ANCHORS[item.source];
      if (!anchor) continue;

      console.log(`anchor: ${anchor}`);

      // navigate to the article; tolerate dead links / timeouts
      try {
        await page.goto(item.url, { waitUntil: 'domcontentloaded' });
        await page.waitFor(2000); // let client-side rendering settle
      } catch (e) {
        console.error(e);
        continue;
      }

      // scrape article content; the selector can be absent on error/paywall
      // pages, so a miss skips the row instead of crashing the whole run
      try {
        item.content = await page.$eval(anchor, (el) => el.innerHTML);
      } catch (e) {
        console.error(e);
        continue;
      }

      // save to database (upsert by _id; `save` is deprecated in newer
      // drivers — replaceOne({_id}, item, {upsert: true}) is the successor)
      await col.save(item);
      console.log(`saved item: ${JSON.stringify(item)}`);
    }
  } finally {
    // always release resources, even if scraping throws mid-loop

    // close mongodb
    await client.close();

    // close browser
    await browser.close();
  }
})();
catch (e) { console.error(e); @@ -65,6 +65,7 @@ const MongoClient = require('mongodb').MongoClient; // assign taskID results[i].task_id = taskId; + results[i].source = 'segmentfault'; // insert row await col.insertOne(results[i]);