updated README.md

code cleanup
Marvin Zhang
2019-04-03 19:58:21 +08:00
parent 2895aebc01
commit 563ecea96f
92 changed files with 38 additions and 7829 deletions


@@ -1,61 +0,0 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;

(async () => {
    // launch a headless browser
    const browser = await puppeteer.launch({
        headless: true
    });
    // open a new page
    const page = await browser.newPage();
    // open database connection
    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
    const db = await client.db('crawlab_test');
    const colName = process.env.CRAWLAB_COLLECTION || 'results';
    const col = db.collection(colName);
    const col_src = db.collection('results');
    // fetch articles whose content has not been scraped yet
    const results = await col_src.find({content: {$exists: false}}).toArray();
    for (let i = 0; i < results.length; i++) {
        let item = results[i];
        // define the article body selector for each source site
        let anchor;
        if (item.source === 'juejin') {
            anchor = '.article-content';
        } else if (item.source === 'segmentfault') {
            anchor = '.article';
        } else if (item.source === 'csdn') {
            anchor = '#content_views';
        } else {
            continue;
        }
        console.log(`anchor: ${anchor}`);
        // navigate to the article
        try {
            await page.goto(item.url, {waitUntil: 'domcontentloaded'});
            await page.waitFor(2000);
        } catch (e) {
            console.error(e);
            continue;
        }
        // scrape article content
        item.content = await page.$eval(anchor, el => el.innerHTML);
        // save to database
        await col.save(item);
        console.log(`saved item: ${JSON.stringify(item)}`);
    }
    // close mongodb connection
    await client.close();
    // close browser
    await browser.close();
})();


@@ -1,83 +0,0 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;

(async () => {
    // launch a headless browser
    const browser = await puppeteer.launch({
        headless: true
    });
    // define start url
    const url = 'https://www.csdn.net';
    // start a new page
    const page = await browser.newPage();
    // navigate to url
    try {
        await page.goto(url, {waitUntil: 'domcontentloaded'});
        await page.waitFor(2000);
    } catch (e) {
        console.error(e);
        // close browser and exit with code 1 to indicate an error
        await browser.close();
        process.exit(1);
    }
    // scroll down to fetch more data
    for (let i = 0; i < 100; i++) {
        console.log('Pressing PageDown...');
        await page.keyboard.press('PageDown', {delay: 200});
        await page.waitFor(100);
    }
    // scrape the feed items on the page
    const results = await page.evaluate(() => {
        let results = [];
        document.querySelectorAll('#feedlist_id > li').forEach(el => {
            const $a = el.querySelector('.title > h2 > a');
            if (!$a) return;
            results.push({
                url: $a.getAttribute('href'),
                title: $a.innerText
            });
        });
        return results;
    });
    // open database connection
    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
    const db = await client.db('crawlab_test');
    const colName = process.env.CRAWLAB_COLLECTION || 'results_csdn';
    const taskId = process.env.CRAWLAB_TASK_ID;
    const col = db.collection(colName);
    // save to database
    for (let i = 0; i < results.length; i++) {
        // skip items whose url is already stored (de-duplication)
        const r = await col.findOne({url: results[i].url});
        if (r) continue;
        // assign task id and source
        results[i].task_id = taskId;
        results[i].source = 'csdn';
        // insert row
        await col.insertOne(results[i]);
    }
    console.log(`results.length: ${results.length}`);
    // close database connection
    await client.close();
    // close browser
    await browser.close();
})();


@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
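
For reference, a minimal spider in this package might look like the sketch below; the spider name, start URL, and selectors are illustrative assumptions, not part of this project.

import scrapy


class ExampleSpider(scrapy.Spider):
    # hypothetical name and start URL, for illustration only
    name = 'example'
    start_urls = ['https://example.com']

    def parse(self, response):
        # yield one item per link on the page
        for a in response.css('a'):
            yield {
                'url': a.css('::attr(href)').extract_first(),
                'title': a.css('::text').extract_first(),
            }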


@@ -1,82 +0,0 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;

(async () => {
    // launch a headless browser
    const browser = await puppeteer.launch({
        headless: true
    });
    // define start url
    const url = 'https://juejin.im';
    // start a new page
    const page = await browser.newPage();
    // navigate to url
    try {
        await page.goto(url, {waitUntil: 'domcontentloaded'});
        await page.waitFor(2000);
    } catch (e) {
        console.error(e);
        // close browser and exit with code 1 to indicate an error
        await browser.close();
        process.exit(1);
    }
    // scroll down to fetch more data
    for (let i = 0; i < 100; i++) {
        console.log('Pressing PageDown...');
        await page.keyboard.press('PageDown', {delay: 200});
        await page.waitFor(100);
    }
    // scrape the entry list on the page
    const results = await page.evaluate(() => {
        let results = [];
        document.querySelectorAll('.entry-list > .item').forEach(el => {
            if (!el.querySelector('.title')) return;
            results.push({
                url: 'https://juejin.com' + el.querySelector('.title').getAttribute('href'),
                title: el.querySelector('.title').innerText
            });
        });
        return results;
    });
    // open database connection
    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
    const db = await client.db('crawlab_test');
    const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
    const taskId = process.env.CRAWLAB_TASK_ID;
    const col = db.collection(colName);
    // save to database
    for (let i = 0; i < results.length; i++) {
        // skip items whose url is already stored (de-duplication)
        const r = await col.findOne({url: results[i].url});
        if (r) continue;
        // assign task id and source
        results[i].task_id = taskId;
        results[i].source = 'juejin';
        // insert row
        await col.insertOne(results[i]);
    }
    console.log(`results.length: ${results.length}`);
    // close database connection
    await client.close();
    // close browser
    await browser.close();
})();


@@ -1,81 +0,0 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;

(async () => {
    // launch a headless browser
    const browser = await puppeteer.launch({
        headless: true
    });
    // define start url
    const url = 'https://segmentfault.com/newest';
    // start a new page
    const page = await browser.newPage();
    // navigate to url
    try {
        await page.goto(url, {waitUntil: 'domcontentloaded'});
        await page.waitFor(2000);
    } catch (e) {
        console.error(e);
        // close browser and exit with code 1 to indicate an error
        await browser.close();
        process.exit(1);
    }
    // scroll down to fetch more data
    for (let i = 0; i < 10; i++) {
        console.log('Pressing PageDown...');
        await page.keyboard.press('PageDown', {delay: 200});
        await page.waitFor(500);
    }
    // scrape the news list on the page
    const results = await page.evaluate(() => {
        let results = [];
        document.querySelectorAll('.news-list .news-item').forEach(el => {
            results.push({
                url: 'https://segmentfault.com' + el.querySelector('.news__item-info > a').getAttribute('href'),
                title: el.querySelector('.news__item-title').innerText
            });
        });
        return results;
    });
    // open database connection
    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
    const db = await client.db('crawlab_test');
    const colName = process.env.CRAWLAB_COLLECTION || 'results_segmentfault';
    const taskId = process.env.CRAWLAB_TASK_ID;
    const col = db.collection(colName);
    // save to database
    for (let i = 0; i < results.length; i++) {
        // skip items whose url is already stored (de-duplication)
        const r = await col.findOne({url: results[i].url});
        if (r) continue;
        // assign task id and source
        results[i].task_id = taskId;
        results[i].source = 'segmentfault';
        // insert row
        await col.insertOne(results[i]);
    }
    console.log(`results.length: ${results.length}`);
    // close database connection
    await client.close();
    // close browser
    await browser.close();
})();