updated README.md
code cleanup
@@ -1,61 +0,0 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;

(async () => {
    // launch a headless browser
    const browser = await puppeteer.launch({
        headless: true
    });

    // open a new page
    const page = await browser.newPage();

    // open database connection
    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
    const db = client.db('crawlab_test');
    const colName = process.env.CRAWLAB_COLLECTION || 'results';
    const col = db.collection(colName);
    const col_src = db.collection('results');

    // fetch items that have no article content yet
    const results = await col_src.find({content: {$exists: false}}).toArray();
    for (let i = 0; i < results.length; i++) {
        const item = results[i];

        // pick the article content selector for the item's source site
        let anchor;
        if (item.source === 'juejin') {
            anchor = '.article-content';
        } else if (item.source === 'segmentfault') {
            anchor = '.article';
        } else if (item.source === 'csdn') {
            anchor = '#content_views';
        } else {
            continue;
        }

        console.log(`anchor: ${anchor}`);

        // navigate to the article
        try {
            await page.goto(item.url, {waitUntil: 'domcontentloaded'});
            await page.waitFor(2000);
        } catch (e) {
            console.error(e);
            continue;
        }

        // scrape article content
        item.content = await page.$eval(anchor, el => el.innerHTML);

        // save to database (Collection.save is deprecated; upsert instead)
        await col.replaceOne({_id: item._id}, item, {upsert: true});
        console.log(`saved item: ${JSON.stringify(item)}`);
    }

    // close database connection
    await client.close();

    // close browser
    await browser.close();
})();
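The if/else chain above has to grow by one branch for every new source site. A minimal lookup-table sketch of the same mapping (the map entries mirror the three sources in the script; anchorFor is a hypothetical helper, not part of the original):

// map each source site to the CSS selector of its article body
const ANCHORS = {
    juejin: '.article-content',
    segmentfault: '.article',
    csdn: '#content_views'
};

// returns the selector for a source, or null for unknown sources
function anchorFor(source) {
    return ANCHORS[source] || null;
}

console.log(anchorFor('csdn'));   // '#content_views'
console.log(anchorFor('weibo'));  // null, so the caller skips the item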
@@ -1,83 +0,0 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;

(async () => {
    // launch a headless browser
    const browser = await puppeteer.launch({
        headless: true
    });

    // define start url
    const url = 'https://www.csdn.net';

    // start a new page
    const page = await browser.newPage();

    // navigate to url
    try {
        await page.goto(url, {waitUntil: 'domcontentloaded'});
        await page.waitFor(2000);
    } catch (e) {
        console.error(e);

        // close browser, then exit with code 1 to signal an error
        await browser.close();
        process.exit(1);
    }

    // scroll down to fetch more data
    for (let i = 0; i < 100; i++) {
        console.log('Pressing PageDown...');
        await page.keyboard.press('PageDown', {delay: 200});
        await page.waitFor(100);
    }

    // scrape data
    const results = await page.evaluate(() => {
        let results = [];
        document.querySelectorAll('#feedlist_id > li').forEach(el => {
            const $a = el.querySelector('.title > h2 > a');
            if (!$a) return;
            results.push({
                url: $a.getAttribute('href'),
                title: $a.innerText
            });
        });
        return results;
    });

    // open database connection
    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
    const db = client.db('crawlab_test');
    const colName = process.env.CRAWLAB_COLLECTION || 'results_csdn';
    const taskId = process.env.CRAWLAB_TASK_ID;
    const col = db.collection(colName);

    // save to database
    for (let i = 0; i < results.length; i++) {
        // de-duplicate by url
        const r = await col.findOne({url: results[i].url});
        if (r) continue;

        // assign task id and source
        results[i].task_id = taskId;
        results[i].source = 'csdn';

        // insert row
        await col.insertOne(results[i]);
    }

    console.log(`results.length: ${results.length}`);

    // close database connection
    await client.close();

    // close browser
    await browser.close();
})();
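The findOne-then-insertOne loop above costs two round trips per item and can still write duplicates when two tasks run concurrently. A hedged alternative sketch, assuming a unique index on url is acceptable for the collection (saveResults is a hypothetical helper; the connection string, database, and collection names mirror the spider's):

const MongoClient = require('mongodb').MongoClient;

async function saveResults(results, taskId) {
    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
    const col = client.db('crawlab_test').collection('results_csdn');

    // a unique index lets MongoDB reject duplicate urls for us
    await col.createIndex({url: 1}, {unique: true});

    const docs = results.map(r => ({...r, task_id: taskId, source: 'csdn'}));
    try {
        // unordered insert keeps going past duplicate-key errors
        if (docs.length > 0) await col.insertMany(docs, {ordered: false});
    } catch (e) {
        if (e.code !== 11000) throw e; // 11000 = duplicate key, safe to ignore
    }
    await client.close();
}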
@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -1,82 +0,0 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;

(async () => {
    // launch a headless browser
    const browser = await puppeteer.launch({
        headless: true
    });

    // define start url
    const url = 'https://juejin.im';

    // start a new page
    const page = await browser.newPage();

    // navigate to url
    try {
        await page.goto(url, {waitUntil: 'domcontentloaded'});
        await page.waitFor(2000);
    } catch (e) {
        console.error(e);

        // close browser, then exit with code 1 to signal an error
        await browser.close();
        process.exit(1);
    }

    // scroll down to fetch more data
    for (let i = 0; i < 100; i++) {
        console.log('Pressing PageDown...');
        await page.keyboard.press('PageDown', {delay: 200});
        await page.waitFor(100);
    }

    // scrape data
    const results = await page.evaluate(() => {
        let results = [];
        document.querySelectorAll('.entry-list > .item').forEach(el => {
            const $title = el.querySelector('.title');
            if (!$title) return;
            results.push({
                url: 'https://juejin.com' + $title.getAttribute('href'),
                title: $title.innerText
            });
        });
        return results;
    });

    // open database connection
    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
    const db = client.db('crawlab_test');
    const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
    const taskId = process.env.CRAWLAB_TASK_ID;
    const col = db.collection(colName);

    // save to database
    for (let i = 0; i < results.length; i++) {
        // de-duplicate by url
        const r = await col.findOne({url: results[i].url});
        if (r) continue;

        // assign task id and source
        results[i].task_id = taskId;
        results[i].source = 'juejin';

        // insert row
        await col.insertOne(results[i]);
    }

    console.log(`results.length: ${results.length}`);

    // close database connection
    await client.close();

    // close browser
    await browser.close();
})();
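A fixed count of 100 PageDown presses either wastes time on short feeds or stops too early on long ones. A sketch of a count-based loop instead, using the same legacy Puppeteer wait API as the spiders (scrollUntilStable and its round cap are illustrative assumptions):

// scroll until the feed stops growing, with a hard cap on rounds
async function scrollUntilStable(page, itemSelector, maxRounds = 100) {
    let lastCount = 0;
    for (let i = 0; i < maxRounds; i++) {
        await page.keyboard.press('PageDown');
        await page.waitFor(500);
        const count = await page.evaluate(
            sel => document.querySelectorAll(sel).length,
            itemSelector
        );
        if (count === lastCount) break; // no new items loaded
        lastCount = count;
    }
}

// usage, e.g. for the juejin feed:
// await scrollUntilStable(page, '.entry-list > .item');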
@@ -1,81 +0,0 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;

(async () => {
    // launch a headless browser
    const browser = await puppeteer.launch({
        headless: true
    });

    // define start url
    const url = 'https://segmentfault.com/newest';

    // start a new page
    const page = await browser.newPage();

    // navigate to url
    try {
        await page.goto(url, {waitUntil: 'domcontentloaded'});
        await page.waitFor(2000);
    } catch (e) {
        console.error(e);

        // close browser, then exit with code 1 to signal an error
        await browser.close();
        process.exit(1);
    }

    // scroll down to fetch more data
    for (let i = 0; i < 10; i++) {
        console.log('Pressing PageDown...');
        await page.keyboard.press('PageDown', {delay: 200});
        await page.waitFor(500);
    }

    // scrape data
    const results = await page.evaluate(() => {
        let results = [];
        document.querySelectorAll('.news-list .news-item').forEach(el => {
            const $a = el.querySelector('.news__item-info > a');
            const $title = el.querySelector('.news__item-title');
            if (!$a || !$title) return;
            results.push({
                url: 'https://segmentfault.com' + $a.getAttribute('href'),
                title: $title.innerText
            });
        });
        return results;
    });

    // open database connection
    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
    const db = client.db('crawlab_test');
    const colName = process.env.CRAWLAB_COLLECTION || 'results_segmentfault';
    const taskId = process.env.CRAWLAB_TASK_ID;
    const col = db.collection(colName);

    // save to database
    for (let i = 0; i < results.length; i++) {
        // de-duplicate by url
        const r = await col.findOne({url: results[i].url});
        if (r) continue;

        // assign task id and source
        results[i].task_id = taskId;
        results[i].source = 'segmentfault';

        // insert row
        await col.insertOne(results[i]);
    }

    console.log(`results.length: ${results.length}`);

    // close database connection
    await client.close();

    // close browser
    await browser.close();
})();
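The three list spiders differ only in start url, item selectors, and source tag; everything else is copy-pasted. A sketch of the shared skeleton factored out behind a config object (the field names and scrapeList are assumptions for illustration, not part of the original scripts):

// one config per site; navigation and extraction are shared
const SITES = {
    csdn: {
        url: 'https://www.csdn.net',
        item: '#feedlist_id > li',
        link: '.title > h2 > a',
        title: '.title > h2 > a'
    },
    segmentfault: {
        url: 'https://segmentfault.com/newest',
        item: '.news-list .news-item',
        link: '.news__item-info > a',
        title: '.news__item-title'
    }
};

async function scrapeList(page, source) {
    const cfg = SITES[source];
    await page.goto(cfg.url, {waitUntil: 'domcontentloaded'});
    return page.evaluate(cfg => {
        const out = [];
        document.querySelectorAll(cfg.item).forEach(el => {
            const $a = el.querySelector(cfg.link);
            const $t = el.querySelector(cfg.title);
            if ($a && $t) out.push({url: $a.getAttribute('href'), title: $t.innerText});
        });
        return out;
    }, cfg);
}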