mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-21 17:21:09 +01:00
added jd_mask_spider
This commit is contained in:
5
backend/template/spiders/jd_mask/Spiderfile
Normal file
5
backend/template/spiders/jd_mask/Spiderfile
Normal file
@@ -0,0 +1,5 @@
|
||||
name: "jd_mask"
|
||||
display_name: "京东口罩 (Puppeteer)"
|
||||
col: "results_jd"
|
||||
type: "customized"
|
||||
cmd: "node jd_mask_spider.js"
|
||||
84
backend/template/spiders/jd_mask/jd_mask_spider.js
Normal file
84
backend/template/spiders/jd_mask/jd_mask_spider.js
Normal file
@@ -0,0 +1,84 @@
|
||||
const crawlab = require('crawlab-sdk');
|
||||
const PCR = require('puppeteer-chromium-resolver');
|
||||
|
||||
/**
 * Opens a product detail page and reports whether the product is in stock.
 *
 * Navigates `page` to `url`, waits for the `#choose-btns` element to appear,
 * pauses briefly, and then inspects the DOM.
 *
 * @param {object} page - Puppeteer page used for the navigation (it is left on `url`).
 * @param {string} url - Absolute URL of the product detail page.
 * @returns {Promise<boolean>} `true` when no `.J-notify-stock` element exists on the page.
 */
const crawlDetail = async (page, url) => {
  await page.goto(url);
  await page.waitForSelector('#choose-btns');
  // Short fixed pause after the buttons appear, before reading the DOM.
  await page.waitFor(500);

  // Presumably `.J-notify-stock` is the "notify me when available" control;
  // the product counts as in stock only when that element is absent.
  return page.evaluate(() => document.querySelector('.J-notify-stock') === null);
};
|
||||
|
||||
/**
 * Scrapes every product tile on the listing currently shown in `page`,
 * checks each product's stock status and saves the result.
 *
 * Each saved item has the shape `{ title, url, has_stock }`.
 *
 * Detail pages are visited in a separate tab so that `page` stays on the
 * listing. Navigating `page` itself would leave it on the last product page,
 * and a caller looking for the `.pn-next` pagination link afterwards would
 * not find it.
 *
 * @param {object} page - Puppeteer page showing a product listing.
 * @returns {Promise<void>} Resolves once all items are saved and a final
 *   one-second pause has elapsed.
 */
const crawlPage = async (page) => {
  // Runs inside the browser context: collect title/link pairs from the tiles.
  const items = await page.evaluate(() => {
    const found = [];
    document.querySelectorAll('.gl-item').forEach((el) => {
      const link = el.querySelector('.p-name > a');
      // Skip tiles without a product link instead of throwing on `null`
      // and aborting the whole listing.
      if (!link) return;
      found.push({
        title: link.getAttribute('title'),
        url: `https:${link.getAttribute('href')}`,
      });
    });
    return found;
  });

  if (items.length > 0) {
    // Dedicated tab for detail pages; the listing tab is never navigated away.
    const detailPage = await page.browser().newPage();
    try {
      for (const item of items) {
        item['has_stock'] = await crawlDetail(detailPage, item.url);
        await crawlab.saveItem(item);
      }
    } finally {
      // Always release the extra tab, even when a detail page fails to load.
      await detailPage.close();
    }
  }

  await page.waitFor(1000);
};
|
||||
|
||||
/**
 * Runs the whole crawl: launches a headless browser, opens the JD listing
 * page, crawls it with `crawlPage`, and keeps clicking the `.pn-next` link
 * until it is missing or a disabled one is present.
 *
 * A launch failure is allowed to propagate; logging and swallowing it would
 * leave `browser` undefined and surface as an unrelated TypeError. The
 * browser is closed in `finally` so that a failure mid-crawl does not leave
 * a Chromium process behind.
 *
 * @returns {Promise<void>} Resolves when every listing page has been crawled
 *   and the browser is closed.
 * @throws Rejects with whatever error PCR, the launch, navigation or
 *   `crawlPage` raised.
 */
const main = async () => {
  // PCR supplies both the Puppeteer instance and the browser executable path.
  const pcr = await PCR({
    folderName: '.chromium-browser-snapshots',
    hosts: ["https://storage.googleapis.com", "https://npm.taobao.org/mirrors"],
    retry: 3,
  });

  const browser = await pcr.puppeteer.launch({
    headless: true,
    args: ['--no-sandbox'],
    executablePath: pcr.executablePath,
  });

  try {
    const page = await browser.newPage();

    await page.goto('https://www.jd.com/chanpin/270170.html');
    await page.waitForSelector('#J_goodsList');
    await page.waitFor(1000);

    await crawlPage(page);

    while (true) {
      // A next page exists when `.pn-next` is present and no disabled one is.
      const hasNext = await page.evaluate(() => {
        if (!document.querySelector('.pn-next')) return false;
        return !document.querySelector('.pn-next.disabled');
      });

      if (!hasNext) break;

      await page.click('.pn-next');
      await page.waitFor(1000);
      await crawlPage(page);
    }
  } finally {
    await browser.close();
  }
};
|
||||
|
||||
// Script entry point: run the crawl, then close the crawlab SDK handle.
(async () => {
  try {
    await main();
  } catch (e) {
    console.error(e);
    // Signal the failure through the exit status so the caller does not see a
    // failed crawl as a success. Setting exitCode (rather than calling
    // process.exit) lets the cleanup below and any pending output finish.
    process.exitCode = 1;
  } finally {
    await crawlab.close();
  }
})();
|
||||
@@ -30,7 +30,6 @@ services:
|
||||
# CRAWLAB_NOTIFICATION_MAIL_SENDERIDENTITY: admin@example.com # sender ID 发送者 ID
|
||||
# CRAWLAB_NOTIFICATION_MAIL_SMTP_USER: username # SMTP username SMTP 用户名
|
||||
# CRAWLAB_NOTIFICATION_MAIL_SMTP_PASSWORD: password # SMTP password SMTP 密码
|
||||
# CRAWLAB_SERVER_LANG_NODE: "Y" # 预安装 Node.js 语言环境
|
||||
ports:
|
||||
- "8080:8080" # frontend port mapping 前端端口映射
|
||||
depends_on:
|
||||
|
||||
Reference in New Issue
Block a user