added jd_mask_spider

This commit is contained in:
marvzhang
2020-02-10 11:15:37 +08:00
parent 9c2bdb077d
commit 143550f3c6
3 changed files with 89 additions and 1 deletions

View File

@@ -0,0 +1,5 @@
# Spider definition for the JD mask spider added by this commit.
# Identifier of the spider.
name: "jd_mask"
# Human-readable label; the Chinese text reads "JD masks (Puppeteer)".
display_name: "京东口罩 (Puppeteer)"
# NOTE(review): presumably the results collection the scraped items are written to — confirm against the Crawlab Spiderfile schema.
col: "results_jd"
# NOTE(review): presumably marks a spider driven by its own script (see `cmd`) — confirm.
type: "customized"
# Command line that runs the spider script with Node.js.
cmd: "node jd_mask_spider.js"

View File

@@ -0,0 +1,84 @@
const crawlab = require('crawlab-sdk');
const PCR = require('puppeteer-chromium-resolver');
/**
 * Opens a product detail page and reports whether the product is in stock.
 *
 * @param {object} page - Puppeteer page used to load the detail page.
 * @param {string} url - Address of the product detail page.
 * @returns {Promise<boolean>} `true` when the page contains no
 *   `.J-notify-stock` element, `false` otherwise.
 */
const crawlDetail = async (page, url) => {
  await page.goto(url);

  // Wait for the `#choose-btns` container, then pause briefly before
  // inspecting the DOM.
  await page.waitForSelector('#choose-btns');
  await page.waitFor(500);

  // NOTE(review): presumably JD renders `.J-notify-stock` only for sold-out
  // products — confirm against the live page.
  return page.evaluate(() => {
    const notifyElement = document.querySelector('.J-notify-stock');
    return !notifyElement;
  });
};
/**
 * Scrapes every product tile on the listing currently shown in `page`, checks
 * each product's stock status on its detail page, and saves one item per
 * product through `crawlab.saveItem`.
 *
 * Detail pages are loaded in a separate tab so that `page` itself stays on
 * the listing. Navigating `page` to the detail pages would leave it on the
 * last product when this function returns, and any later interaction with
 * the listing (such as clicking the `.pn-next` control) would fail.
 *
 * @param {object} page - Puppeteer page that is showing a product listing.
 * @returns {Promise<void>} Resolves once every item has been saved.
 */
const crawlPage = async (page) => {
  const items = await page.evaluate(() => {
    const found = [];
    document.querySelectorAll('.gl-item').forEach((el) => {
      const link = el.querySelector('.p-name > a');
      const href = link && link.getAttribute('href');
      // Tiles without a product link cannot be crawled; skip them instead of
      // throwing and losing the whole page.
      if (!href) return;
      found.push({
        title: link.getAttribute('title'),
        // Protocol-relative hrefs get an explicit https scheme (as before);
        // anything else is resolved against the document's base URL.
        url: href.startsWith('//')
          ? 'https:' + href
          : new URL(href, document.baseURI).href,
      });
    });
    return found;
  });

  if (items.length > 0) {
    // Dedicated tab for detail pages; always closed, even if a crawl fails.
    const detailPage = await page.browser().newPage();
    try {
      for (const item of items) {
        item['has_stock'] = await crawlDetail(detailPage, item.url);
        await crawlab.saveItem(item);
      }
    } finally {
      await detailPage.close();
    }
  }

  await page.waitFor(1000);
};
/**
 * Runs the spider: resolves a Chromium build, launches a headless browser,
 * opens the JD listing page and crawls it page by page until there is no
 * enabled `.pn-next` control left.
 *
 * The browser is always closed, even when a crawl step throws. Any failure
 * (including a failed browser launch) rejects the returned promise so the
 * caller sees the real cause.
 *
 * @returns {Promise<void>} Resolves once all listing pages have been crawled.
 */
const main = async () => {
  const pcr = await PCR({
    folderName: '.chromium-browser-snapshots',
    hosts: ["https://storage.googleapis.com", "https://npm.taobao.org/mirrors"],
    retry: 3
  });

  // A launch failure is deliberately not caught here: swallowing it would
  // leave `browser` undefined and hide the real error behind a TypeError.
  const browser = await pcr.puppeteer.launch({
    headless: true,
    args: ['--no-sandbox'],
    executablePath: pcr.executablePath
  });

  try {
    const page = await browser.newPage();
    await page.goto('https://www.jd.com/chanpin/270170.html');
    await page.waitForSelector('#J_goodsList');
    await page.waitFor(1000);
    await crawlPage(page);

    while (true) {
      // There is a next page only when `.pn-next` exists and is not disabled.
      const hasNext = await page.evaluate(() => {
        if (!document.querySelector('.pn-next')) return false;
        return !document.querySelector('.pn-next.disabled');
      });
      if (!hasNext) break;

      await page.click('.pn-next');
      await page.waitFor(1000);
      await crawlPage(page);
    }
  } finally {
    // Release the Chromium process whether or not the crawl succeeded.
    await browser.close();
  }
};
// Script entry point: run the spider, report any failure, and always call
// `crawlab.close()` afterwards.
(async () => {
  try {
    await main();
  } catch (e) {
    console.error(e);
    // Mark the run as failed so the launching process does not see exit
    // code 0 for a crashed crawl.
    process.exitCode = 1;
  } finally {
    await crawlab.close();
  }
})();

View File

@@ -30,7 +30,6 @@ services:
# CRAWLAB_NOTIFICATION_MAIL_SENDERIDENTITY: admin@example.com # sender ID 发送者 ID
# CRAWLAB_NOTIFICATION_MAIL_SMTP_USER: username # SMTP username SMTP 用户名
# CRAWLAB_NOTIFICATION_MAIL_SMTP_PASSWORD: password # SMTP password SMTP 密码
# CRAWLAB_SERVER_LANG_NODE: "Y" # 预安装 Node.js 语言环境
ports:
- "8080:8080" # frontend port mapping 前端端口映射
depends_on: