From 143550f3c66a5f86c7ce18e7d26ebf8cb36f7ebe Mon Sep 17 00:00:00 2001
From: marvzhang
Date: Mon, 10 Feb 2020 11:15:37 +0800
Subject: [PATCH] added jd_mask_spider

---
 backend/template/spiders/jd_mask/Spiderfile   |   5 ++
 .../spiders/jd_mask/jd_mask_spider.js         | 121 ++++++++++++++++++
 docker-compose.yml                            |   1 -
 3 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 backend/template/spiders/jd_mask/Spiderfile
 create mode 100644 backend/template/spiders/jd_mask/jd_mask_spider.js

diff --git a/backend/template/spiders/jd_mask/Spiderfile b/backend/template/spiders/jd_mask/Spiderfile
new file mode 100644
index 00000000..b53ba963
--- /dev/null
+++ b/backend/template/spiders/jd_mask/Spiderfile
@@ -0,0 +1,5 @@
+name: "jd_mask"
+display_name: "京东口罩 (Puppeteer)"
+col: "results_jd"
+type: "customized"
+cmd: "node jd_mask_spider.js"
\ No newline at end of file
diff --git a/backend/template/spiders/jd_mask/jd_mask_spider.js b/backend/template/spiders/jd_mask/jd_mask_spider.js
new file mode 100644
index 00000000..dfa5c808
--- /dev/null
+++ b/backend/template/spiders/jd_mask/jd_mask_spider.js
@@ -0,0 +1,121 @@
+const crawlab = require('crawlab-sdk');
+const PCR = require('puppeteer-chromium-resolver');
+
+/**
+ * Opens a product detail URL and reports whether the product is in stock.
+ *
+ * The product counts as in stock when the loaded page contains no
+ * `.J-notify-stock` element.
+ *
+ * @param {object} page - Puppeteer page used for the navigation.
+ * @param {string} url - URL of the product detail page.
+ * @returns {Promise<boolean>} true when no `.J-notify-stock` element exists.
+ */
+const crawlDetail = async (page, url) => {
+  await page.goto(url);
+  await page.waitForSelector('#choose-btns');
+  // Short fixed pause before inspecting the DOM.
+  await page.waitFor(500);
+
+  return page.evaluate(() => !document.querySelector('.J-notify-stock'));
+};
+
+/**
+ * Collects every `.gl-item` on the listing shown in `page`, checks the stock
+ * of each one and saves it through the Crawlab SDK.
+ *
+ * Detail pages are loaded in a separate tab so that `page` stays on the
+ * listing and the caller can keep paginating with `.pn-next`.
+ *
+ * @param {object} page - Puppeteer page showing a product listing.
+ * @returns {Promise<void>}
+ */
+const crawlPage = async (page) => {
+  const items = await page.evaluate(() => {
+    const found = [];
+    document.querySelectorAll('.gl-item').forEach((el) => {
+      const link = el.querySelector('.p-name > a');
+      // Skip entries without a product link instead of throwing.
+      if (!link || !link.getAttribute('href')) return;
+      found.push({
+        title: link.getAttribute('title'),
+        // `link.href` is the resolved absolute URL, so protocol-relative
+        // and already-absolute hrefs are both handled.
+        url: link.href,
+      });
+    });
+    return found;
+  });
+
+  const detailPage = await page.browser().newPage();
+  try {
+    for (const item of items) {
+      item.has_stock = await crawlDetail(detailPage, item.url);
+      await crawlab.saveItem(item);
+    }
+  } finally {
+    await detailPage.close();
+  }
+
+  await page.waitFor(1000);
+};
+
+/**
+ * Resolves a Chromium build, launches it and crawls every page of the JD
+ * listing, following `.pn-next` until it is missing or disabled.
+ *
+ * @returns {Promise<void>}
+ */
+const main = async () => {
+  const pcr = await PCR({
+    folderName: '.chromium-browser-snapshots',
+    hosts: ["https://storage.googleapis.com", "https://npm.taobao.org/mirrors"],
+    retry: 3
+  });
+
+  // A launch failure is not caught here: it propagates to the caller, which
+  // logs it, instead of surfacing later as a TypeError on `undefined`.
+  const browser = await pcr.puppeteer.launch({
+    headless: true,
+    args: ['--no-sandbox'],
+    executablePath: pcr.executablePath
+  });
+
+  try {
+    const page = await browser.newPage();
+
+    await page.goto('https://www.jd.com/chanpin/270170.html');
+    await page.waitForSelector('#J_goodsList');
+    await page.waitFor(1000);
+
+    await crawlPage(page);
+
+    while (true) {
+      const hasNext = await page.evaluate(() => {
+        if (!document.querySelector('.pn-next')) return false;
+        return !document.querySelector('.pn-next.disabled');
+      });
+
+      if (!hasNext) break;
+
+      await page.click('.pn-next');
+      await page.waitFor(1000);
+      await crawlPage(page);
+    }
+  } finally {
+    // Always shut Chromium down, even when a crawl step throws.
+    await browser.close();
+  }
+};
+
+(async () => {
+  try {
+    await main();
+  } catch (e) {
+    console.error(e);
+    // Report the failure through the exit status once the process ends.
+    process.exitCode = 1;
+  }
+
+  await crawlab.close();
+})();
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 6ad3efd1..637083b2 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -30,7 +30,6 @@ services:
       # CRAWLAB_NOTIFICATION_MAIL_SENDERIDENTITY: admin@exmaple.com # sender ID 发送者 ID
       # CRAWLAB_NOTIFICATION_MAIL_SMTP_USER: username # SMTP username SMTP 用户名
       # CRAWLAB_NOTIFICATION_MAIL_SMTP_PASSWORD: password # SMTP password SMTP 密码
-      # CRAWLAB_SERVER_LANG_NODE: "Y" # 预安装 Node.js 语言环境
     ports:
       - "8080:8080" # frontend port mapping 前端端口映射
     depends_on: