From 143550f3c66a5f86c7ce18e7d26ebf8cb36f7ebe Mon Sep 17 00:00:00 2001 From: marvzhang Date: Mon, 10 Feb 2020 11:15:37 +0800 Subject: [PATCH 1/8] added jd_mask_spider --- backend/template/spiders/jd_mask/Spiderfile | 5 ++ .../spiders/jd_mask/jd_mask_spider.js | 84 +++++++++++++++++++ docker-compose.yml | 1 - 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 backend/template/spiders/jd_mask/Spiderfile create mode 100644 backend/template/spiders/jd_mask/jd_mask_spider.js diff --git a/backend/template/spiders/jd_mask/Spiderfile b/backend/template/spiders/jd_mask/Spiderfile new file mode 100644 index 00000000..b53ba963 --- /dev/null +++ b/backend/template/spiders/jd_mask/Spiderfile @@ -0,0 +1,5 @@ +name: "jd_mask" +display_name: "京东口罩 (Puppeteer)" +col: "results_jd" +type: "customized" +cmd: "node jd_mask_spider.js" \ No newline at end of file diff --git a/backend/template/spiders/jd_mask/jd_mask_spider.js b/backend/template/spiders/jd_mask/jd_mask_spider.js new file mode 100644 index 00000000..dfa5c808 --- /dev/null +++ b/backend/template/spiders/jd_mask/jd_mask_spider.js @@ -0,0 +1,84 @@ +const crawlab = require('crawlab-sdk'); +const PCR = require('puppeteer-chromium-resolver'); + +const crawlDetail = async (page, url) => { + await page.goto(url); + await page.waitForSelector('#choose-btns'); + await page.waitFor(500); + + const hasStock = await page.evaluate(() => { + return !document.querySelector('.J-notify-stock'); + }); + return hasStock; +}; + +const crawlPage = async (page) => { + const items = await page.evaluate(() => { + const items = []; + document.querySelectorAll('.gl-item').forEach(el => { + items.push({ + title: el.querySelector('.p-name > a').getAttribute('title'), + url: 'https:' + el.querySelector('.p-name > a').getAttribute('href'), + }); + }); + return items; + }); + + for (let i = 0; i < items.length; i++) { + const item = items[i]; + item['has_stock'] = await crawlDetail(page, item.url); + await crawlab.saveItem(item); + } + + await page.waitFor(1000); +}; + +const main = async () => { + const pcr = await PCR({ + folderName: '.chromium-browser-snapshots', + hosts: ["https://storage.googleapis.com", "https://npm.taobao.org/mirrors"], + retry: 3 + }); + + const browser = await pcr.puppeteer.launch({ + headless: true, + args: ['--no-sandbox'], + executablePath: pcr.executablePath + }).catch(function (error) { + console.log(error); + }); + + const page = await browser.newPage(); + + await page.goto('https://www.jd.com/chanpin/270170.html'); + await page.waitForSelector('#J_goodsList'); + await page.waitFor(1000); + + await crawlPage(page); + + while (true) { + const hasNext = await page.evaluate(() => { + if (!document.querySelector('.pn-next')) return false + return !document.querySelector('.pn-next.disabled') + }); + + if (!hasNext) break; + + await page.click('.pn-next'); + await page.waitFor(1000); + await crawlPage(page); + } + + await browser.close(); +}; + +(async () => { + try { + await main() + } catch (e) { + console.error(e) + } + + await crawlab.close(); + // process.exit(); +})(); \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 6ad3efd1..637083b2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -30,7 +30,6 @@ services: # CRAWLAB_NOTIFICATION_MAIL_SENDERIDENTITY: admin@exmaple.com # sender ID 发送者 ID # CRAWLAB_NOTIFICATION_MAIL_SMTP_USER: username # SMTP username SMTP 用户名 # CRAWLAB_NOTIFICATION_MAIL_SMTP_PASSWORD: password # SMTP password SMTP 密码 - # CRAWLAB_SERVER_LANG_NODE: "Y" # 预安装 Node.js 语言环境 ports: - "8080:8080" # frontend port mapping 前端端口映射 depends_on: From 6b32709a390df5900c92e1bc780ca23532331344 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Mon, 10 Feb 2020 13:23:34 +0800 Subject: [PATCH 2/8] =?UTF-8?q?=E5=85=BC=E5=AE=B9Node.js=E7=88=AC=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/services/task.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/backend/services/task.go b/backend/services/task.go index 15513977..f911159d 100644 --- a/backend/services/task.go +++ b/backend/services/task.go @@ -18,11 +18,11 @@ import ( "github.com/spf13/viper" "os" "os/exec" + "path" "path/filepath" "runtime" "runtime/debug" "strconv" - "strings" "sync" "syscall" "time" @@ -110,14 +110,11 @@ func AssignTask(task model.Task) error { func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exec.Cmd { // 默认把Node.js的全局node_modules加入环境变量 envPath := os.Getenv("PATH") - for _, _path := range strings.Split(envPath, ":") { - if strings.Contains(_path, "/.nvm/versions/node/") { - pathNodeModules := strings.Replace(_path, "/bin", "/lib/node_modules", -1) - _ = os.Setenv("PATH", pathNodeModules+":"+envPath) - _ = os.Setenv("NODE_PATH", pathNodeModules) - break - } - } + homePath := os.Getenv("HOME") + nodeVersion := "v8.12.0" + nodePath := path.Join(homePath, ".nvm/versions/node", nodeVersion, "lib/node_modules") + _ = os.Setenv("PATH", nodePath+":"+envPath) + _ = os.Setenv("NODE_PATH", nodePath) // 默认环境变量 cmd.Env = append(os.Environ(), "CRAWLAB_TASK_ID="+taskId) From 70e911ade6ad463eb3483ac07e2df3753b47bfe1 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Mon, 10 Feb 2020 14:26:14 +0800 Subject: [PATCH 3/8] updated install-nodejs.sh --- backend/scripts/install-nodejs.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/scripts/install-nodejs.sh b/backend/scripts/install-nodejs.sh index 358cbc15..4342d638 100644 --- a/backend/scripts/install-nodejs.sh +++ b/backend/scripts/install-nodejs.sh @@ -18,5 +18,4 @@ export PATH=$NODE_PATH:$PATH # install default dependencies npm config set PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors -npm install puppeteer -g --unsafe-perm=true -npm install puppeteer-core puppeteer-chromium-resolver crawlab-sdk -g --ignore-scripts +npm install puppeteer-chromium-resolver crawlab-sdk -g --unsafe-perm=true From c50a9bb63e0a7bb8b3296958dba10bd15dcb892d Mon Sep 17 00:00:00 2001 From: marvzhang Date: Mon, 10 Feb 2020 17:04:53 +0800 Subject: [PATCH 4/8] updated install-nodejs.sh --- Dockerfile.local | 2 +- backend/scripts/install-nodejs.sh | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/Dockerfile.local b/Dockerfile.local index ccc803d6..eb10563d 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -15,7 +15,7 @@ WORKDIR /app # install frontend RUN npm config set unsafe-perm true -RUN npm install -g cnpm && cnpm install +RUN npm install -g cnpm --registry=https://registry.npm.taobao.org && cnpm install RUN npm run build:prod diff --git a/backend/scripts/install-nodejs.sh b/backend/scripts/install-nodejs.sh index 4342d638..4eeb8707 100644 --- a/backend/scripts/install-nodejs.sh +++ b/backend/scripts/install-nodejs.sh @@ -6,6 +6,7 @@ export NVM_DIR="$([ -z "${XDG_CONFIG_HOME-}" ] && printf %s "${HOME}/.nvm" || pr [ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh" # This loads nvm # install Node.js v8.12 +# export NVM_NODEJS_ORG_MIRROR=http://npm.taobao.org/mirrors/node nvm install 8.12 # create soft links @@ -16,6 +17,15 @@ ln -s $HOME/.nvm/versions/node/v8.12.0/bin/node /usr/local/bin/node export NODE_PATH=$HOME.nvm/versions/node/v8.12.0/lib/node_modules export PATH=$NODE_PATH:$PATH +# install apt dependencies +apt-get install -y wget +wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ + && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ + && apt-get update \ + && apt-get install -y google-chrome-unstable \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + # install default dependencies -npm config set PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors -npm install puppeteer-chromium-resolver crawlab-sdk -g --unsafe-perm=true +export PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors +npm install puppeteer-chromium-resolver crawlab-sdk -g --unsafe-perm=true From 47a494a0120ae9f5941e937c8e37e5855ae040f8 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Mon, 10 Feb 2020 17:11:10 +0800 Subject: [PATCH 5/8] updated etl --- backend/scripts/install-nodejs.sh | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/backend/scripts/install-nodejs.sh b/backend/scripts/install-nodejs.sh index 4eeb8707..6fd8cfb0 100644 --- a/backend/scripts/install-nodejs.sh +++ b/backend/scripts/install-nodejs.sh @@ -17,14 +17,28 @@ ln -s $HOME/.nvm/versions/node/v8.12.0/bin/node /usr/local/bin/node export NODE_PATH=$HOME.nvm/versions/node/v8.12.0/lib/node_modules export PATH=$NODE_PATH:$PATH -# install apt dependencies -apt-get install -y wget -wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ - && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ - && apt-get update \ - && apt-get install -y google-chrome-unstable \ - --no-install-recommends \ - && rm -rf /var/lib/apt/lists/* +# # install chromium +# # See https://crbug.com/795759 +# RUN apt-get update && apt-get install -yq libgconf-2-4 + +# # Install latest chrome dev package and fonts to support major +# # charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others) +# # Note: this installs the necessary libs to make the bundled version +# # of Chromium that Puppeteer +# # installs, work. +# RUN apt-get update && apt-get install -y wget --no-install-recommends \ +# && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ +# && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ +# && apt-get update \ +# && apt-get install -y google-chrome-unstable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst ttf-freefont \ +# --no-install-recommends \ +# && rm -rf /var/lib/apt/lists/* \ +# && apt-get purge --auto-remove -y curl \ +# && rm -rf /src/*.deb + +# # It's a good idea to use dumb-init to help prevent zombie chrome processes. +# ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64 /usr/local/bin/dumb-init +# RUN chmod +x /usr/local/bin/dumb-init # install default dependencies export PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors From 3e2bdb59f4241a4bd23e7bb67bdfb1b5ee0dfaa5 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Mon, 10 Feb 2020 17:41:14 +0800 Subject: [PATCH 6/8] updated install-nodejs.sh --- backend/scripts/install-nodejs.sh | 44 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/backend/scripts/install-nodejs.sh b/backend/scripts/install-nodejs.sh index 6fd8cfb0..2aa05fba 100644 --- a/backend/scripts/install-nodejs.sh +++ b/backend/scripts/install-nodejs.sh @@ -6,7 +6,7 @@ export NVM_DIR="$([ -z "${XDG_CONFIG_HOME-}" ] && printf %s "${HOME}/.nvm" || pr [ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh" # This loads nvm # install Node.js v8.12 -# export NVM_NODEJS_ORG_MIRROR=http://npm.taobao.org/mirrors/node +export NVM_NODEJS_ORG_MIRROR=http://npm.taobao.org/mirrors/node nvm install 8.12 # create soft links @@ -17,29 +17,25 @@ ln -s $HOME/.nvm/versions/node/v8.12.0/bin/node /usr/local/bin/node export NODE_PATH=$HOME.nvm/versions/node/v8.12.0/lib/node_modules export PATH=$NODE_PATH:$PATH -# # install chromium -# # See https://crbug.com/795759 -# RUN apt-get update && apt-get install -yq libgconf-2-4 +# install chromium +# See https://crbug.com/795759 +RUN apt-get update && apt-get install -yq libgconf-2-4 -# # Install latest chrome dev package and fonts to support major -# # charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others) -# # Note: this installs the necessary libs to make the bundled version -# # of Chromium that Puppeteer -# # installs, work. -# RUN apt-get update && apt-get install -y wget --no-install-recommends \ -# && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ -# && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ -# && apt-get update \ -# && apt-get install -y google-chrome-unstable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst ttf-freefont \ -# --no-install-recommends \ -# && rm -rf /var/lib/apt/lists/* \ -# && apt-get purge --auto-remove -y curl \ -# && rm -rf /src/*.deb - -# # It's a good idea to use dumb-init to help prevent zombie chrome processes. -# ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64 /usr/local/bin/dumb-init -# RUN chmod +x /usr/local/bin/dumb-init +# Install latest chrome dev package and fonts to support major +# charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others) +# Note: this installs the necessary libs to make the bundled version +# of Chromium that Puppeteer +# installs, work. +RUN apt-get update && apt-get install -y wget --no-install-recommends \ + && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ + && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ + && apt-get update \ + && apt-get install -y google-chrome-unstable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst ttf-freefont \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get purge --auto-remove -y curl \ + && rm -rf /src/*.deb # install default dependencies -export PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors -npm install puppeteer-chromium-resolver crawlab-sdk -g --unsafe-perm=true +PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors +npm install puppeteer-chromium-resolver crawlab-sdk -g --unsafe-perm=true --registry=https://registry.npm.taobao.org From 8df923fd3cee934ec12e34f2541fc427fd798e60 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Mon, 10 Feb 2020 17:45:47 +0800 Subject: [PATCH 7/8] updated install-nodejs.sh --- backend/scripts/install-nodejs.sh | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/backend/scripts/install-nodejs.sh b/backend/scripts/install-nodejs.sh index 2aa05fba..66531c90 100644 --- a/backend/scripts/install-nodejs.sh +++ b/backend/scripts/install-nodejs.sh @@ -19,22 +19,14 @@ export PATH=$NODE_PATH:$PATH # install chromium # See https://crbug.com/795759 -RUN apt-get update && apt-get install -yq libgconf-2-4 +apt-get update && apt-get install -yq libgconf-2-4 # Install latest chrome dev package and fonts to support major # charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others) # Note: this installs the necessary libs to make the bundled version # of Chromium that Puppeteer # installs, work. -RUN apt-get update && apt-get install -y wget --no-install-recommends \ - && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ - && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ - && apt-get update \ - && apt-get install -y google-chrome-unstable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst ttf-freefont \ - --no-install-recommends \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get purge --auto-remove -y curl \ - && rm -rf /src/*.deb +apt-get update && apt-get install -y --no-install-recommends gconf-service libasound2 libatk1.0-0 libatk-bridge2.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget # install default dependencies PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors From c3b1c8040aa6f3478560b33cfab101be8480f11f Mon Sep 17 00:00:00 2001 From: marvzhang Date: Mon, 10 Feb 2020 19:28:47 +0800 Subject: [PATCH 8/8] updated install-nodejs.sh --- backend/scripts/install-nodejs.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/scripts/install-nodejs.sh b/backend/scripts/install-nodejs.sh index 66531c90..ef0ce392 100644 --- a/backend/scripts/install-nodejs.sh +++ b/backend/scripts/install-nodejs.sh @@ -30,4 +30,5 @@ apt-get update && apt-get install -y --no-install-recommends gconf-service libas # install default dependencies PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors +npm config set puppeteer_download_host=https://npm.taobao.org/mirrors npm install puppeteer-chromium-resolver crawlab-sdk -g --unsafe-perm=true --registry=https://registry.npm.taobao.org