diff --git a/Dockerfile.local b/Dockerfile.local
index ccc803d6..eb10563d 100644
--- a/Dockerfile.local
+++ b/Dockerfile.local
@@ -15,7 +15,7 @@ WORKDIR /app
 
 # install frontend
 RUN npm config set unsafe-perm true
-RUN npm install -g cnpm && cnpm install
+RUN npm install -g cnpm --registry=https://registry.npm.taobao.org && cnpm install
 
 RUN npm run build:prod
 
diff --git a/backend/scripts/install-nodejs.sh b/backend/scripts/install-nodejs.sh
index 358cbc15..ef0ce392 100644
--- a/backend/scripts/install-nodejs.sh
+++ b/backend/scripts/install-nodejs.sh
@@ -6,6 +6,7 @@ export NVM_DIR="$([ -z "${XDG_CONFIG_HOME-}" ] && printf %s "${HOME}/.nvm" || pr
 [ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh" # This loads nvm
 
 # install Node.js v8.12
+export NVM_NODEJS_ORG_MIRROR=http://npm.taobao.org/mirrors/node
 nvm install 8.12
 
 # create soft links
@@ -16,7 +17,19 @@ ln -s $HOME/.nvm/versions/node/v8.12.0/bin/node /usr/local/bin/node
-export NODE_PATH=$HOME.nvm/versions/node/v8.12.0/lib/node_modules
+export NODE_PATH=$HOME/.nvm/versions/node/v8.12.0/lib/node_modules
 export PATH=$NODE_PATH:$PATH
 
+# install chromium
+# See https://crbug.com/795759
+apt-get update && apt-get install -yq libgconf-2-4
+
+# Install latest chrome dev package and fonts to support major
+# charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others)
+# Note: this installs the necessary libs to make the bundled version
+# of Chromium that Puppeteer
+# installs, work.
+apt-get update && apt-get install -y --no-install-recommends gconf-service libasound2 libatk1.0-0 libatk-bridge2.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget
+
 # install default dependencies
-npm config set PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors
-npm install puppeteer -g --unsafe-perm=true
-npm install puppeteer-core puppeteer-chromium-resolver crawlab-sdk -g --ignore-scripts
+# Exported so that the npm processes started below inherit it.
+export PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors
+npm config set puppeteer_download_host=https://npm.taobao.org/mirrors
+npm install puppeteer-chromium-resolver crawlab-sdk -g --unsafe-perm=true --registry=https://registry.npm.taobao.org
diff --git a/backend/services/task.go b/backend/services/task.go
index 15513977..f911159d 100644
--- a/backend/services/task.go
+++ b/backend/services/task.go
@@ -18,11 +18,11 @@ import (
 	"github.com/spf13/viper"
 	"os"
 	"os/exec"
+	"path"
 	"path/filepath"
 	"runtime"
 	"runtime/debug"
 	"strconv"
-	"strings"
 	"sync"
 	"syscall"
 	"time"
@@ -110,14 +110,11 @@ func AssignTask(task model.Task) error {
 func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exec.Cmd {
 	// 默认把Node.js的全局node_modules加入环境变量
 	envPath := os.Getenv("PATH")
-	for _, _path := range strings.Split(envPath, ":") {
-		if strings.Contains(_path, "/.nvm/versions/node/") {
-			pathNodeModules := strings.Replace(_path, "/bin", "/lib/node_modules", -1)
-			_ = os.Setenv("PATH", pathNodeModules+":"+envPath)
-			_ = os.Setenv("NODE_PATH", pathNodeModules)
-			break
-		}
-	}
+	homePath := os.Getenv("HOME")
+	nodeVersion := "v8.12.0"
+	nodePath := path.Join(homePath, ".nvm/versions/node", nodeVersion, "lib/node_modules")
+	_ = os.Setenv("PATH", nodePath+":"+envPath)
+	_ = os.Setenv("NODE_PATH", nodePath)
 
 	// 默认环境变量
 	cmd.Env = append(os.Environ(), "CRAWLAB_TASK_ID="+taskId)
diff --git a/backend/template/spiders/jd_mask/Spiderfile b/backend/template/spiders/jd_mask/Spiderfile
new file mode 100644
index 00000000..b53ba963
--- /dev/null
+++ b/backend/template/spiders/jd_mask/Spiderfile
@@ -0,0 +1,5 @@
+name: "jd_mask"
+display_name: "京东口罩 (Puppeteer)"
+col: "results_jd"
+type: "customized"
+cmd: "node jd_mask_spider.js"
\ No newline at end of file
diff --git a/backend/template/spiders/jd_mask/jd_mask_spider.js b/backend/template/spiders/jd_mask/jd_mask_spider.js
new file mode 100644
index 00000000..dfa5c808
--- /dev/null
+++ b/backend/template/spiders/jd_mask/jd_mask_spider.js
@@ -0,0 +1,116 @@
+const crawlab = require('crawlab-sdk');
+const PCR = require('puppeteer-chromium-resolver');
+
+/**
+ * Opens a product detail page and reports whether the product is in stock.
+ *
+ * @param {object} page - Puppeteer page used to load the detail URL.
+ * @param {string} url - URL of the product detail page.
+ * @returns {Promise<boolean>} false when a '.J-notify-stock' element exists.
+ */
+const crawlDetail = async (page, url) => {
+  await page.goto(url);
+  await page.waitForSelector('#choose-btns');
+  await page.waitFor(500);
+
+  const hasStock = await page.evaluate(() => {
+    return !document.querySelector('.J-notify-stock');
+  });
+  return hasStock;
+};
+
+/**
+ * Saves every product found on the listing currently shown in `page`.
+ *
+ * Detail pages are opened in a separate tab so that `page` stays on the
+ * listing; main() relies on the listing's '.pn-next' button to paginate.
+ *
+ * @param {object} page - Puppeteer page showing a product listing.
+ * @returns {Promise<void>}
+ */
+const crawlPage = async (page) => {
+  const items = await page.evaluate(() => {
+    const items = [];
+    document.querySelectorAll('.gl-item').forEach(el => {
+      const link = el.querySelector('.p-name > a');
+      // Skip entries without a product link instead of throwing on null.
+      if (!link) return;
+      items.push({
+        title: link.getAttribute('title'),
+        url: 'https:' + link.getAttribute('href'),
+      });
+    });
+    return items;
+  });
+
+  const detailPage = await page.browser().newPage();
+  try {
+    for (const item of items) {
+      item['has_stock'] = await crawlDetail(detailPage, item.url);
+      await crawlab.saveItem(item);
+    }
+  } finally {
+    await detailPage.close();
+  }
+
+  await page.waitFor(1000);
+};
+
+/**
+ * Launches Chromium and crawls the JD listing page by page until the
+ * '.pn-next' button is missing or disabled.
+ *
+ * @returns {Promise<void>}
+ */
+const main = async () => {
+  const pcr = await PCR({
+    folderName: '.chromium-browser-snapshots',
+    hosts: ["https://storage.googleapis.com", "https://npm.taobao.org/mirrors"],
+    retry: 3
+  });
+
+  // Launch errors propagate to the caller, which logs them; swallowing them
+  // here would leave `browser` undefined and fail later with a TypeError.
+  const browser = await pcr.puppeteer.launch({
+    headless: true,
+    args: ['--no-sandbox'],
+    executablePath: pcr.executablePath
+  });
+
+  try {
+    const page = await browser.newPage();
+
+    await page.goto('https://www.jd.com/chanpin/270170.html');
+    await page.waitForSelector('#J_goodsList');
+    await page.waitFor(1000);
+
+    await crawlPage(page);
+
+    while (true) {
+      const hasNext = await page.evaluate(() => {
+        if (!document.querySelector('.pn-next')) return false;
+        return !document.querySelector('.pn-next.disabled');
+      });
+
+      if (!hasNext) break;
+
+      await page.click('.pn-next');
+      await page.waitFor(1000);
+      await crawlPage(page);
+    }
+  } finally {
+    // Always shut Chromium down, even when a page fails to load.
+    await browser.close();
+  }
+};
+
+// Entry point: log any crawl error, then call crawlab.close().
+(async () => {
+  try {
+    await main();
+  } catch (e) {
+    console.error(e);
+  }
+
+  await crawlab.close();
+})();
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 6ad3efd1..637083b2 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -30,7 +30,6 @@ services:
       # CRAWLAB_NOTIFICATION_MAIL_SENDERIDENTITY: admin@exmaple.com # sender ID 发送者 ID
       # CRAWLAB_NOTIFICATION_MAIL_SMTP_USER: username # SMTP username SMTP 用户名
       # CRAWLAB_NOTIFICATION_MAIL_SMTP_PASSWORD: password # SMTP password SMTP 密码
-      # CRAWLAB_SERVER_LANG_NODE: "Y" # 预安装 Node.js 语言环境
     ports:
       - "8080:8080" # frontend port mapping 前端端口映射
     depends_on: