Merge pull request #536 from crawlab-team/develop

Develop
This commit is contained in:
Marvin Zhang
2020-02-10 19:29:32 +08:00
committed by GitHub
6 changed files with 111 additions and 14 deletions

View File

@@ -15,7 +15,7 @@ WORKDIR /app
# install frontend
RUN npm config set unsafe-perm true
RUN npm install -g cnpm && cnpm install
RUN npm install -g cnpm --registry=https://registry.npm.taobao.org && cnpm install
RUN npm run build:prod

View File

@@ -6,6 +6,7 @@ export NVM_DIR="$([ -z "${XDG_CONFIG_HOME-}" ] && printf %s "${HOME}/.nvm" || pr
[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh" # This loads nvm
# install Node.js v8.12
export NVM_NODEJS_ORG_MIRROR=http://npm.taobao.org/mirrors/node
nvm install 8.12
# create soft links
@@ -16,7 +17,18 @@ ln -s $HOME/.nvm/versions/node/v8.12.0/bin/node /usr/local/bin/node
export NODE_PATH=$HOME.nvm/versions/node/v8.12.0/lib/node_modules
export PATH=$NODE_PATH:$PATH
# install chromium
# See https://crbug.com/795759
apt-get update && apt-get install -yq libgconf-2-4
# Install latest chrome dev package and fonts to support major
# charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others)
# Note: this installs the necessary libs to make the bundled version
# of Chromium that Puppeteer
# installs, work.
apt-get update && apt-get install -y --no-install-recommends gconf-service libasound2 libatk1.0-0 libatk-bridge2.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget
# install default dependencies
npm config set PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors
npm install puppeteer -g --unsafe-perm=true
npm install puppeteer-core puppeteer-chromium-resolver crawlab-sdk -g --ignore-scripts
PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors
npm config set puppeteer_download_host=https://npm.taobao.org/mirrors
npm install puppeteer-chromium-resolver crawlab-sdk -g --unsafe-perm=true --registry=https://registry.npm.taobao.org

View File

@@ -18,11 +18,11 @@ import (
"github.com/spf13/viper"
"os"
"os/exec"
"path"
"path/filepath"
"runtime"
"runtime/debug"
"strconv"
"strings"
"sync"
"syscall"
"time"
@@ -110,14 +110,11 @@ func AssignTask(task model.Task) error {
func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exec.Cmd {
// 默认把Node.js的全局node_modules加入环境变量
envPath := os.Getenv("PATH")
for _, _path := range strings.Split(envPath, ":") {
if strings.Contains(_path, "/.nvm/versions/node/") {
pathNodeModules := strings.Replace(_path, "/bin", "/lib/node_modules", -1)
_ = os.Setenv("PATH", pathNodeModules+":"+envPath)
_ = os.Setenv("NODE_PATH", pathNodeModules)
break
}
}
homePath := os.Getenv("HOME")
nodeVersion := "v8.12.0"
nodePath := path.Join(homePath, ".nvm/versions/node", nodeVersion, "lib/node_modules")
_ = os.Setenv("PATH", nodePath+":"+envPath)
_ = os.Setenv("NODE_PATH", nodePath)
// 默认环境变量
cmd.Env = append(os.Environ(), "CRAWLAB_TASK_ID="+taskId)

View File

@@ -0,0 +1,5 @@
name: "jd_mask"
display_name: "京东口罩 (Puppeteer)"
col: "results_jd"
type: "customized"
cmd: "node jd_mask_spider.js"

View File

@@ -0,0 +1,84 @@
const crawlab = require('crawlab-sdk');
const PCR = require('puppeteer-chromium-resolver');
// Visit a product detail page and report whether the item is in stock.
// The '.J-notify-stock' element ("notify me when available") is only rendered
// for out-of-stock items, so its absence means the item can be bought.
const crawlDetail = async (page, url) => {
  await page.goto(url);
  await page.waitForSelector('#choose-btns');
  // Brief pause to let late DOM updates settle before inspecting the page.
  await page.waitFor(500);
  // Runs in the browser context: true when the out-of-stock marker is absent.
  const inStock = await page.evaluate(
    () => document.querySelector('.J-notify-stock') === null
  );
  return inStock;
};
// Scrape every product card on the current listing page, then check each
// product's stock status on its detail page and persist it via the SDK.
// Items are processed sequentially because a single Page instance is shared
// for both the listing and the detail navigation.
const crawlPage = async (page) => {
  // Runs in the browser context: collect {title, url} for each product card.
  const items = await page.evaluate(() => {
    const results = [];
    document.querySelectorAll('.gl-item').forEach((el) => {
      // Some cards (ads / placeholders) may lack the name anchor; skip them
      // instead of crashing the whole evaluate call on a null dereference.
      const link = el.querySelector('.p-name > a');
      if (!link) return;
      results.push({
        title: link.getAttribute('title'),
        url: 'https:' + link.getAttribute('href'),
      });
    });
    return results;
  });
  for (const item of items) {
    item['has_stock'] = await crawlDetail(page, item.url);
    await crawlab.saveItem(item);
  }
  await page.waitFor(1000);
};
// Crawl the JD mask listing: resolve a local Chromium, walk every results
// page, and record each product's stock status. Throws on any failure so the
// top-level handler can report it.
const main = async () => {
  // Resolve/download a Chromium build; taobao mirror as a fallback host.
  const pcr = await PCR({
    folderName: '.chromium-browser-snapshots',
    hosts: ['https://storage.googleapis.com', 'https://npm.taobao.org/mirrors'],
    retry: 3,
  });
  // BUG FIX: the previous `.launch().catch(console.log)` swallowed launch
  // failures and left `browser` undefined, so the very next `browser.newPage()`
  // crashed with a confusing TypeError. Log the real error and rethrow instead.
  let browser;
  try {
    browser = await pcr.puppeteer.launch({
      headless: true,
      args: ['--no-sandbox'],
      executablePath: pcr.executablePath,
    });
  } catch (error) {
    console.log(error);
    throw error;
  }
  // BUG FIX: close the browser even if a crawl step throws (resource leak).
  try {
    const page = await browser.newPage();
    await page.goto('https://www.jd.com/chanpin/270170.html');
    await page.waitForSelector('#J_goodsList');
    await page.waitFor(1000);
    await crawlPage(page);
    // Paginate until the "next" button is missing or disabled.
    while (true) {
      const hasNext = await page.evaluate(() => {
        if (!document.querySelector('.pn-next')) return false;
        return !document.querySelector('.pn-next.disabled');
      });
      if (!hasNext) break;
      await page.click('.pn-next');
      await page.waitFor(1000);
      await crawlPage(page);
    }
  } finally {
    await browser.close();
  }
};
// Entry point: run the crawl, report any failure, and always release the
// Crawlab SDK connection before the process exits.
(async () => {
  try {
    await main();
  } catch (err) {
    console.error(err);
  } finally {
    await crawlab.close();
  }
})();

View File

@@ -30,7 +30,6 @@ services:
# CRAWLAB_NOTIFICATION_MAIL_SENDERIDENTITY: admin@example.com # sender ID 发送者 ID
# CRAWLAB_NOTIFICATION_MAIL_SMTP_USER: username # SMTP username SMTP 用户名
# CRAWLAB_NOTIFICATION_MAIL_SMTP_PASSWORD: password # SMTP password SMTP 密码
# CRAWLAB_SERVER_LANG_NODE: "Y" # 预安装 Node.js 语言环境
ports:
- "8080:8080" # frontend port mapping 前端端口映射
depends_on: