mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
@@ -15,7 +15,7 @@ WORKDIR /app
|
||||
|
||||
# install frontend
|
||||
RUN npm config set unsafe-perm true
|
||||
RUN npm install -g cnpm && cnpm install
|
||||
RUN npm install -g cnpm --registry=https://registry.npm.taobao.org && cnpm install
|
||||
|
||||
RUN npm run build:prod
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ export NVM_DIR="$([ -z "${XDG_CONFIG_HOME-}" ] && printf %s "${HOME}/.nvm" || pr
|
||||
[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh" # This loads nvm
|
||||
|
||||
# install Node.js v8.12
|
||||
export NVM_NODEJS_ORG_MIRROR=http://npm.taobao.org/mirrors/node
|
||||
nvm install 8.12
|
||||
|
||||
# create soft links
|
||||
@@ -16,7 +17,18 @@ ln -s $HOME/.nvm/versions/node/v8.12.0/bin/node /usr/local/bin/node
|
||||
# Point NODE_PATH at the global node_modules of the nvm-managed Node.js v8.12.0.
# The path separator after $HOME is required; "$HOME.nvm" would expand to a
# non-existent directory such as /root.nvm.
export NODE_PATH=$HOME/.nvm/versions/node/v8.12.0/lib/node_modules
|
||||
export PATH=$NODE_PATH:$PATH
|
||||
|
||||
# install chromium
|
||||
# See https://crbug.com/795759
|
||||
apt-get update && apt-get install -yq libgconf-2-4
|
||||
|
||||
# Install latest chrome dev package and fonts to support major
|
||||
# charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others)
|
||||
# Note: this installs the necessary libs to make the bundled version
|
||||
# of Chromium that Puppeteer
|
||||
# installs, work.
|
||||
apt-get update && apt-get install -y --no-install-recommends gconf-service libasound2 libatk1.0-0 libatk-bridge2.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget
|
||||
|
||||
# install default dependencies
|
||||
npm config set PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors
|
||||
npm install puppeteer -g --unsafe-perm=true
|
||||
npm install puppeteer-core puppeteer-chromium-resolver crawlab-sdk -g --ignore-scripts
|
||||
PUPPETEER_DOWNLOAD_HOST=https://npm.taobao.org/mirrors
|
||||
npm config set puppeteer_download_host=https://npm.taobao.org/mirrors
|
||||
npm install puppeteer-chromium-resolver crawlab-sdk -g --unsafe-perm=true --registry=https://registry.npm.taobao.org
|
||||
|
||||
@@ -18,11 +18,11 @@ import (
|
||||
"github.com/spf13/viper"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"runtime/debug"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
@@ -110,14 +110,11 @@ func AssignTask(task model.Task) error {
|
||||
func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exec.Cmd {
|
||||
// 默认把Node.js的全局node_modules加入环境变量
|
||||
envPath := os.Getenv("PATH")
|
||||
for _, _path := range strings.Split(envPath, ":") {
|
||||
if strings.Contains(_path, "/.nvm/versions/node/") {
|
||||
pathNodeModules := strings.Replace(_path, "/bin", "/lib/node_modules", -1)
|
||||
_ = os.Setenv("PATH", pathNodeModules+":"+envPath)
|
||||
_ = os.Setenv("NODE_PATH", pathNodeModules)
|
||||
break
|
||||
}
|
||||
}
|
||||
homePath := os.Getenv("HOME")
|
||||
nodeVersion := "v8.12.0"
|
||||
nodePath := path.Join(homePath, ".nvm/versions/node", nodeVersion, "lib/node_modules")
|
||||
_ = os.Setenv("PATH", nodePath+":"+envPath)
|
||||
_ = os.Setenv("NODE_PATH", nodePath)
|
||||
|
||||
// 默认环境变量
|
||||
cmd.Env = append(os.Environ(), "CRAWLAB_TASK_ID="+taskId)
|
||||
|
||||
5
backend/template/spiders/jd_mask/Spiderfile
Normal file
5
backend/template/spiders/jd_mask/Spiderfile
Normal file
@@ -0,0 +1,5 @@
|
||||
name: "jd_mask"
|
||||
display_name: "京东口罩 (Puppeteer)"
|
||||
col: "results_jd"
|
||||
type: "customized"
|
||||
cmd: "node jd_mask_spider.js"
|
||||
84
backend/template/spiders/jd_mask/jd_mask_spider.js
Normal file
84
backend/template/spiders/jd_mask/jd_mask_spider.js
Normal file
@@ -0,0 +1,84 @@
|
||||
const crawlab = require('crawlab-sdk');
|
||||
const PCR = require('puppeteer-chromium-resolver');
|
||||
|
||||
/**
 * Loads a product detail page and reports whether the product has stock.
 *
 * @param {object} page - Puppeteer page that is navigated to `url`.
 * @param {string} url - Address of the product detail page.
 * @returns {Promise<boolean>} `true` when the loaded page contains no
 *   `.J-notify-stock` element, `false` when it does.
 */
const crawlDetail = async (page, url) => {
  await page.goto(url);
  await page.waitForSelector('#choose-btns');
  // Short fixed pause after the buy buttons appear; presumably lets the
  // stock widgets finish rendering before they are inspected.
  await page.waitFor(500);

  // NOTE(review): `.J-notify-stock` looks like the "notify me when available"
  // control, so its absence is treated as "in stock" - confirm against the site.
  return page.evaluate(() => !document.querySelector('.J-notify-stock'));
};
|
||||
|
||||
/**
 * Scrapes every product tile on the listing currently shown in `page`,
 * checks each product's stock on its detail page and saves the result.
 *
 * Detail pages are opened in a separate tab so that `page` keeps showing the
 * listing; callers can therefore keep using it (e.g. to click "next page")
 * after this function returns.
 *
 * @param {object} page - Puppeteer page currently displaying a product listing.
 * @returns {Promise<void>} Resolves once every item has been saved.
 */
const crawlPage = async (page) => {
  // Runs inside the browser: collect title and link of each product tile.
  const items = await page.evaluate(() => {
    const found = [];
    document.querySelectorAll('.gl-item').forEach((el) => {
      const link = el.querySelector('.p-name > a');
      // Tiles without a product link cannot be crawled further; skip them
      // instead of throwing on a null anchor.
      if (!link) return;
      found.push({
        title: link.getAttribute('title'),
        // The listing's hrefs are treated as protocol-relative ("//host/path").
        url: `https:${link.getAttribute('href')}`,
      });
    });
    return found;
  });

  // Dedicated tab for detail pages; navigating `page` itself would replace
  // the listing and lose its pagination controls.
  const detailPage = await page.browser().newPage();
  try {
    for (const item of items) {
      item['has_stock'] = await crawlDetail(detailPage, item.url);
      await crawlab.saveItem(item);
    }
  } finally {
    await detailPage.close();
  }

  await page.waitFor(1000);
};
|
||||
|
||||
/**
 * Resolves a Chromium binary, opens the JD listing page and crawls it page by
 * page until the "next" control is missing or disabled.
 *
 * The browser is always closed before this function settles, including when
 * crawling fails. A failed browser launch rejects with the launch error itself.
 *
 * @returns {Promise<void>} Resolves when all listing pages have been crawled.
 */
const main = async () => {
  const pcr = await PCR({
    folderName: '.chromium-browser-snapshots',
    hosts: ['https://storage.googleapis.com', 'https://npm.taobao.org/mirrors'],
    retry: 3,
  });

  // No local catch here: swallowing a launch failure would leave `browser`
  // undefined and hide the real cause behind a TypeError on the next call.
  const browser = await pcr.puppeteer.launch({
    headless: true,
    args: ['--no-sandbox'],
    executablePath: pcr.executablePath,
  });

  try {
    const page = await browser.newPage();

    await page.goto('https://www.jd.com/chanpin/270170.html');
    await page.waitForSelector('#J_goodsList');
    await page.waitFor(1000);

    await crawlPage(page);

    // True while a "next page" control exists and is not marked disabled.
    const hasNextPage = () => page.evaluate(() => {
      const next = document.querySelector('.pn-next');
      return Boolean(next) && !document.querySelector('.pn-next.disabled');
    });

    while (await hasNextPage()) {
      await page.click('.pn-next');
      await page.waitFor(1000);
      await crawlPage(page);
    }
  } finally {
    // Release the Chromium process even when crawling throws.
    await browser.close();
  }
};
|
||||
|
||||
// Script entry point: run the spider, log any failure, and always close the
// Crawlab SDK handle afterwards.
(async () => {
  try {
    await main();
  } catch (e) {
    console.error(e);
    // Surface the failure through the exit status so the run is not reported
    // as successful; exitCode (rather than process.exit) lets cleanup finish.
    process.exitCode = 1;
  } finally {
    await crawlab.close();
  }
})();
|
||||
@@ -30,7 +30,6 @@ services:
|
||||
# CRAWLAB_NOTIFICATION_MAIL_SENDERIDENTITY: admin@example.com # sender ID 发送者 ID
|
||||
# CRAWLAB_NOTIFICATION_MAIL_SMTP_USER: username # SMTP username SMTP 用户名
|
||||
# CRAWLAB_NOTIFICATION_MAIL_SMTP_PASSWORD: password # SMTP password SMTP 密码
|
||||
# CRAWLAB_SERVER_LANG_NODE: "Y" # 预安装 Node.js 语言环境
|
||||
ports:
|
||||
- "8080:8080" # frontend port mapping 前端端口映射
|
||||
depends_on:
|
||||
|
||||
Reference in New Issue
Block a user