Merge pull request #130 from tikazyq/develop

Develop
This commit is contained in:
Marvin Zhang
2019-08-15 11:59:43 +08:00
committed by GitHub
21 changed files with 379 additions and 54 deletions

78
Dockerfile.master.apline Normal file
View File

@@ -0,0 +1,78 @@
# ---- Backend build stage: compile the Go backend ----
FROM golang:1.12-alpine AS backend-build
WORKDIR /go/src/app
COPY ./backend .
# key=value is the non-legacy ENV syntax
ENV GO111MODULE=on
ENV GOPROXY=https://mirrors.aliyun.com/goproxy/
RUN go install -v ./...

# ---- Frontend build stage: produce the static frontend bundle ----
FROM node:8.16.0-alpine AS frontend-build
ADD ./frontend /app
WORKDIR /app
# install frontend deps (via the taobao npm mirror) and build
RUN npm install -g yarn && yarn install --registry=https://registry.npm.taobao.org
RUN npm run build:prod

# ---- Runtime image ----
# NOTE(review): python:alpine is a floating tag; pin a concrete version
# (e.g. python:3.7-alpine) for reproducible builds.
FROM python:alpine
ADD . /app
# use the Aliyun apk mirror
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.aliyun.com/g' /etc/apk/repositories
# install C build deps, install Python packages, then remove the build deps
# in the SAME layer so the compilers never persist in the final image
RUN apk update && apk add --no-cache --virtual .build-deps \
        gcc \
        linux-headers \
        musl-dev \
        libffi-dev \
        libxml2-dev \
        libxslt-dev \
        openssl-dev \
    && pip install --no-cache-dir scrapy pymongo bs4 requests -i https://pypi.tuna.tsinghua.edu.cn/simple \
    && apk del .build-deps
# copy backend artifacts from the build stage
COPY --from=backend-build /go/src/app/conf ./conf
COPY --from=backend-build /go/bin/crawlab /usr/local/bin
# install nginx + openrc (runtime dependencies, kept in the image)
RUN apk add --no-cache nginx openrc
# copy frontend build output and its nginx site config
COPY --from=frontend-build /app/dist /app/dist
COPY --from=frontend-build /app/conf/crawlab.conf /etc/nginx/conf.d
# JSON array form requires double-quoted strings; the unquoted original was
# parsed as shell form and created volumes literally named "[" and "]"
VOLUME ["/sys/fs/cgroup"]
# make openrc usable inside a container (no real init, ttys or cgroup mounts)
RUN sed -i 's/#rc_sys=""/rc_sys="lxc"/g' /etc/rc.conf && \
    echo 'rc_provide="loopback net"' >> /etc/rc.conf && \
    sed -i 's/^#\(rc_logger="YES"\)$/\1/' /etc/rc.conf && \
    sed -i '/tty/d' /etc/inittab && \
    sed -i 's/hostname $opts/# hostname $opts/g' /etc/init.d/hostname && \
    sed -i 's/mount -t tmpfs/# mount -t tmpfs/g' /lib/rc/sh/init.sh && \
    sed -i 's/cgroup_add_service /# cgroup_add_service /g' /lib/rc/sh/openrc-run.sh && \
    rm -rf /var/cache/apk/* && \
    mkdir -p /run/openrc && \
    touch /run/openrc/softlevel && \
    /sbin/openrc
# working directory
WORKDIR /app/backend
# frontend port
EXPOSE 8080
# backend port
EXPOSE 8000
# start backend + nginx via the init script (exec/JSON form)
CMD ["/bin/sh", "/app/docker_init.sh"]

43
Dockerfile.worker.apline Normal file
View File

@@ -0,0 +1,43 @@
# ---- Backend build stage: compile the Go backend ----
FROM golang:1.12-alpine AS backend-build
WORKDIR /go/src/app
COPY ./backend .
# key=value is the non-legacy ENV syntax
ENV GO111MODULE=on
ENV GOPROXY=https://mirrors.aliyun.com/goproxy/
RUN go install -v ./...

# ---- Runtime image ----
# NOTE(review): python:alpine is a floating tag; pin a concrete version
# (e.g. python:3.7-alpine) for reproducible builds.
FROM python:alpine
ADD . /app
# use the Aliyun apk mirror
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.aliyun.com/g' /etc/apk/repositories
# install C build deps, install Python packages, then remove the build deps
# in the SAME layer so the compilers never persist in the final image
RUN apk update && apk add --no-cache --virtual .build-deps \
        gcc \
        linux-headers \
        musl-dev \
        libffi-dev \
        libxml2-dev \
        libxslt-dev \
        openssl-dev \
    && pip install --no-cache-dir scrapy pymongo bs4 requests -i https://pypi.tuna.tsinghua.edu.cn/simple \
    && apk del .build-deps
# copy backend artifacts from the build stage
COPY --from=backend-build /go/src/app/conf ./conf
COPY --from=backend-build /go/bin/crawlab /usr/local/bin
# working directory
WORKDIR /app/backend
# backend port
EXPOSE 8000
# exec (JSON) form so crawlab runs as PID 1 and receives stop signals;
# the original shell form ran it behind /bin/sh -c
CMD ["crawlab"]

View File

@@ -17,8 +17,8 @@
## 安装
三种方式:
1. [Docker](https://tikazyq.github.io/crawlab/Installation/Docker.html)(推荐)
2. [直接部署](https://tikazyq.github.io/crawlab/Installation/Direct.html)(了解内核)
1. [Docker](https://tikazyq.github.io/crawlab-docs/Installation/Docker.html)(推荐)
2. [直接部署](https://tikazyq.github.io/crawlab-docs/Installation/Direct.html)(了解内核)
### 要求Docker
- Docker 18.03+
@@ -87,11 +87,11 @@ services:
docker-compose up
```
Docker部署的详情请见[相关文档](https://tikazyq.github.io/crawlab/Installation/Docker.html)。
Docker部署的详情请见[相关文档](https://tikazyq.github.io/crawlab-docs/Installation/Docker.html)。
### 直接部署
请参考[相关文档](https://tikazyq.github.io/crawlab/Installation/Direct.html)。
请参考[相关文档](https://tikazyq.github.io/crawlab-docs/Installation/Direct.html)。
## 截图

View File

@@ -17,8 +17,8 @@ Golang-based distributed web crawler management platform, supporting various lan
## Installation
Two methods:
1. [Docker](https://tikazyq.github.io/crawlab/Installation/Docker.html) (Recommended)
2. [Direct Deploy](https://tikazyq.github.io/crawlab/Installation/Direct.html) (Check Internal Kernel)
1. [Docker](https://tikazyq.github.io/crawlab-docs/Installation/Docker.html) (Recommended)
2. [Direct Deploy](https://tikazyq.github.io/crawlab-docs/Installation/Direct.html) (Check Internal Kernel)
### Pre-requisite (Docker)
- Docker 18.03+
@@ -88,7 +88,7 @@ Then execute the command below, and Crawlab Master Node + MongoDB + Redis will s
docker-compose up
```
For Docker Deployment details, please refer to [relevant documentation](https://tikazyq.github.io/crawlab/Installation/Docker.html).
For Docker Deployment details, please refer to [relevant documentation](https://tikazyq.github.io/crawlab-docs/Installation/Docker.html).
## Screenshot

View File

@@ -20,6 +20,10 @@ server:
port: 8000
master: "N"
secret: "crawlab"
register:
# mac地址 或者 ip地址如果是ip则需要手动指定IP
type: "mac"
ip: ""
spider:
path: "/app/spiders"
task:

View File

@@ -5,12 +5,12 @@ import (
"crawlab/database"
"crawlab/lib/cron"
"crawlab/model"
"crawlab/services/register"
"encoding/json"
"fmt"
"github.com/apex/log"
"github.com/globalsign/mgo/bson"
"github.com/spf13/viper"
"net"
"runtime/debug"
"time"
)
@@ -49,43 +49,10 @@ const (
No = "N"
)
// 获取本机的IP地址
// TODO: 考虑多个IP地址的情况
func GetIp() (string, error) {
addrList, err := net.InterfaceAddrs()
if err != nil {
return "", err
}
for _, value := range addrList {
if ipNet, ok := value.(*net.IPNet); ok && !ipNet.IP.IsLoopback() {
if ipNet.IP.To4() != nil {
return ipNet.IP.String(), nil
}
}
}
return "", nil
}
// 获取本机的MAC地址
func GetMac() (string, error) {
interfaces, err := net.Interfaces()
if err != nil {
debug.PrintStack()
return "", err
}
for _, inter := range interfaces {
if inter.HardwareAddr != nil {
mac := inter.HardwareAddr.String()
return mac, nil
}
}
return "", nil
}
// 获取本机节点
func GetCurrentNode() (model.Node, error) {
// 获取本机MAC地址
mac, err := GetMac()
value, err := register.GetRegister().GetValue()
if err != nil {
debug.PrintStack()
return model.Node{}, err
@@ -101,14 +68,14 @@ func GetCurrentNode() (model.Node, error) {
}
// 尝试获取节点
node, err = model.GetNodeByMac(mac)
node, err = model.GetNodeByMac(value)
// 如果获取失败
if err != nil {
// 如果为主节点,表示为第一次注册,插入节点信息
if IsMaster() {
// 获取本机IP地址
ip, err := GetIp()
ip, err := register.GetRegister().GetIp()
if err != nil {
debug.PrintStack()
return model.Node{}, err
@@ -117,8 +84,8 @@ func GetCurrentNode() (model.Node, error) {
node = model.Node{
Id: bson.NewObjectId(),
Ip: ip,
Name: mac,
Mac: mac,
Name: value,
Mac: value,
IsMaster: true,
}
if err := node.Add(); err != nil {
@@ -155,12 +122,12 @@ func IsMasterNode(id string) bool {
// 获取节点数据
func GetNodeData() (Data, error) {
mac, err := GetMac()
val, err := register.GetRegister().GetValue()
if err != nil {
return Data{}, err
}
value, err := database.RedisClient.HGet("nodes", mac)
value, err := database.RedisClient.HGet("nodes", val)
data := Data{}
if err := json.Unmarshal([]byte(value), &data); err != nil {
return data, err
@@ -269,14 +236,14 @@ func UpdateNodeStatus() {
// 更新节点数据
func UpdateNodeData() {
// 获取MAC地址
mac, err := GetMac()
val, err := register.GetRegister().GetValue()
if err != nil {
log.Errorf(err.Error())
return
}
// 获取IP地址
ip, err := GetIp()
ip, err := register.GetRegister().GetIp()
if err != nil {
log.Errorf(err.Error())
return
@@ -284,7 +251,7 @@ func UpdateNodeData() {
// 构造节点数据
data := Data{
Mac: mac,
Mac: val,
Ip: ip,
Master: IsMaster(),
UpdateTs: time.Now(),
@@ -298,7 +265,7 @@ func UpdateNodeData() {
debug.PrintStack()
return
}
if err := database.RedisClient.HSet("nodes", mac, string(dataBytes)); err != nil {
if err := database.RedisClient.HSet("nodes", val, string(dataBytes)); err != nil {
log.Errorf(err.Error())
return
}

View File

@@ -0,0 +1,105 @@
package register
import (
"github.com/apex/log"
"github.com/spf13/viper"
"net"
"reflect"
"runtime/debug"
)
// Register abstracts how a node identifies itself in the cluster.
type Register interface {
	// GetKey returns the registration key type ("mac" or "ip").
	GetKey() string
	// GetValue returns the key's value, uniquely identifying this node.
	GetValue() (string, error)
	// GetIp returns the node's IP address.
	GetIp() (string, error)
}
// MacRegister identifies the node by its MAC address.
type MacRegister struct{}

// GetKey returns the registration key type.
func (mac *MacRegister) GetKey() string {
	return "mac"
}

// GetValue returns the MAC address of the first interface that has one,
// or "" with a nil error when no interface exposes a MAC.
func (mac *MacRegister) GetValue() (string, error) {
	interfaces, err := net.Interfaces()
	if err != nil {
		log.Errorf("get interfaces error:" + err.Error())
		debug.PrintStack()
		return "", err
	}
	for _, inter := range interfaces {
		// len(...) > 0 instead of != nil: loopback/virtual interfaces can
		// expose a zero-length (but non-nil) HardwareAddr, which the original
		// check accepted and returned as an empty MAC string
		if len(inter.HardwareAddr) > 0 {
			return inter.HardwareAddr.String(), nil
		}
	}
	return "", nil
}

// GetIp returns the host's first non-loopback IPv4 address.
func (mac *MacRegister) GetIp() (string, error) {
	return getIp()
}
// IpRegister identifies the node by a statically configured IP address
// (config key server.register.ip).
type IpRegister struct {
	Ip string
}

// GetKey returns the registration key type.
func (ip *IpRegister) GetKey() string {
	return "ip"
}

// GetValue returns the configured IP as the node's unique identifier.
func (ip *IpRegister) GetValue() (string, error) {
	return ip.Ip, nil
}

// GetIp returns the configured IP address.
func (ip *IpRegister) GetIp() (string, error) {
	return ip.Ip, nil
}
// getIp returns the host's first non-loopback IPv4 address, or "" with a
// nil error when no such address exists.
// TODO: handle hosts with multiple candidate IP addresses.
func getIp() (string, error) {
	addrs, err := net.InterfaceAddrs()
	if err != nil {
		return "", err
	}
	for _, addr := range addrs {
		// skip non-IP addresses and the loopback interface
		ipNet, ok := addr.(*net.IPNet)
		if !ok || ipNet.IP.IsLoopback() {
			continue
		}
		// only report IPv4 addresses
		if v4 := ipNet.IP.To4(); v4 != nil {
			return v4.String(), nil
		}
	}
	return "", nil
}
// register caches the process-wide Register singleton.
var register Register

// GetRegister lazily constructs and returns the node's Register, selected by
// the server.register.type config value: "mac" uses MAC-address registration,
// anything else uses IP registration. Returns nil when IP registration is
// selected but server.register.ip is not configured.
func GetRegister() Register {
	if register != nil {
		return register
	}
	switch viper.GetString("server.register.type") {
	case "mac":
		register = &MacRegister{}
	default:
		ip := viper.GetString("server.register.ip")
		if ip == "" {
			log.Error("server.register.ip is empty")
			debug.PrintStack()
			return nil
		}
		register = &IpRegister{Ip: ip}
	}
	log.Info("register type is :" + reflect.TypeOf(register).String())
	return register
}

View File

@@ -106,7 +106,7 @@ func DeCompress(srcFile *os.File, dstPath string) error {
}
// 创建新文件
newFile, err := os.Create(filepath.Join(dstPath, innerFile.Name))
newFile, err := os.OpenFile(filepath.Join(dstPath, innerFile.Name), os.O_RDWR|os.O_CREATE|os.O_TRUNC, info.Mode())
if err != nil {
log.Errorf("Unzip File Error : " + err.Error())
debug.PrintStack()

7
examples/README.md Normal file
View File

@@ -0,0 +1,7 @@
# Examples
* frontend 前端镜像制作
* master Master节点镜像制作
* worker Worker节点镜像制作
* run_docker_master.sh 运行Master节点示例脚本
* run_docker_worker.sh 运行Worker节点示例脚本

View File

@@ -0,0 +1,20 @@
# NOTE(review): alpine:latest is a floating tag; pin a specific release
# (e.g. alpine:3.10) for reproducible builds.
FROM alpine:latest
# install nginx (--no-cache avoids persisting the apk index in the layer)
RUN mkdir /run/nginx && apk add --no-cache nginx
# copy the pre-built frontend bundle
COPY dist /app/dist
# nginx site config for serving the bundle
COPY crawlab.conf /etc/nginx/conf.d
# entry script that patches the bundle with runtime configuration
COPY docker_init.sh /app/docker_init.sh
# default backend API address baked into the JS bundle (patched at startup);
# key=value is the non-legacy ENV syntax
ENV CRAWLAB_API_ADDRESS=http://localhost:8000
EXPOSE 8080
CMD ["/bin/sh", "/app/docker_init.sh"]

View File

@@ -0,0 +1,5 @@
# 前端镜像制作
前端需要手动build拿到编译后的文件然后放入此目录进行镜像构建。
容器运行的时候需要指定`CRAWLAB_API_ADDRESS`环境变量用于访问后端API接口

View File

@@ -0,0 +1,13 @@
# Serve the pre-built frontend bundle (SPA) with gzip compression enabled.
server {
    # gzip response compression
    gzip on;
    # only compress responses larger than 1 KB
    gzip_min_length 1k;
    gzip_buffers 4 16k;
    #gzip_http_version 1.0;
    # compression level 2: fast with a reasonable ratio
    gzip_comp_level 2;
    gzip_types text/plain application/javascript application/x-javascript text/css application/xml text/javascript application/x-httpd-php image/jpeg image/gif image/png;
    gzip_vary off;
    # old MSIE versions cannot handle gzip responses
    gzip_disable "MSIE [1-6]\.";

    # frontend port (matches EXPOSE 8080 in the Dockerfile)
    listen 8080;
    # location of the built frontend bundle
    root /app/dist;
    index index.html;
}

View File

@@ -0,0 +1,23 @@
#!/bin/sh
# Entry script for the frontend container: patch the built bundle with
# runtime configuration, then run nginx in the foreground.

# replace the default API address baked into the JS bundle
if [ -n "${CRAWLAB_API_ADDRESS}" ]; then
    jspath=`ls /app/dist/js/app.*.js`
    sed -i "s?localhost:8000?${CRAWLAB_API_ADDRESS}?g" ${jspath}
fi

# rewrite asset paths when the app is served under a base URL prefix
if [ -n "${CRAWLAB_BASE_URL}" ]; then
    indexpath=/app/dist/index.html
    sed -i "s?/js/?${CRAWLAB_BASE_URL}/js/?g" ${indexpath}
    sed -i "s?/css/?${CRAWLAB_BASE_URL}/css/?g" ${indexpath}
fi

# exec so nginx replaces the shell as PID 1 and receives container
# stop signals (the original left the shell as PID 1)
exec nginx -g 'daemon off;'

View File

@@ -0,0 +1,20 @@
# NOTE(review): alpine:latest is a floating tag; pin a specific release
# (e.g. alpine:3.10) for reproducible builds.
FROM alpine:latest
# working directory
WORKDIR /opt/crawlab
# configuration file
COPY config.yml /opt/crawlab/conf/config.yml
# pre-built crawlab binary (see README for the build command)
COPY crawlab /usr/local/bin
# create the spiders directory and make the binary executable
RUN mkdir -p /opt/crawlab/spiders && chmod +x /usr/local/bin/crawlab
# mark this node as the master; key=value is the non-legacy ENV syntax
ENV CRAWLAB_SERVER_MASTER=Y
EXPOSE 8000
CMD ["crawlab"]

View File

@@ -0,0 +1,8 @@
# Master 节点镜像制作
在Dockerfile里面的二进制包需要手动在源码目录下进行构建然后再放进来。
## Linux 二进制包构建
```
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o crawlab main.go
```

View File

@@ -0,0 +1,32 @@
# Example master-node configuration for crawlab.
# NOTE(review): the hosts and credentials below look like sample values —
# replace them for a real deployment.
api:
  address: "localhost:8000"
mongo:
  host: "192.168.235.26"
  port: 27017
  db: crawlab_local
  username: "root"
  password: "example"
  authSource: "admin"
redis:
  address: 192.168.235.0
  password: redis-1.0
  database: 29
  port: 16379
log:
  level: info
  path: "/logs/crawlab"
server:
  host: 0.0.0.0
  port: 8000
  master: "Y"
  secret: "crawlab"
  register:
    # "mac" or "ip"; when "ip", the IP must be set manually below
    type: "mac"
    ip: "192.168.0.104"
spider:
  path: "/spiders"
task:
  workers: 4
other:
  tmppath: "/tmp"

View File

@@ -1,4 +1,4 @@
# worker节点
# 本地开发环境worker节点制作
由于master和worker节点的存储信息是在redis上并且使用节点所在的mac地址作为key所以在开发本地需要启动master和worker节点会比较麻烦。
这里是一个运行worker节点的一个例子。