diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..30de240b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +.idea +logs +*.log \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..ec67bd6d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,53 @@ +# images +FROM ubuntu:latest + +# source files +ADD . /opt/crawlab + +# set as non-interactive +ENV DEBIAN_FRONTEND noninteractive + +# environment variables +ENV NVM_DIR /usr/local/nvm +ENV NODE_VERSION 8.12.0 +ENV WORK_DIR /opt/crawlab + +# install pkg +RUN apt-get update \ + && apt-get install -y curl git net-tools iputils-ping ntp gnupg2 nginx redis python python3 python3-pip \ + && apt-get clean \ + && cp $WORK_DIR/crawlab.conf /etc/nginx/conf.d \ + && ln -s /usr/bin/pip3 /usr/local/bin/pip \ + && ln -s /usr/bin/python3 /usr/local/bin/python + +# install mongodb +RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 9DA31620334BD75D9DCB49F368818C72E52529D4 \ + && echo "deb [ arch=amd64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/4.0 multiverse" | tee /etc/apt/sources.list.d/mongodb-org-4.0.list \ + && apt-get update \ + && apt-get install -y mongodb-org \ + && apt-get clean \ + && mkdir -p /data/db + +# install nvm +RUN curl https://raw.githubusercontent.com/creationix/nvm/v0.24.0/install.sh | bash \ + && . $NVM_DIR/nvm.sh \ + && nvm install v$NODE_VERSION \ + && nvm use v$NODE_VERSION \ + && nvm alias default v$NODE_VERSION +ENV NODE_PATH $NVM_DIR/versions/node/v$NODE_VERSION/lib/node_modules +ENV PATH $NVM_DIR/versions/node/v$NODE_VERSION/bin:$PATH + +# install frontend +RUN npm install -g yarn pm2 --registry=https://registry.npm.taobao.org \ + && cd /opt/crawlab/frontend \ + && yarn install --registry=https://registry.npm.taobao.org + +# install backend +RUN pip install -U setuptools -i https://pypi.tuna.tsinghua.edu.cn/simple \ + && pip install -r /opt/crawlab/crawlab/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple + +# start backend +EXPOSE 8080 +EXPOSE 8000 +WORKDIR /opt/crawlab +ENTRYPOINT ["/bin/sh", "/opt/crawlab/docker_init.sh"] \ No newline at end of file diff --git a/crawlab.conf b/crawlab.conf new file mode 100644 index 00000000..f0b7cef2 --- /dev/null +++ b/crawlab.conf @@ -0,0 +1,5 @@ +server { + listen 8080; + root /opt/crawlab/frontend/dist; + index index.html; +} \ No newline at end of file diff --git a/crawlab/Dockerfile b/crawlab/Dockerfile deleted file mode 100644 index b286d956..00000000 --- a/crawlab/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -# images -#FROM python:latest -FROM ubuntu:latest - -# source files -ADD . /opt/crawlab - -# add dns -RUN cat /etc/resolv.conf - -# install python -RUN apt-get update -RUN apt-get install -y python3 python3-pip net-tools iputils-ping vim ntp - -# soft link -RUN ln -s /usr/bin/pip3 /usr/local/bin/pip -RUN ln -s /usr/bin/python3 /usr/local/bin/python - -# install required libraries -RUN pip install -U setuptools -RUN pip install -r /opt/crawlab/requirements.txt - -# execute apps -WORKDIR /opt/crawlab -CMD python ./bin/run_worker.py -CMD python app.py diff --git a/crawlab/config/config.py b/crawlab/config/config.py index e235d400..da185c57 100644 --- a/crawlab/config/config.py +++ b/crawlab/config/config.py @@ -5,15 +5,11 @@ BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__fil # 爬虫源码路径 PROJECT_SOURCE_FILE_FOLDER = os.path.join(BASE_DIR, "spiders") -# 配置python虚拟环境的路径 -PYTHON_ENV_PATH = '/Users/yeqing/.pyenv/shims/python' - # 爬虫部署路径 -# PROJECT_DEPLOY_FILE_FOLDER = '../deployfile' PROJECT_DEPLOY_FILE_FOLDER = '/var/crawlab' # 爬虫日志路径 -PROJECT_LOGS_FOLDER = '../deployfile/logs' +PROJECT_LOGS_FOLDER = '/var/log/crawlab' # 打包临时文件夹 PROJECT_TMP_FOLDER = '/tmp' @@ -36,11 +32,6 @@ CELERY_TIMEZONE = 'Asia/Shanghai' # 是否启用UTC CELERY_ENABLE_UTC = True -# Celery Scheduler Redis URL -CELERY_BEAT_SCHEDULER = 'utils.redisbeat.RedisScheduler' -CELERY_REDIS_SCHEDULER_URL = 'redis://localhost:6379' -CELERY_REDIS_SCHEDULER_KEY = 'celery:beat:order_tasks' - # flower variables FLOWER_API_ENDPOINT = 'http://localhost:5555/api' diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py index 91ab8608..2cc39fea 100644 --- a/crawlab/routes/spiders.py +++ b/crawlab/routes/spiders.py @@ -509,7 +509,10 @@ class SpiderApi(BaseApi): }, r.status_code # get html parse tree - sel = etree.HTML(r.content) + try: + sel = etree.HTML(r.content.decode('utf-8')) + except Exception as err: + sel = etree.HTML(r.content) # remove unnecessary tags unnecessary_tags = [ @@ -550,6 +553,7 @@ class SpiderApi(BaseApi): '下页', 'next page', 'next', + '>' ] for tag in sel.iter(): if tag.text is not None and tag.text.lower().strip() in next_page_text_list: @@ -654,19 +658,24 @@ class SpiderApi(BaseApi): # get list item selector item_selector = None + item_selector_type = 'css' if max_tag.get('id') is not None: item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}' elif max_tag.get('class') is not None: cls_str = '.'.join([x for x in max_tag.get("class").split(' ') if x != '']) if len(sel.cssselect(f'.{cls_str}')) == 1: item_selector = f'.{cls_str} > {self._get_children(max_tag)[0].tag}' + else: + item_selector = max_tag.getroottree().getpath(max_tag) + item_selector_type = 'xpath' # get list fields fields = [] if item_selector is not None: first_tag = self._get_children(max_tag)[0] for i, tag in enumerate(self._get_text_child_tags(first_tag)): - if len(first_tag.cssselect(f'{tag.tag}')) == 1: + el_list = first_tag.cssselect(f'{tag.tag}') + if len(el_list) == 1: fields.append({ 'name': f'field{i + 1}', 'type': 'css', @@ -682,6 +691,15 @@ class SpiderApi(BaseApi): 'extract_type': 'text', 'query': f'{tag.tag}.{cls_str}', }) + else: + for j, el in enumerate(el_list): + if tag == el: + fields.append({ + 'name': f'field{i + 1}', + 'type': 'css', + 'extract_type': 'text', + 'query': f'{tag.tag}:nth-of-type({j + 1})', + }) for i, tag in enumerate(self._get_a_child_tags(self._get_children(max_tag)[0])): # if the tag is , extract its href @@ -707,6 +725,7 @@ class SpiderApi(BaseApi): return { 'status': 'ok', 'item_selector': item_selector, + 'item_selector_type': item_selector_type, 'pagination_selector': pagination_selector, 'fields': fields } diff --git a/crawlab/spiders/spiders/spiders/config_spider.py b/crawlab/spiders/spiders/spiders/config_spider.py index 13fa82bf..77e65862 100644 --- a/crawlab/spiders/spiders/spiders/config_spider.py +++ b/crawlab/spiders/spiders/spiders/config_spider.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import os import sys -from urllib.parse import urlparse +from urllib.parse import urlparse, urljoin import scrapy @@ -72,8 +72,8 @@ def get_next_url(response): # found next url if next_url is not None: if not next_url.startswith('http') and not next_url.startswith('//'): - u = urlparse(response.url) - next_url = f'{u.scheme}://{u.netloc}{next_url}' + return urljoin(response.url, next_url) + else: return next_url return None diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py index 3bdc65bc..a568dc0f 100644 --- a/crawlab/tasks/spider.py +++ b/crawlab/tasks/spider.py @@ -6,7 +6,7 @@ from time import sleep from bson import ObjectId from pymongo import ASCENDING, DESCENDING -from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER, PYTHON_ENV_PATH, MONGO_HOST, MONGO_PORT, MONGO_DB +from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER, MONGO_HOST, MONGO_PORT, MONGO_DB from constants.task import TaskStatus from db.manager import db_manager from .celery import celery_app diff --git a/crawlab/docker-compose.yml b/docker-compose.yml similarity index 86% rename from crawlab/docker-compose.yml rename to docker-compose.yml index fe2bfb78..088fdbb8 100644 --- a/crawlab/docker-compose.yml +++ b/docker-compose.yml @@ -1,11 +1,11 @@ version: '3.3' # 表示该 Docker-Compose 文件使用的是 Version 2 file services: - web: # 指定服务名称 + app: # 指定服务名称 build: . # 指定 Dockerfile 所在路径 ports: # 指定端口映射 - "5001:5000" task: - image: crawlab:v3 + image: crawlab:latest db: image: mongo restart: always diff --git a/docker_init.sh b/docker_init.sh new file mode 100755 index 00000000..59208c8e --- /dev/null +++ b/docker_init.sh @@ -0,0 +1,17 @@ +#!/bin/sh +case $1 in + master) + cd /opt/crawlab/frontend \ + && npm run build:prod \ + && service nginx start \ + && mongod --fork --logpath /var/log/mongod.log + redis-server >> /var/log/redis-server.log 2>&1 & + python $WORK_DIR/crawlab/flower.py >> /opt/crawlab/flower.log 2>&1 & + python $WORK_DIR/crawlab/worker.py >> /opt/crawlab/worker.log 2>&1 & + python $WORK_DIR/crawlab/app.py + ;; + worker) + python $WORK_DIR/crawlab/app.py >> /opt/crawlab/app.log 2>&1 & + python $WORK_DIR/crawlab/worker.py + ;; +esac \ No newline at end of file diff --git a/frontend/Dockerfile b/frontend/Dockerfile new file mode 100644 index 00000000..bfcc0d49 --- /dev/null +++ b/frontend/Dockerfile @@ -0,0 +1,36 @@ +# images +FROM node:8.12 + +# source files +ADD . /opt/crawlab/frontend + +#更新apt-get源 使用163的源 +#RUN mv /etc/apt/sources.list /etc/apt/sources.list.bak +#COPY sources.list /etc/apt/sources.list + +# environment variables +#ENV NVM_DIR /usr/local/nvm +#ENV NODE_VERSION 8.12.0 +#ENV WORK_DIR /opt/crawlab/frontend + +# install git curl +RUN apt-get update && apt-get install -y nginx +#RUN apt-get install -y git curl + +# install nvm +#RUN curl https://raw.githubusercontent.com/creationix/nvm/v0.24.0/install.sh | bash \ +# && . $NVM_DIR/nvm.sh \ +# && nvm install v$NODE_VERSION \ +# && nvm use v$NODE_VERSION \ +# && nvm alias default v$NODE_VERSION +#ENV NODE_PATH $NVM_DIR/versions/node/v$NODE_VERSION/lib/node_modules +#ENV PATH $NVM_DIR/versions/node/v$NODE_VERSION/bin:$PATH + +# install frontend +RUN npm install -g yarn pm2 --registry=https://registry.npm.taobao.org +RUN cd /opt/crawlab/frontend && yarn install --registry=https://registry.npm.taobao.org + +# nginx config & start frontend +RUN cp $WORK_DIR/conf/crawlab.conf /etc/nginx/conf.d && service nginx reload + +CMD ["npm", "run", "build:prod"] diff --git a/frontend/conf/crawlab.conf b/frontend/conf/crawlab.conf new file mode 100644 index 00000000..da8513f3 --- /dev/null +++ b/frontend/conf/crawlab.conf @@ -0,0 +1,5 @@ +server { + listen 8080; + root /opt/crawlab/frontend/dist; + index index.html; +} diff --git a/frontend/conf/nginx.conf b/frontend/conf/nginx.conf index 72c248d2..56e4cd80 100644 --- a/frontend/conf/nginx.conf +++ b/frontend/conf/nginx.conf @@ -8,8 +8,8 @@ http { default_type application/octet-stream; server { - listen 8888; - root /Users/yeqing/projects/crawlab-frontend/dist; + listen 8080; + root /opt/dist; index index.html; location ~ .*\.(js|css)?$ { diff --git a/frontend/package.json b/frontend/package.json index e751c9f9..42bcf078 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -27,6 +27,7 @@ "vue-codemirror-lite": "^1.0.4", "vue-i18n": "^8.9.0", "vue-router": "^3.0.1", + "vue-virtual-scroll-list": "^1.3.9", "vuex": "^3.0.1" }, "devDependencies": { diff --git a/frontend/sources.list b/frontend/sources.list new file mode 100644 index 00000000..6ff85f91 --- /dev/null +++ b/frontend/sources.list @@ -0,0 +1,4 @@ +deb http://mirrors.aliyun.com/debian/ jessie main non-free contrib +deb http://mirrors.aliyun.com/debian/ jessie-proposed-updates main non-free contrib +deb-src http://mirrors.aliyun.com/debian/ jessie main non-free contrib +deb-src http://mirrors.aliyun.com/debian/ jessie-proposed-updates main non-free contrib diff --git a/frontend/src/components/Charts/keyboard.vue b/frontend/src/components/Charts/keyboard.vue deleted file mode 100644 index 857b26ae..00000000 --- a/frontend/src/components/Charts/keyboard.vue +++ /dev/null @@ -1,156 +0,0 @@ - - - diff --git a/frontend/src/components/Charts/lineMarker.vue b/frontend/src/components/Charts/lineMarker.vue deleted file mode 100644 index 5d41b022..00000000 --- a/frontend/src/components/Charts/lineMarker.vue +++ /dev/null @@ -1,227 +0,0 @@ - - - diff --git a/frontend/src/components/Charts/mixChart.vue b/frontend/src/components/Charts/mixChart.vue deleted file mode 100644 index a8e271ca..00000000 --- a/frontend/src/components/Charts/mixChart.vue +++ /dev/null @@ -1,271 +0,0 @@ - - - diff --git a/frontend/src/components/Charts/mixins/resize.js b/frontend/src/components/Charts/mixins/resize.js deleted file mode 100644 index c4c432f4..00000000 --- a/frontend/src/components/Charts/mixins/resize.js +++ /dev/null @@ -1,32 +0,0 @@ -import { debounce } from '@/utils' - -export default { - data() { - return { - sidebarElm: null - } - }, - mounted() { - this.__resizeHandler = debounce(() => { - if (this.chart) { - this.chart.resize() - } - }, 100) - window.addEventListener('resize', this.__resizeHandler) - - this.sidebarElm = document.getElementsByClassName('sidebar-container')[0] - this.sidebarElm && this.sidebarElm.addEventListener('transitionend', this.sidebarResizeHandler) - }, - beforeDestroy() { - window.removeEventListener('resize', this.__resizeHandler) - - this.sidebarElm && this.sidebarElm.removeEventListener('transitionend', this.sidebarResizeHandler) - }, - methods: { - sidebarResizeHandler(e) { - if (e.propertyName === 'width') { - this.__resizeHandler() - } - } - } -} diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue index 0362b72f..56dc01c0 100644 --- a/frontend/src/components/Config/ConfigList.vue +++ b/frontend/src/components/Config/ConfigList.vue @@ -255,7 +255,9 @@ export default { .then(response => { if (response.data.item_selector) { this.$set(this.spiderForm, 'item_selector', response.data.item_selector) - this.$set(this.spiderForm, 'item_selector_type', 'css') + } + if (response.data.item_selector_type) { + this.$set(this.spiderForm, 'item_selector_type', response.data.item_selector_type) } if (response.data.fields && response.data.fields.length) { diff --git a/frontend/src/components/DragSelect/index.vue b/frontend/src/components/DragSelect/index.vue deleted file mode 100644 index 513be006..00000000 --- a/frontend/src/components/DragSelect/index.vue +++ /dev/null @@ -1,61 +0,0 @@ - - - - - diff --git a/frontend/src/components/Dropzone/index.vue b/frontend/src/components/Dropzone/index.vue deleted file mode 100644 index 15d811d9..00000000 --- a/frontend/src/components/Dropzone/index.vue +++ /dev/null @@ -1,297 +0,0 @@ - - - - - diff --git a/frontend/src/components/ScrollView/LogItem.vue b/frontend/src/components/ScrollView/LogItem.vue new file mode 100644 index 00000000..4a5aa168 --- /dev/null +++ b/frontend/src/components/ScrollView/LogItem.vue @@ -0,0 +1,39 @@ + + + + + diff --git a/frontend/src/components/ScrollView/LogView.vue b/frontend/src/components/ScrollView/LogView.vue new file mode 100644 index 00000000..75eff8be --- /dev/null +++ b/frontend/src/components/ScrollView/LogView.vue @@ -0,0 +1,78 @@ + + + + + diff --git a/frontend/src/router/index.js b/frontend/src/router/index.js index 0bfae74f..e03e9f49 100644 --- a/frontend/src/router/index.js +++ b/frontend/src/router/index.js @@ -159,6 +159,7 @@ export const constantRouterMap = [ name: 'Deploy', path: '/deploys', component: Layout, + hidden: true, meta: { title: 'Deploy', icon: 'fa fa-cloud' diff --git a/frontend/src/views/task/TaskDetail.vue b/frontend/src/views/task/TaskDetail.vue index 92198773..053ef963 100644 --- a/frontend/src/views/task/TaskDetail.vue +++ b/frontend/src/views/task/TaskDetail.vue @@ -7,11 +7,7 @@ -
-
-              {{taskLog}}
-            
-
+
@@ -37,10 +33,12 @@ import { } from 'vuex' import TaskOverview from '../../components/Overview/TaskOverview' import GeneralTableView from '../../components/TableView/GeneralTableView' +import LogView from '../../components/ScrollView/LogView' export default { name: 'TaskDetail', components: { + LogView, GeneralTableView, TaskOverview }, diff --git a/frontend/yarn.lock b/frontend/yarn.lock index 75d4b025..2d266229 100644 --- a/frontend/yarn.lock +++ b/frontend/yarn.lock @@ -8493,6 +8493,11 @@ vue-template-es2015-compiler@^1.6.0, vue-template-es2015-compiler@^1.8.2: version "1.8.2" resolved "http://registry.npm.taobao.org/vue-template-es2015-compiler/download/vue-template-es2015-compiler-1.8.2.tgz#dd73e80ba58bb65dd7a8aa2aeef6089cf6116f2a" +vue-virtual-scroll-list@^1.3.9: + version "1.3.9" + resolved "https://registry.npm.taobao.org/vue-virtual-scroll-list/download/vue-virtual-scroll-list-1.3.9.tgz#ba3ce6425374fb323ea83ab33daa2727117808ed" + integrity sha1-ujzmQlN0+zI+qDqzPaonJxF4CO0= + vue@^2.3.3: version "2.6.10" resolved "https://registry.npm.taobao.org/vue/download/vue-2.6.10.tgz#a72b1a42a4d82a721ea438d1b6bf55e66195c637"