diff --git a/crawlab/routes/tasks.py b/crawlab/routes/tasks.py
index 17e5df6d..175ba857 100644
--- a/crawlab/routes/tasks.py
+++ b/crawlab/routes/tasks.py
@@ -34,9 +34,10 @@ class TaskApi(BaseApi):
             _task = db_manager.get('tasks_celery', id=task['_id'])
             _spider = db_manager.get('spiders', id=str(task['spider_id']))
             if _task:
-                task['status'] = _task['status']
-            else:
-                task['status'] = TaskStatus.UNAVAILABLE
+                if not task.get('status'):
+                    task['status'] = _task['status']
+                # else:
+                #     task['status'] = TaskStatus.UNAVAILABLE
             task['result'] = _task['result']
             task['spider_name'] = _spider['name']
             try:
diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py
index acc8c1b5..feef3003 100644
--- a/crawlab/tasks/spider.py
+++ b/crawlab/tasks/spider.py
@@ -5,6 +5,7 @@ from bson import ObjectId
 from celery.utils.log import get_logger
 
 from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER
+from constants.task import TaskStatus
 from db.manager import db_manager
 from .celery import celery_app
 import subprocess
@@ -44,6 +45,7 @@ def execute_spider(self, id: str):
         'node_id': 'celery@%s' % hostname,
         'hostname': hostname,
         'log_file_path': log_file_path,
+        'status': TaskStatus.PENDING
     })
 
     # execute the command
@@ -61,9 +63,17 @@ def execute_spider(self, id: str):
     # get output from the process
     _stdout, _stderr = p.communicate()
 
+    # get return code
+    code = p.poll()
+    if code == 0:
+        status = TaskStatus.SUCCESS
+    else:
+        status = TaskStatus.FAILURE
+
     # save task when the task is finished
     db_manager.update_one('tasks', id=task_id, values={
         'finish_ts': datetime.now(),
+        'status': status
     })
 
     task = db_manager.get('tasks', id=id)
diff --git a/spiders/segmentfault/package-lock.json b/spiders/segmentfault/package-lock.json
new file mode 100644
index 00000000..15a5cdf7
--- /dev/null
+++ b/spiders/segmentfault/package-lock.json
@@ -0,0 +1,81 @@
+{
+  "name": "segmentfault",
+  "version": "1.0.0",
+  "lockfileVersion": 1,
+  "requires": true,
+  "dependencies": {
+    "bson": {
+      "version": "1.1.1",
+      "resolved": "http://registry.npm.taobao.org/bson/download/bson-1.1.1.tgz",
+      "integrity": "sha1-QzD16ZEExOdR5zUYWeLUCCefLxM="
+    },
+    "memory-pager": {
+      "version": "1.5.0",
+      "resolved": "http://registry.npm.taobao.org/memory-pager/download/memory-pager-1.5.0.tgz",
+      "integrity": "sha1-2HUWVdItOEaCdByXLyw9bfo+ZrU=",
+      "optional": true
+    },
+    "mongodb": {
+      "version": "3.1.13",
+      "resolved": "http://registry.npm.taobao.org/mongodb/download/mongodb-3.1.13.tgz",
+      "integrity": "sha1-+M3Ls2rXoItXC9EnHIUldT91+fQ=",
+      "requires": {
+        "mongodb-core": "3.1.11",
+        "safe-buffer": "^5.1.2"
+      }
+    },
+    "mongodb-core": {
+      "version": "3.1.11",
+      "resolved": "http://registry.npm.taobao.org/mongodb-core/download/mongodb-core-3.1.11.tgz",
+      "integrity": "sha1-slMDjbtNcynz0cLuVAC7DJIh/eU=",
+      "requires": {
+        "bson": "^1.1.0",
+        "require_optional": "^1.0.1",
+        "safe-buffer": "^5.1.2",
+        "saslprep": "^1.0.0"
+      }
+    },
+    "require_optional": {
+      "version": "1.0.1",
+      "resolved": "http://registry.npm.taobao.org/require_optional/download/require_optional-1.0.1.tgz",
+      "integrity": "sha1-TPNaQkf2TKPfjC7yCMxJSxyo/C4=",
+      "requires": {
+        "resolve-from": "^2.0.0",
+        "semver": "^5.1.0"
+      }
+    },
+    "resolve-from": {
+      "version": "2.0.0",
+      "resolved": "http://registry.npm.taobao.org/resolve-from/download/resolve-from-2.0.0.tgz",
+      "integrity": "sha1-lICrIOlP+h2egKgEx+oUdhGWa1c="
+    },
+    "safe-buffer": {
+      "version": "5.1.2",
+      "resolved": "http://registry.npm.taobao.org/safe-buffer/download/safe-buffer-5.1.2.tgz",
+      "integrity": "sha1-mR7GnSluAxN0fVm9/St0XDX4go0="
"sha1-mR7GnSluAxN0fVm9/St0XDX4go0=" + }, + "saslprep": { + "version": "1.0.2", + "resolved": "http://registry.npm.taobao.org/saslprep/download/saslprep-1.0.2.tgz", + "integrity": "sha1-2lq5NubqC7rpEf/sd1NL43DJ9S0=", + "optional": true, + "requires": { + "sparse-bitfield": "^3.0.3" + } + }, + "semver": { + "version": "5.6.0", + "resolved": "http://registry.npm.taobao.org/semver/download/semver-5.6.0.tgz", + "integrity": "sha1-fnQlb7qknHWqfHogXMInmcrIAAQ=" + }, + "sparse-bitfield": { + "version": "3.0.3", + "resolved": "http://registry.npm.taobao.org/sparse-bitfield/download/sparse-bitfield-3.0.3.tgz", + "integrity": "sha1-/0rm5oZWBWuks+eSqzM004JzyhE=", + "optional": true, + "requires": { + "memory-pager": "^1.0.2" + } + } + } +} diff --git a/spiders/segmentfault/package.json b/spiders/segmentfault/package.json new file mode 100644 index 00000000..e1686a26 --- /dev/null +++ b/spiders/segmentfault/package.json @@ -0,0 +1,14 @@ +{ + "name": "segmentfault", + "version": "1.0.0", + "description": "", + "main": "segmentfault_spider.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "author": "", + "license": "ISC", + "dependencies": { + "mongodb": "^3.1.13" + } +} diff --git a/spiders/segmentfault/screenshot.png b/spiders/segmentfault/screenshot.png index c26747bd..afb4517d 100644 Binary files a/spiders/segmentfault/screenshot.png and b/spiders/segmentfault/screenshot.png differ diff --git a/spiders/segmentfault/segmentfault_spider.js b/spiders/segmentfault/segmentfault_spider.js index 9fbc20dc..a6c7b5fe 100644 --- a/spiders/segmentfault/segmentfault_spider.js +++ b/spiders/segmentfault/segmentfault_spider.js @@ -1,20 +1,27 @@ const puppeteer = require('puppeteer'); +const MongoClient = require('mongodb').MongoClient; (async () => { + // browser const browser = await (puppeteer.launch({ timeout: 15000 })); + // define start url const url = 'https://segmentfault.com/newest'; + // start a new page const page = await browser.newPage(); + // navigate to url await page.goto(url); await page.waitFor(2000); + // take a screenshot await page.screenshot({path: 'screenshot.png'}); - const titles = await page.evaluate(sel => { + // scrape data + const results = await page.evaluate(() => { let results = []; document.querySelectorAll('.news-list .news-item .news__item-title').forEach(el => { results.push({ @@ -24,7 +31,24 @@ const puppeteer = require('puppeteer'); return results; }); - console.log(titles); + // open database connection + const client = await MongoClient.connect('mongodb://localhost/crawlab_test'); + let db = await client.db('test'); + const colName = process.env.CRAWLAB_COLLECTION; + const taskId = process.env.CRAWLAB_TASK_ID; + const col = db.collection(colName); + // save to database + await results.forEach(d => { + d.task_id = taskId; + col.save(d); + }); + + // close database connection + db.close(); + + console.log(results); + + // shutdown browser browser.close(); })(); \ No newline at end of file