modified task status update logic

Marvin Zhang
2019-03-12 13:10:24 +08:00
parent c532d0614d
commit 9180e00d94
6 changed files with 135 additions and 5 deletions

View File

@@ -34,9 +34,10 @@ class TaskApi(BaseApi):
_task = db_manager.get('tasks_celery', id=task['_id'])
_spider = db_manager.get('spiders', id=str(task['spider_id']))
if _task:
task['status'] = _task['status']
else:
task['status'] = TaskStatus.UNAVAILABLE
if not task.get('status'):
task['status'] = _task['status']
# else:
# task['status'] = TaskStatus.UNAVAILABLE
task['result'] = _task['result']
task['spider_name'] = _spider['name']
try:
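
The hunk above changes how a listed task gets its status: instead of always copying it from the Celery result record (and falling back to UNAVAILABLE when no record exists), the API now keeps whatever status is already stored on the task document and only falls back to the Celery record when that field is empty. A minimal standalone sketch of the resulting rule follows; resolve_status is a hypothetical helper and the local TaskStatus class merely stands in for the project's constants.

# --- illustrative sketch (not part of this commit) ---
# Standalone version of the new status-resolution rule; TaskStatus here is a
# stand-in for the constants referenced in the diff.
class TaskStatus:
    PENDING = 'PENDING'
    SUCCESS = 'SUCCESS'
    FAILURE = 'FAILURE'
    UNAVAILABLE = 'UNAVAILABLE'

def resolve_status(task, celery_task):
    # Keep the status already written onto the task document (the worker now
    # stamps it there); only fall back to the Celery record when it is missing.
    if task.get('status'):
        return task['status']
    if celery_task:
        return celery_task['status']
    # The old UNAVAILABLE fallback is commented out in the diff; the sketch keeps
    # it so something sensible is returned when no Celery record exists either.
    return TaskStatus.UNAVAILABLE

# A finished task keeps the status the worker wrote, regardless of the Celery record.
print(resolve_status({'status': TaskStatus.SUCCESS}, {'status': 'STARTED'}))  # SUCCESS
print(resolve_status({}, {'status': 'STARTED'}))                              # STARTED
# --- end sketch ---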

View File

@@ -5,6 +5,7 @@ from bson import ObjectId
from celery.utils.log import get_logger
from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER
from constants.task import TaskStatus
from db.manager import db_manager
from .celery import celery_app
import subprocess
@@ -44,6 +45,7 @@ def execute_spider(self, id: str):
'node_id': 'celery@%s' % hostname,
'hostname': hostname,
'log_file_path': log_file_path,
'status': TaskStatus.PENDING
})
# execute the command
@@ -61,9 +63,17 @@ def execute_spider(self, id: str):
# get output from the process
_stdout, _stderr = p.communicate()
# get return code
code = p.poll()
if code == 0:
status = TaskStatus.SUCCESS
else:
status = TaskStatus.FAILURE
# save task when the task is finished
db_manager.update_one('tasks', id=task_id, values={
'finish_ts': datetime.now(),
'status': status
})
task = db_manager.get('tasks', id=id)
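
Together, the hunks above move status tracking onto the task record itself: the worker stamps the task as PENDING when it starts executing the spider command, waits for the subprocess to exit, and then writes SUCCESS or FAILURE along with finish_ts based on the return code. The standalone sketch below illustrates that return-code mapping with plain subprocess calls and string constants; it leaves out the project's db_manager and Celery plumbing.

# --- illustrative sketch (not part of this commit) ---
# Return-code -> status mapping using plain subprocess; the status strings stand
# in for the TaskStatus constants and nothing is written to a database here.
import subprocess
from datetime import datetime

def run_spider_command(cmd):
    status = 'PENDING'                      # stamped on the task before it runs
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _stdout, _stderr = p.communicate()      # wait for the spider process to exit
    code = p.poll()                         # exit code decides the final status
    status = 'SUCCESS' if code == 0 else 'FAILURE'
    # In the real task these values go into db_manager.update_one('tasks', ...)
    return {'status': status, 'finish_ts': datetime.now()}

print(run_spider_command(['echo', 'done']))  # -> {'status': 'SUCCESS', ...}
print(run_spider_command(['false']))         # -> {'status': 'FAILURE', ...} (on Unix)
# --- end sketch ---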

spiders/segmentfault/package-lock.json (generated, new file, 81 lines added)
View File

@@ -0,0 +1,81 @@
{
"name": "segmentfault",
"version": "1.0.0",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
"bson": {
"version": "1.1.1",
"resolved": "http://registry.npm.taobao.org/bson/download/bson-1.1.1.tgz",
"integrity": "sha1-QzD16ZEExOdR5zUYWeLUCCefLxM="
},
"memory-pager": {
"version": "1.5.0",
"resolved": "http://registry.npm.taobao.org/memory-pager/download/memory-pager-1.5.0.tgz",
"integrity": "sha1-2HUWVdItOEaCdByXLyw9bfo+ZrU=",
"optional": true
},
"mongodb": {
"version": "3.1.13",
"resolved": "http://registry.npm.taobao.org/mongodb/download/mongodb-3.1.13.tgz",
"integrity": "sha1-+M3Ls2rXoItXC9EnHIUldT91+fQ=",
"requires": {
"mongodb-core": "3.1.11",
"safe-buffer": "^5.1.2"
}
},
"mongodb-core": {
"version": "3.1.11",
"resolved": "http://registry.npm.taobao.org/mongodb-core/download/mongodb-core-3.1.11.tgz",
"integrity": "sha1-slMDjbtNcynz0cLuVAC7DJIh/eU=",
"requires": {
"bson": "^1.1.0",
"require_optional": "^1.0.1",
"safe-buffer": "^5.1.2",
"saslprep": "^1.0.0"
}
},
"require_optional": {
"version": "1.0.1",
"resolved": "http://registry.npm.taobao.org/require_optional/download/require_optional-1.0.1.tgz",
"integrity": "sha1-TPNaQkf2TKPfjC7yCMxJSxyo/C4=",
"requires": {
"resolve-from": "^2.0.0",
"semver": "^5.1.0"
}
},
"resolve-from": {
"version": "2.0.0",
"resolved": "http://registry.npm.taobao.org/resolve-from/download/resolve-from-2.0.0.tgz",
"integrity": "sha1-lICrIOlP+h2egKgEx+oUdhGWa1c="
},
"safe-buffer": {
"version": "5.1.2",
"resolved": "http://registry.npm.taobao.org/safe-buffer/download/safe-buffer-5.1.2.tgz",
"integrity": "sha1-mR7GnSluAxN0fVm9/St0XDX4go0="
},
"saslprep": {
"version": "1.0.2",
"resolved": "http://registry.npm.taobao.org/saslprep/download/saslprep-1.0.2.tgz",
"integrity": "sha1-2lq5NubqC7rpEf/sd1NL43DJ9S0=",
"optional": true,
"requires": {
"sparse-bitfield": "^3.0.3"
}
},
"semver": {
"version": "5.6.0",
"resolved": "http://registry.npm.taobao.org/semver/download/semver-5.6.0.tgz",
"integrity": "sha1-fnQlb7qknHWqfHogXMInmcrIAAQ="
},
"sparse-bitfield": {
"version": "3.0.3",
"resolved": "http://registry.npm.taobao.org/sparse-bitfield/download/sparse-bitfield-3.0.3.tgz",
"integrity": "sha1-/0rm5oZWBWuks+eSqzM004JzyhE=",
"optional": true,
"requires": {
"memory-pager": "^1.0.2"
}
}
}
}

spiders/segmentfault/package.json (new file, 14 lines added)
View File

@@ -0,0 +1,14 @@
{
"name": "segmentfault",
"version": "1.0.0",
"description": "",
"main": "segmentfault_spider.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"mongodb": "^3.1.13"
}
}

Binary file not shown (before: 129 KiB, after: 144 KiB)

spiders/segmentfault/segmentfault_spider.js
View File

@@ -1,20 +1,27 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;
(async () => {
// browser
const browser = await (puppeteer.launch({
timeout: 15000
}));
// define start url
const url = 'https://segmentfault.com/newest';
// start a new page
const page = await browser.newPage();
// navigate to url
await page.goto(url);
await page.waitFor(2000);
// take a screenshot
await page.screenshot({path: 'screenshot.png'});
const titles = await page.evaluate(sel => {
// scrape data
const results = await page.evaluate(() => {
let results = [];
document.querySelectorAll('.news-list .news-item .news__item-title').forEach(el => {
results.push({
@@ -24,7 +31,24 @@ const puppeteer = require('puppeteer');
return results;
});
console.log(titles);
// open database connection
const client = await MongoClient.connect('mongodb://localhost/crawlab_test');
let db = await client.db('test');
const colName = process.env.CRAWLAB_COLLECTION;
const taskId = process.env.CRAWLAB_TASK_ID;
const col = db.collection(colName);
// save to database
await results.forEach(d => {
d.task_id = taskId;
col.save(d);
});
// close database connection
db.close();
console.log(results);
// shutdown browser
browser.close();
})();
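
The spider now reads its output collection name and task id from the CRAWLAB_COLLECTION and CRAWLAB_TASK_ID environment variables, stamps every scraped item with task_id, and saves the items to MongoDB; that task_id field is what ties spider results back to a task. Below is a standalone pymongo sketch of reading such results back by task id; the database name, collection name, and task id are placeholders rather than values from this commit.

# --- illustrative sketch (not part of this commit) ---
# Reading the spider's saved items back out of MongoDB by task id with pymongo.
# 'test', 'results_segmentfault' and the task id below are placeholder values.
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
db = client['test']                      # the spider above opens client.db('test')
col = db['results_segmentfault']         # placeholder for CRAWLAB_COLLECTION
task_id = 'example-task-id'              # placeholder for CRAWLAB_TASK_ID

# Every item was stamped with d.task_id = taskId before being saved, so a single
# filter returns exactly the documents produced by that task run.
for doc in col.find({'task_id': task_id}):
    print(doc)
# --- end sketch ---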