Changed how tasks are saved to the database

This commit is contained in:
Marvin Zhang
2019-03-13 21:33:47 +08:00
parent e7ec1259a8
commit 6ed8a449ce
9 changed files with 59 additions and 34 deletions

View File

@@ -155,6 +155,14 @@ class SpiderApi(BaseApi):
def on_crawl(self, id):
job = execute_spider.delay(id)
# create a new task
db_manager.save('tasks', {
'_id': job.id,
'spider_id': ObjectId(id),
'create_ts': datetime.now(),
'status': TaskStatus.PENDING
})
return {
'code': 200,
'status': 'ok',
@@ -267,7 +275,7 @@ class SpiderApi(BaseApi):
}
def get_tasks(self, id):
items = db_manager.list('tasks', cond={'spider_id': ObjectId(id)}, limit=10, sort_key='finish_ts')
items = db_manager.list('tasks', cond={'spider_id': ObjectId(id)}, limit=10, sort_key='create_ts')
for item in items:
spider_id = item['spider_id']
spider = db_manager.get('spiders', id=str(spider_id))

View File

@@ -49,14 +49,12 @@ class TaskApi(BaseApi):
args = self.parser.parse_args()
page_size = args.get('page_size') or 10
page_num = args.get('page_num') or 1
tasks = db_manager.list('tasks', {}, limit=page_size, skip=page_size * (page_num - 1), sort_key='finish_ts')
tasks = db_manager.list('tasks', {}, limit=page_size, skip=page_size * (page_num - 1), sort_key='create_ts')
items = []
for task in tasks:
_task = db_manager.get('tasks_celery', id=task['_id'])
# _task = db_manager.get('tasks_celery', id=task['_id'])
_spider = db_manager.get('spiders', id=str(task['spider_id']))
if _task:
task['status'] = _task['status']
else:
if task.get('status') is None:
task['status'] = TaskStatus.UNAVAILABLE
task['spider_name'] = _spider['name']
items.append(task)

View File

@@ -38,17 +38,15 @@ def execute_spider(self, id: str):
stderr = open(log_file_path, 'a')
# create a new task
db_manager.save('tasks', {
'_id': task_id,
'spider_id': ObjectId(id),
'create_ts': datetime.now(),
db_manager.update_one('tasks', id=task_id, values={
'start_ts': datetime.now(),
'node_id': 'celery@%s' % hostname,
'hostname': hostname,
'log_file_path': log_file_path,
'status': TaskStatus.PENDING
'status': TaskStatus.STARTED
})
# execute the command
# start the process and pass params as env variables
env = os.environ.copy()
env['CRAWLAB_TASK_ID'] = task_id
if spider.get('col'):

View File

@@ -20,7 +20,7 @@
width="100">
<template slot-scope="scope">
<el-tag type="success" v-if="scope.row.status === 'SUCCESS'">SUCCESS</el-tag>
<el-tag type="warning" v-else-if="scope.row.status === 'PENDING'">PENDING</el-tag>
<el-tag type="warning" v-else-if="scope.row.status === 'STARTED'">STARTED</el-tag>
<el-tag type="danger" v-else-if="scope.row.status === 'FAILURE'">FAILURE</el-tag>
<el-tag type="info" v-else>{{scope.row['status']}}</el-tag>
</template>

View File

@@ -1,4 +1,3 @@
import dayjs from 'dayjs'
import request from '../../api/request'
const state = {

View File

@@ -52,7 +52,7 @@
:width="col.width">
<template slot-scope="scope">
<el-tag type="success" v-if="scope.row.status === 'SUCCESS'">SUCCESS</el-tag>
<el-tag type="warning" v-else-if="scope.row.status === 'PENDING'">PENDING</el-tag>
<el-tag type="warning" v-else-if="scope.row.status === 'STARTED'">STARTED</el-tag>
<el-tag type="danger" v-else-if="scope.row.status === 'FAILURE'">FAILURE</el-tag>
<el-tag type="info" v-else>{{scope.row[col.name]}}</el-tag>
</template>
@@ -66,7 +66,7 @@
:width="col.width">
</el-table-column>
</template>
<el-table-column label="Action" align="center" width="180">
<el-table-column label="Action" align="center" width="auto">
<template slot-scope="scope">
<el-tooltip content="View" placement="top">
<el-button type="primary" icon="el-icon-search" size="mini" @click="onView(scope.row)"></el-button>
@@ -104,10 +104,11 @@ export default {
},
// tableData,
columns: [
{ name: 'create_ts', label: 'Create Date', width: '150' },
{ name: 'finish_ts', label: 'Finish Date', width: '150' },
{ name: 'create_ts', label: 'Create Time', width: '150' },
{ name: 'start_ts', label: 'Start Time', width: '150' },
{ name: 'finish_ts', label: 'Finish Time', width: '150' },
{ name: 'spider_name', label: 'Spider', width: '160' },
{ name: 'node_id', label: 'Node', width: 'auto' },
{ name: 'node_id', label: 'Node', width: '160' },
{ name: 'status', label: 'Status', width: '160', sortable: true }
]
}

View File

@@ -9,6 +9,7 @@
"author": "",
"license": "ISC",
"dependencies": {
"mongodb": "^3.1.13"
"mongodb": "^3.1.13",
"puppeteer": "^1.13.0"
}
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 144 KiB

View File

@@ -4,7 +4,7 @@ const MongoClient = require('mongodb').MongoClient;
(async () => {
// browser
const browser = await (puppeteer.launch({
timeout: 15000
timeout: 10000
}));
// define start url
@@ -14,40 +14,60 @@ const MongoClient = require('mongodb').MongoClient;
const page = await browser.newPage();
// navigate to url
await page.goto(url);
await page.waitFor(2000);
try {
await page.goto(url);
await page.waitFor(2000);
} catch (e) {
console.error(e);
// take a screenshot
await page.screenshot({path: 'screenshot.png'});
// close browser
browser.close();
// exit code 1 indicating an error happened
code = 1;
process.emit("exit ");
process.reallyExit(code);
return
}
// scroll down to fetch more data
for (let i = 0; i < 10; i++) {
console.log('Pressing PageDown...');
await page.keyboard.press('PageDown', 200);
await page.waitFor(500);
}
// scrape data
const results = await page.evaluate(() => {
let results = [];
document.querySelectorAll('.news-list .news-item .news__item-title').forEach(el => {
document.querySelectorAll('.news-list .news-item').forEach(el => {
results.push({
title: el.innerText
url: 'https://segmentfault.com' + el.querySelector('.news__item-info > a').getAttribute('href'),
title: el.querySelector('.news__item-title').innerText
})
});
return results;
});
// open database connection
const client = await MongoClient.connect('mongodb://localhost/crawlab_test');
let db = await client.db('test');
const colName = process.env.CRAWLAB_COLLECTION;
const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
let db = await client.db('crawlab_test');
const colName = process.env.CRAWLAB_COLLECTION || 'results_segmentfault';
const taskId = process.env.CRAWLAB_TASK_ID;
const col = db.collection(colName);
// save to database
await results.forEach(d => {
d.task_id = taskId;
col.save(d);
console.log(d);
col.insertOne(d);
});
// close database connection
db.close();
console.log(`results.length: ${results.length}`);
console.log(results);
// close database connection
client.close();
// shutdown browser
browser.close();