Mirror of https://github.com/crawlab-team/crawlab.git (synced 2026-01-22 17:31:03 +01:00)

Commit: changed the way tasks are saved to the database
@@ -155,6 +155,14 @@ class SpiderApi(BaseApi):
     def on_crawl(self, id):
         job = execute_spider.delay(id)
 
+        # create a new task
+        db_manager.save('tasks', {
+            '_id': job.id,
+            'spider_id': ObjectId(id),
+            'create_ts': datetime.now(),
+            'status': TaskStatus.PENDING
+        })
+
         return {
             'code': 200,
             'status': 'ok',
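With this change, SpiderApi.on_crawl inserts a task document into MongoDB as soon as the Celery job is queued, keyed by the Celery job id and marked PENDING. For reference, a minimal sketch of the TaskStatus constants this diff relies on; the actual definition lives elsewhere in the repo and may differ, with the string values inferred from the frontend comparisons further down:

    # Hypothetical sketch of TaskStatus; values inferred from the Vue templates
    # in this commit, which compare scope.row.status against these strings.
    class TaskStatus:
        PENDING = 'PENDING'          # record created by the API, job not yet running
        STARTED = 'STARTED'          # worker picked the job up and launched the spider
        SUCCESS = 'SUCCESS'          # spider finished cleanly
        FAILURE = 'FAILURE'          # spider failed
        UNAVAILABLE = 'UNAVAILABLE'  # fallback when no status was recorded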
@@ -267,7 +275,7 @@ class SpiderApi(BaseApi):
         }
 
     def get_tasks(self, id):
-        items = db_manager.list('tasks', cond={'spider_id': ObjectId(id)}, limit=10, sort_key='finish_ts')
+        items = db_manager.list('tasks', cond={'spider_id': ObjectId(id)}, limit=10, sort_key='create_ts')
         for item in items:
             spider_id = item['spider_id']
             spider = db_manager.get('spiders', id=str(spider_id))
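Sorting on create_ts instead of finish_ts matters now that a task document exists from the moment it is queued: a PENDING task has no finish_ts yet, so ordering on that field would misplace unfinished tasks. A hypothetical just-queued record, with all example values invented:

    from datetime import datetime
    from bson import ObjectId

    # Hypothetical example of a freshly queued task document.
    pending_task = {
        '_id': 'celery-job-uuid',          # Celery job id from execute_spider.delay
        'spider_id': ObjectId('5c91a0b2f0e1d2c3b4a59687'),
        'create_ts': datetime(2019, 3, 20, 12, 0),
        'status': 'PENDING',               # no start_ts / finish_ts until the worker runs
    }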
@@ -49,14 +49,12 @@ class TaskApi(BaseApi):
         args = self.parser.parse_args()
         page_size = args.get('page_size') or 10
         page_num = args.get('page_num') or 1
-        tasks = db_manager.list('tasks', {}, limit=page_size, skip=page_size * (page_num - 1), sort_key='finish_ts')
+        tasks = db_manager.list('tasks', {}, limit=page_size, skip=page_size * (page_num - 1), sort_key='create_ts')
         items = []
         for task in tasks:
-            _task = db_manager.get('tasks_celery', id=task['_id'])
+            # _task = db_manager.get('tasks_celery', id=task['_id'])
             _spider = db_manager.get('spiders', id=str(task['spider_id']))
-            if _task:
-                task['status'] = _task['status']
-            else:
+            if task.get('status') is None:
                 task['status'] = TaskStatus.UNAVAILABLE
             task['spider_name'] = _spider['name']
             items.append(task)
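TaskApi no longer consults Celery's own result collection (tasks_celery); the status stored on the task document itself is now authoritative, with UNAVAILABLE as the fallback. The db_manager calls used throughout this commit share a small surface. A minimal pymongo-backed sketch of that surface, with signatures inferred from the call sites in this diff — not crawlab's actual implementation, and the connection details are invented:

    from pymongo import MongoClient, DESCENDING

    class DbManager:
        # Sketch only: mirrors just the four calls this commit uses.
        def __init__(self, uri='mongodb://localhost:27017', db_name='crawlab_test'):
            self.db = MongoClient(uri)[db_name]

        def save(self, col_name, item):
            # upsert by _id, matching db_manager.save('tasks', {...}) above
            self.db[col_name].replace_one({'_id': item['_id']}, item, upsert=True)

        def get(self, col_name, id):
            return self.db[col_name].find_one({'_id': id})

        def list(self, col_name, cond=None, limit=10, skip=0, sort_key=None):
            cursor = self.db[col_name].find(cond or {}).skip(skip).limit(limit)
            return list(cursor.sort(sort_key, DESCENDING) if sort_key else cursor)

        def update_one(self, col_name, id, values):
            # $set only the given fields, leaving the rest of the document intact
            self.db[col_name].update_one({'_id': id}, {'$set': values})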
@@ -38,17 +38,15 @@ def execute_spider(self, id: str):
     stderr = open(log_file_path, 'a')
 
-    # create a new task
-    db_manager.save('tasks', {
-        '_id': task_id,
-        'spider_id': ObjectId(id),
-        'create_ts': datetime.now(),
+    db_manager.update_one('tasks', id=task_id, values={
+        'start_ts': datetime.now(),
         'node_id': 'celery@%s' % hostname,
         'hostname': hostname,
         'log_file_path': log_file_path,
-        'status': TaskStatus.PENDING
+        'status': TaskStatus.STARTED
     })
 
     # execute the command
+    # start the process and pass params as env variables
     env = os.environ.copy()
     env['CRAWLAB_TASK_ID'] = task_id
     if spider.get('col'):
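Taken together, the two Python hunks split task persistence into two phases: the API creates the record, and the worker only mutates it. A condensed sketch of the resulting lifecycle — db_manager, execute_spider, TaskStatus and ObjectId are the names used in the diff; the function wrappers are illustrative only:

    from datetime import datetime

    def queue_task(id):
        # Phase 1 - SpiderApi.on_crawl: queue the Celery job, then insert the
        # task record immediately so it is listable while still PENDING.
        job = execute_spider.delay(id)
        db_manager.save('tasks', {
            '_id': job.id,
            'spider_id': ObjectId(id),
            'create_ts': datetime.now(),
            'status': TaskStatus.PENDING,
        })

    def start_task(task_id, hostname):
        # Phase 2 - execute_spider on the worker: the record already exists,
        # so update it in place instead of re-saving (and losing create_ts).
        db_manager.update_one('tasks', id=task_id, values={
            'start_ts': datetime.now(),
            'node_id': 'celery@%s' % hostname,
            'status': TaskStatus.STARTED,
        })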
@@ -20,7 +20,7 @@
         width="100">
         <template slot-scope="scope">
           <el-tag type="success" v-if="scope.row.status === 'SUCCESS'">SUCCESS</el-tag>
-          <el-tag type="warning" v-else-if="scope.row.status === 'PENDING'">PENDING</el-tag>
+          <el-tag type="warning" v-else-if="scope.row.status === 'STARTED'">STARTED</el-tag>
           <el-tag type="danger" v-else-if="scope.row.status === 'FAILURE'">FAILURE</el-tag>
           <el-tag type="info" v-else>{{scope.row['status']}}</el-tag>
         </template>
@@ -1,4 +1,3 @@
-import dayjs from 'dayjs'
 import request from '../../api/request'
 
 const state = {
@@ -52,7 +52,7 @@
           :width="col.width">
           <template slot-scope="scope">
             <el-tag type="success" v-if="scope.row.status === 'SUCCESS'">SUCCESS</el-tag>
-            <el-tag type="warning" v-else-if="scope.row.status === 'PENDING'">PENDING</el-tag>
+            <el-tag type="warning" v-else-if="scope.row.status === 'STARTED'">STARTED</el-tag>
             <el-tag type="danger" v-else-if="scope.row.status === 'FAILURE'">FAILURE</el-tag>
             <el-tag type="info" v-else>{{scope.row[col.name]}}</el-tag>
           </template>
@@ -66,7 +66,7 @@
           :width="col.width">
         </el-table-column>
       </template>
-      <el-table-column label="Action" align="center" width="180">
+      <el-table-column label="Action" align="center" width="auto">
         <template slot-scope="scope">
           <el-tooltip content="View" placement="top">
             <el-button type="primary" icon="el-icon-search" size="mini" @click="onView(scope.row)"></el-button>
@@ -104,10 +104,11 @@ export default {
       },
       // tableData,
       columns: [
-        { name: 'create_ts', label: 'Create Date', width: '150' },
-        { name: 'finish_ts', label: 'Finish Date', width: '150' },
+        { name: 'create_ts', label: 'Create Time', width: '150' },
+        { name: 'start_ts', label: 'Start Time', width: '150' },
+        { name: 'finish_ts', label: 'Finish Time', width: '150' },
         { name: 'spider_name', label: 'Spider', width: '160' },
-        { name: 'node_id', label: 'Node', width: 'auto' },
+        { name: 'node_id', label: 'Node', width: '160' },
         { name: 'status', label: 'Status', width: '160', sortable: true }
       ]
     }
@@ -9,6 +9,7 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
-    "mongodb": "^3.1.13"
+    "mongodb": "^3.1.13",
+    "puppeteer": "^1.13.0"
   }
 }
Binary file not shown (image removed; was 144 KiB).
@@ -4,7 +4,7 @@ const MongoClient = require('mongodb').MongoClient;
 (async () => {
   // browser
   const browser = await (puppeteer.launch({
-    timeout: 15000
+    timeout: 10000
   }));
 
   // define start url
@@ -14,40 +14,60 @@ const MongoClient = require('mongodb').MongoClient;
   const page = await browser.newPage();
 
   // navigate to url
-  await page.goto(url);
-  await page.waitFor(2000);
+  try {
+    await page.goto(url);
+    await page.waitFor(2000);
+  } catch (e) {
+    console.error(e);
+
+    // take a screenshot
+    await page.screenshot({path: 'screenshot.png'});
+    // close browser
+    browser.close();
+
+    // exit code 1 indicating an error happened
+    code = 1;
+    process.emit("exit ");
+    process.reallyExit(code);
+
+    return
+  }
 
   // scroll down to fetch more data
   for (let i = 0; i < 10; i++) {
     console.log('Pressing PageDown...');
     await page.keyboard.press('PageDown', 200);
     await page.waitFor(500);
   }
 
   // scrape data
   const results = await page.evaluate(() => {
     let results = [];
-    document.querySelectorAll('.news-list .news-item .news__item-title').forEach(el => {
+    document.querySelectorAll('.news-list .news-item').forEach(el => {
       results.push({
-        title: el.innerText
+        url: 'https://segmentfault.com' + el.querySelector('.news__item-info > a').getAttribute('href'),
+        title: el.querySelector('.news__item-title').innerText
       })
     });
     return results;
   });
 
   // open database connection
-  const client = await MongoClient.connect('mongodb://localhost/crawlab_test');
-  let db = await client.db('test');
-  const colName = process.env.CRAWLAB_COLLECTION;
+  const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
+  let db = await client.db('crawlab_test');
+  const colName = process.env.CRAWLAB_COLLECTION || 'results_segmentfault';
+  const taskId = process.env.CRAWLAB_TASK_ID;
   const col = db.collection(colName);
 
   // save to database
   await results.forEach(d => {
+    d.task_id = taskId;
-    col.save(d);
     console.log(d);
+    col.insertOne(d);
   });
 
-  // close database connection
-  db.close();
+  console.log(`results.length: ${results.length}`);
 
   console.log(results);
+  // close database connection
+  client.close();
 
   // shutdown browser
   browser.close();
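Two details in this spider hunk are worth noting. First, Collection#save() is deprecated in the 3.x MongoDB Node driver (the version pinned in package.json above), so switching to insertOne() is the forward-compatible choice; each result is also tagged with the task id from the CRAWLAB_TASK_ID environment variable that the worker sets, tying scraped rows back to the task record. Second, the error path is kept verbatim from the commit and has rough edges: code = 1 creates an implicit global, the event name in process.emit("exit ") carries a stray trailing space, and process.reallyExit() is an undocumented internal — process.exit(1) would be the conventional way to abort. Likewise, awaiting results.forEach is a no-op (forEach returns undefined), so the insertOne promises are never awaited and client.close() can race them; a for...of loop with await per insert would be the safer pattern.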