diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index 4cfc9457..89881223 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -155,6 +155,14 @@ class SpiderApi(BaseApi):
def on_crawl(self, id):
job = execute_spider.delay(id)
+ # create a new task
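+ # the task starts as PENDING; the Celery worker marks it STARTED in execute_spider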
+ db_manager.save('tasks', {
+ '_id': job.id,
+ 'spider_id': ObjectId(id),
+ 'create_ts': datetime.now(),
+ 'status': TaskStatus.PENDING
+ })
+
return {
'code': 200,
'status': 'ok',
@@ -267,7 +275,7 @@ class SpiderApi(BaseApi):
}
def get_tasks(self, id):
- items = db_manager.list('tasks', cond={'spider_id': ObjectId(id)}, limit=10, sort_key='finish_ts')
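+ # sort by create_ts, since pending/running tasks have no finish_ts yet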
+ items = db_manager.list('tasks', cond={'spider_id': ObjectId(id)}, limit=10, sort_key='create_ts')
for item in items:
spider_id = item['spider_id']
spider = db_manager.get('spiders', id=str(spider_id))
diff --git a/crawlab/routes/tasks.py b/crawlab/routes/tasks.py
index 24a2f7ca..1b13f87e 100644
--- a/crawlab/routes/tasks.py
+++ b/crawlab/routes/tasks.py
@@ -49,14 +49,12 @@ class TaskApi(BaseApi):
args = self.parser.parse_args()
page_size = args.get('page_size') or 10
page_num = args.get('page_num') or 1
- tasks = db_manager.list('tasks', {}, limit=page_size, skip=page_size * (page_num - 1), sort_key='finish_ts')
+ tasks = db_manager.list('tasks', {}, limit=page_size, skip=page_size * (page_num - 1), sort_key='create_ts')
items = []
for task in tasks:
- _task = db_manager.get('tasks_celery', id=task['_id'])
+ # _task = db_manager.get('tasks_celery', id=task['_id'])
_spider = db_manager.get('spiders', id=str(task['spider_id']))
- if _task:
- task['status'] = _task['status']
- else:
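+ # legacy task records saved before this change may have no status field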
+ if task.get('status') is None:
task['status'] = TaskStatus.UNAVAILABLE
task['spider_name'] = _spider['name']
items.append(task)
diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py
index feef3003..93f27a67 100644
--- a/crawlab/tasks/spider.py
+++ b/crawlab/tasks/spider.py
@@ -38,17 +38,15 @@ def execute_spider(self, id: str):
stderr = open(log_file_path, 'a')
- # create a new task
+ # update the task record created in on_crawl
- db_manager.save('tasks', {
- '_id': task_id,
- 'spider_id': ObjectId(id),
- 'create_ts': datetime.now(),
+ db_manager.update_one('tasks', id=task_id, values={
+ 'start_ts': datetime.now(),
'node_id': 'celery@%s' % hostname,
'hostname': hostname,
'log_file_path': log_file_path,
- 'status': TaskStatus.PENDING
+ 'status': TaskStatus.STARTED
})
- # execute the command
+ # start the process and pass params as env variables
env = os.environ.copy()
env['CRAWLAB_TASK_ID'] = task_id
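+ # the spider process reads CRAWLAB_TASK_ID (and CRAWLAB_COLLECTION) to tag saved results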
if spider.get('col'):
diff --git a/frontend/src/components/TableView/TaskTableView.vue b/frontend/src/components/TableView/TaskTableView.vue
index 49d08fce..83ffc2a7 100644
--- a/frontend/src/components/TableView/TaskTableView.vue
+++ b/frontend/src/components/TableView/TaskTableView.vue
@@ -20,7 +20,7 @@
width="100">
SUCCESS
- PENDING
+ STARTED
FAILURE
{{scope.row['status']}}
diff --git a/frontend/src/store/modules/task.js b/frontend/src/store/modules/task.js
index 7b2321c7..c421a79b 100644
--- a/frontend/src/store/modules/task.js
+++ b/frontend/src/store/modules/task.js
@@ -1,4 +1,3 @@
-import dayjs from 'dayjs'
import request from '../../api/request'
const state = {
diff --git a/frontend/src/views/task/TaskList.vue b/frontend/src/views/task/TaskList.vue
index bee8e0c5..94899d6c 100644
--- a/frontend/src/views/task/TaskList.vue
+++ b/frontend/src/views/task/TaskList.vue
@@ -52,7 +52,7 @@
:width="col.width">
SUCCESS
- PENDING
+ STARTED
FAILURE
{{scope.row[col.name]}}
@@ -66,7 +66,7 @@
:width="col.width">
-
+
@@ -104,10 +104,11 @@ export default {
},
// tableData,
columns: [
- { name: 'create_ts', label: 'Create Date', width: '150' },
- { name: 'finish_ts', label: 'Finish Date', width: '150' },
+ { name: 'create_ts', label: 'Create Time', width: '150' },
+ { name: 'start_ts', label: 'Start Time', width: '150' },
+ { name: 'finish_ts', label: 'Finish Time', width: '150' },
{ name: 'spider_name', label: 'Spider', width: '160' },
- { name: 'node_id', label: 'Node', width: 'auto' },
+ { name: 'node_id', label: 'Node', width: '160' },
{ name: 'status', label: 'Status', width: '160', sortable: true }
]
}
diff --git a/spiders/segmentfault/package.json b/spiders/segmentfault/package.json
index e1686a26..65a0f774 100644
--- a/spiders/segmentfault/package.json
+++ b/spiders/segmentfault/package.json
@@ -9,6 +9,7 @@
"author": "",
"license": "ISC",
"dependencies": {
- "mongodb": "^3.1.13"
+ "mongodb": "^3.1.13",
+ "puppeteer": "^1.13.0"
}
}
diff --git a/spiders/segmentfault/screenshot.png b/spiders/segmentfault/screenshot.png
deleted file mode 100644
index afb4517d..00000000
Binary files a/spiders/segmentfault/screenshot.png and /dev/null differ
diff --git a/spiders/segmentfault/segmentfault_spider.js b/spiders/segmentfault/segmentfault_spider.js
index a6c7b5fe..d71d0762 100644
--- a/spiders/segmentfault/segmentfault_spider.js
+++ b/spiders/segmentfault/segmentfault_spider.js
@@ -4,7 +4,7 @@ const MongoClient = require('mongodb').MongoClient;
(async () => {
// browser
const browser = await (puppeteer.launch({
- timeout: 15000
+ timeout: 10000
}));
// define start url
@@ -14,40 +14,60 @@ const MongoClient = require('mongodb').MongoClient;
const page = await browser.newPage();
// navigate to url
- await page.goto(url);
- await page.waitFor(2000);
+ try {
+ await page.goto(url);
+ await page.waitFor(2000);
+ } catch (e) {
+ console.error(e);
- // take a screenshot
- await page.screenshot({path: 'screenshot.png'});
+ // close browser
+ await browser.close();
+
+ // exit with code 1 to indicate an error happened
+ const code = 1;
+ process.emit("exit");
+ process.reallyExit(code);
+
+ return
+ }
+
+ // scroll down to fetch more data
+ for (let i = 0; i < 10; i++) {
+ console.log('Pressing PageDown...');
+ await page.keyboard.press('PageDown', { delay: 200 });
+ await page.waitFor(500);
+ }
// scrape data
const results = await page.evaluate(() => {
let results = [];
- document.querySelectorAll('.news-list .news-item .news__item-title').forEach(el => {
+ document.querySelectorAll('.news-list .news-item').forEach(el => {
results.push({
- title: el.innerText
+ url: 'https://segmentfault.com' + el.querySelector('.news__item-info > a').getAttribute('href'),
+ title: el.querySelector('.news__item-title').innerText
})
});
return results;
});
// open database connection
- const client = await MongoClient.connect('mongodb://localhost/crawlab_test');
- let db = await client.db('test');
- const colName = process.env.CRAWLAB_COLLECTION;
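+ // NOTE: 192.168.99.100 is likely a docker-machine host; point this at your own MongoDB instance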
+ const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
+ let db = await client.db('crawlab_test');
+ const colName = process.env.CRAWLAB_COLLECTION || 'results_segmentfault';
const taskId = process.env.CRAWLAB_TASK_ID;
const col = db.collection(colName);
// save to database
- await results.forEach(d => {
- d.task_id = taskId;
- col.save(d);
- });
+ // await each insert so the connection isn't closed before writes complete
+ for (const d of results) {
+ d.task_id = taskId;
+ console.log(d);
+ await col.insertOne(d);
+ }
- // close database connection
- db.close();
+ console.log(`results.length: ${results.length}`);
- console.log(results);
+ // close database connection
+ client.close();
// shutdown browser
browser.close();