mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
28
CHANGELOG.md
Normal file
28
CHANGELOG.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# 0.2.2 (unreleased)
|
||||
### Features / Enhancement
|
||||
- **Automatic Extract Fields**: Automatically extract data fields from list pages for configurable spiders.
|
||||
- **Download Results**: Allow downloading results as a CSV file.
|
||||
- **Baidu Tongji**: Allow users to choose to report usage info to Baidu Tongji.
|
||||
|
||||
### Bug Fixes
|
||||
- **Results Page Pagination**: Fixed the pagination of the results page so that it works correctly. [#45](https://github.com/tikazyq/crawlab/issues/45)
|
||||
- **Schedule Tasks Duplicated Triggers**: Set Flask `DEBUG` to `False` so that scheduled tasks are no longer triggered twice. [#32](https://github.com/tikazyq/crawlab/issues/32)
|
||||
- **Frontend Environment**: Added `VUE_APP_BASE_URL` as a production-mode environment variable so that API calls no longer always target `localhost` in deployed environments. [#30](https://github.com/tikazyq/crawlab/issues/30)
|
||||
|
||||
# 0.2.1 (2019-05-27)
|
||||
- **Configurable Spider**: Allow users to create a spider to crawl data without coding.
|
||||
|
||||
# 0.2 (2019-05-10)
|
||||
|
||||
- **Advanced Stats**: Advanced analytics in spider detail view.
|
||||
- **Sites Data**: Added a list of sites (China) so that users can check info such as robots.txt and home page response time/code.
|
||||
|
||||
# 0.1.1 (2019-04-23)
|
||||
|
||||
- **Basic Stats**: Users can view basic stats, such as the number of failed tasks and the number of results, on the spiders and tasks pages.
|
||||
- **Near Realtime Task Info**: Periodically (every 5 seconds) poll data from the server so that task info can be viewed in near real time.
|
||||
- **Scheduled Tasks**: Allow users to set up cron-like scheduled/periodic tasks using APScheduler.
|
||||
|
||||
# 0.1 (2019-04-17)
|
||||
|
||||
- **Initial Release**
|
||||
@@ -10,6 +10,7 @@ Babel==2.6.0
|
||||
beautifulsoup4==4.7.1
|
||||
billiard==3.6.0.0
|
||||
bs4==0.0.1
|
||||
bson==0.5.8
|
||||
cachetools==3.1.0
|
||||
celery==4.3.0
|
||||
certifi==2019.3.9
|
||||
@@ -20,9 +21,11 @@ coloredlogs==10.0
|
||||
constantly==15.1.0
|
||||
cryptography==2.6.1
|
||||
cssselect==1.0.3
|
||||
csvalidate==1.1.1
|
||||
Flask==1.0.2
|
||||
Flask-APScheduler==1.11.0
|
||||
Flask-Cors==3.0.7
|
||||
Flask-CSV==1.2.0
|
||||
Flask-RESTful==0.3.7
|
||||
flask-restplus==0.12.1
|
||||
flower==0.9.3
|
||||
@@ -42,6 +45,7 @@ jsonschema==3.0.1
|
||||
kombu==4.5.0
|
||||
lxml==4.3.3
|
||||
MarkupSafe==1.1.1
|
||||
marshmallow==2.19.2
|
||||
mongoengine==0.17.0
|
||||
multidict==4.5.2
|
||||
parsel==1.5.1
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from time import time
|
||||
|
||||
from flask_csv import send_csv
|
||||
|
||||
try:
|
||||
from _signal import SIGKILL
|
||||
@@ -178,7 +181,7 @@ class TaskApi(BaseApi):
|
||||
if not col_name:
|
||||
return []
|
||||
fields = get_spider_col_fields(col_name)
|
||||
items = db_manager.list(col_name, {'task_id': id})
|
||||
items = db_manager.list(col_name, {'task_id': id}, skip=page_size * (page_num - 1), limit=page_size)
|
||||
return {
|
||||
'status': 'ok',
|
||||
'fields': jsonify(fields),
|
||||
@@ -213,3 +216,16 @@ class TaskApi(BaseApi):
|
||||
'id': id,
|
||||
'status': 'ok',
|
||||
}
|
||||
|
||||
def download_results(self, id: str):
    """
    Send all results of a task to the client as a CSV download.
    :param id: task id
    :return: whatever flask_csv's send_csv returns for the collected rows
    """
    # Large enough to act as "no limit" for both the rows and the field scan,
    # so the header covers every key that appears in the exported rows.
    max_rows = 999999999

    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')

    # Fall back to the task id when the spider has no results collection, so
    # the file name never contains the literal text "None".
    filename = f'results_{col_name or id}_{round(time())}.csv'

    if not col_name:
        # Nothing to export: there are no rows to derive a header from, so an
        # explicit empty field list is passed.
        # NOTE(review): assumes send_csv needs `fields` to write its header
        # row -- confirm against the pinned Flask-CSV version.
        return send_csv([], filename, fields=[], encoding='utf-8')

    items = db_manager.list(col_name, {'task_id': id}, limit=max_rows)
    fields = get_spider_col_fields(col_name, task_id=id, limit=max_rows)
    return send_csv(items,
                    filename=filename,
                    fields=fields,
                    encoding='utf-8')
|
||||
|
||||
@@ -41,12 +41,17 @@ def get_spider_type(path: str) -> SpiderType:
|
||||
return SpiderType.SCRAPY
|
||||
|
||||
|
||||
def get_spider_col_fields(col_name: str) -> list:
|
||||
def get_spider_col_fields(col_name: str, task_id: str = None, limit: int = 100) -> list:
|
||||
"""
|
||||
Get spider collection fields
|
||||
:param col_name: collection name
|
||||
:param task_id: task_id
|
||||
:param limit: limit
|
||||
"""
|
||||
items = db_manager.list(col_name, {}, limit=100, sort_key='_id')
|
||||
filter_ = {}
|
||||
if task_id is not None:
|
||||
filter_['task_id'] = task_id
|
||||
items = db_manager.list(col_name, filter_, limit=limit, sort_key='_id')
|
||||
fields = set()
|
||||
for item in items:
|
||||
for k in item.keys():
|
||||
|
||||
@@ -58,18 +58,18 @@ export default {
|
||||
computed: {
|
||||
filteredData () {
|
||||
return this.data
|
||||
.map(d => d)
|
||||
.filter((d, index) => {
|
||||
// pagination
|
||||
const pageNum = this.pageNum
|
||||
const pageSize = this.pageSize
|
||||
return (pageSize * (pageNum - 1) <= index) && (index < pageSize * pageNum)
|
||||
})
|
||||
// .map(d => d)
|
||||
// .filter((d, index) => {
|
||||
// // pagination
|
||||
// const pageNum = this.pageNum
|
||||
// const pageSize = this.pageSize
|
||||
// return (pageSize * (pageNum - 1) <= index) && (index < pageSize * pageNum)
|
||||
// })
|
||||
}
|
||||
},
|
||||
methods: {
|
||||
onPageChange () {
|
||||
this.$emit('page-change')
|
||||
this.$emit('page-change', { pageNum: this.pageNum, pageSize: this.pageSize })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,6 +56,8 @@ export default {
|
||||
'Stop': '停止',
|
||||
'Preview': '预览',
|
||||
'Extract Fields': '提取字段',
|
||||
'Download': '下载',
|
||||
'Download CSV': '下载CSV',
|
||||
|
||||
// 主页
|
||||
'Total Tasks': '总任务数',
|
||||
|
||||
@@ -18,7 +18,7 @@ const state = {
|
||||
pageNum: 0,
|
||||
pageSize: 10,
|
||||
// results
|
||||
resultsPageNum: 0,
|
||||
resultsPageNum: 1,
|
||||
resultsPageSize: 10
|
||||
}
|
||||
|
||||
|
||||
@@ -15,11 +15,17 @@
|
||||
</el-card>
|
||||
</el-tab-pane>
|
||||
<el-tab-pane :label="$t('Results')" name="results">
|
||||
<div class="button-group">
|
||||
<el-button type="primary" icon="el-icon-download" @click="downloadCSV">
|
||||
{{$t('Download CSV')}}
|
||||
</el-button>
|
||||
</div>
|
||||
<general-table-view :data="taskResultsData"
|
||||
:columns="taskResultsColumns"
|
||||
:page-num="resultsPageNum"
|
||||
:page-size="resultsPageSize"
|
||||
:total="taskResultsTotalCount"/>
|
||||
:total="taskResultsTotalCount"
|
||||
@page-change="onResultsPageChange"/>
|
||||
</el-tab-pane>
|
||||
</el-tabs>
|
||||
</div>
|
||||
@@ -78,6 +84,15 @@ export default {
|
||||
},
|
||||
onSpiderChange (id) {
|
||||
this.$router.push(`/spiders/${id}`)
|
||||
},
|
||||
onResultsPageChange (payload) {
|
||||
const { pageNum, pageSize } = payload
|
||||
this.resultsPageNum = pageNum
|
||||
this.resultsPageSize = pageSize
|
||||
this.$store.dispatch('task/getTaskResults', this.$route.params.id)
|
||||
},
|
||||
downloadCSV () {
|
||||
window.location.href = this.$request.baseUrl + '/tasks/' + this.$route.params.id + '/download_results'
|
||||
}
|
||||
},
|
||||
created () {
|
||||
@@ -114,4 +129,9 @@ export default {
|
||||
overflow-x: auto;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.button-group {
|
||||
margin-bottom: 10px;
|
||||
text-align: right;
|
||||
}
|
||||
</style>
|
||||
|
||||
Reference in New Issue
Block a user