Mirror of https://github.com/crawlab-team/crawlab.git

Commit: added lianjia example
@@ -33,9 +33,10 @@ def update_nodes_status(refresh=False):
         # new node
         if node is None:
-            node = {'_id': node_name, 'name': node_name, 'status': node_status}
+            node = {'_id': node_name, 'name': node_name, 'status': node_status, 'ip': 'localhost', 'port': '8000'}
             db_manager.save('nodes', node)

         # existing node
         else:
             node['status'] = node_status
             db_manager.save('nodes', node)
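The hunk above seeds a newly discovered node with default connection details ('localhost', port '8000') and, for a known node, refreshes only its status. For reference, a self-contained sketch of the same upsert pattern in plain Python; db_manager.get is an assumed lookup helper (only db_manager.save appears in the diff):

def upsert_node(db_manager, node_name, node_status):
    node = db_manager.get('nodes', node_name)  # assumed lookup helper
    if node is None:
        # new node: register with default ip/port so the UI can reach it
        node = {'_id': node_name, 'name': node_name, 'status': node_status,
                'ip': 'localhost', 'port': '8000'}
    else:
        # existing node: refresh status only, keep the stored ip/port
        node['status'] = node_status
    db_manager.save('nodes', node)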
@@ -8,6 +8,7 @@ export default {
   'Task': '任务',
   'Tasks': '任务',
   'Task Detail': '任务详情',
+  'Schedules': '定时任务',
   'Deploys': '部署',

   // 标签 (tags)
@@ -140,7 +140,7 @@ export const constantRouterMap = [
       title: 'Schedules',
       icon: 'fa fa-calendar'
     },
-    hidden: true,
+    hidden: false,
     children: [
       {
         path: '',
@@ -9,6 +9,7 @@ import spider from './modules/spider'
 import deploy from './modules/deploy'
 import task from './modules/task'
 import file from './modules/file'
+import schedule from './modules/schedule'
 import lang from './modules/lang'
 import getters from './getters'
@@ -25,6 +26,7 @@ const store = new Vuex.Store({
     deploy,
     task,
     file,
+    schedule,
     lang
   },
   getters
frontend/src/store/modules/schedule.js (new file, 34 lines)
@@ -0,0 +1,34 @@
import request from '../../api/request'

const state = {
  scheduleList: [],
  scheduleForm: {}
}

const getters = {}

const mutations = {
  SET_SCHEDULE_LIST (state, value) {
    state.scheduleList = value
  },
  SET_SCHEDULE_FORM (state, value) {
    state.scheduleForm = value
  }
}

const actions = {
  getScheduleList ({ state, commit }) {
    request.get('/schedules')
      .then(response => {
        commit('SET_SCHEDULE_LIST', response.data.items)
      })
  }
}

export default {
  namespaced: true,
  state,
  getters,
  mutations,
  actions
}
@@ -1,15 +1,121 @@
<template>
  <div class="app-container">
    Schedule List
    <!--add popup-->
    <el-dialog
      :title="$t('Add Schedule')"
      :visible.sync="dialogVisible"
      width="60%"
      :before-close="onDialogClose">
      <el-form label-width="180px"
               :model="scheduleForm"
               :inline-message="true"
               ref="scheduleForm"
               label-position="right">
        <el-form-item :label="$t('Schedule Name')" prop="name" required>
          <el-input v-model="scheduleForm.name" :placeholder="$t('Schedule Name')"></el-input>
        </el-form-item>
        <el-form-item :label="$t('Cron')" prop="cron" required>
          <el-input v-model="scheduleForm.cron" :placeholder="$t('Cron')"></el-input>
        </el-form-item>
        <el-form-item :label="$t('Schedule Description')" prop="description">
          <el-input v-model="scheduleForm.description" :placeholder="$t('Schedule Description')"></el-input>
        </el-form-item>
      </el-form>
      <span slot="footer" class="dialog-footer">
        <el-button @click="onCancel">{{$t('Cancel')}}</el-button>
        <el-button type="primary" @click="onAddSubmit">{{$t('Add')}}</el-button>
      </span>
    </el-dialog>

    <!--filter-->
    <div class="filter">
      <div class="right">
        <el-button type="primary"
                   icon="el-icon-plus"
                   class="refresh"
                   @click="onAdd">
          {{$t('Add Schedule')}}
        </el-button>
      </div>
    </div>

    <!--table list-->
    <el-table :data="filteredTableData"
              class="table"
              :header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
              border>
      <template v-for="col in columns">
        <el-table-column :key="col.name"
                         :property="col.name"
                         :label="$t(col.label)"
                         :sortable="col.sortable"
                         align="center"
                         :width="col.width">
        </el-table-column>
      </template>
      <el-table-column :label="$t('Action')" align="left" width="250">
        <template slot-scope="scope">
          <el-tooltip :content="$t('View')" placement="top">
            <el-button type="primary" icon="el-icon-search" size="mini" @click="onView(scope.row)"></el-button>
          </el-tooltip>
          <el-tooltip :content="$t('Remove')" placement="top">
            <el-button type="danger" icon="el-icon-delete" size="mini" @click="onRemove(scope.row)"></el-button>
          </el-tooltip>
          <el-tooltip v-if="isShowRun(scope.row)" :content="$t('Run')" placement="top">
            <el-button type="success" icon="fa fa-bug" size="mini" @click="onCrawl(scope.row)"></el-button>
          </el-tooltip>
        </template>
      </el-table-column>
    </el-table>
  </div>
</template>

<script>
import {
  mapState
} from 'vuex'

export default {
-  name: 'ScheduleList'
+  name: 'ScheduleList',
  data () {
    return {
      columns: [
        { name: 'name', label: 'Name', width: '220' },
        { name: 'cron', label: 'Cron', width: '220' },
        { name: 'description', label: 'Description', width: 'auto' }
      ],
      dialogVisible: false
    }
  },
  computed: {
    ...mapState('schedule', [
      'scheduleList',
      'scheduleForm'
    ]),
    filteredTableData () {
      return this.scheduleList
    }
  },
  methods: {
    onDialogClose () {
    },
    onCancel () {
      this.dialogVisible = false
    },
    onAdd () {
      this.dialogVisible = true
    },
    onAddSubmit () {
    }
  },
  created () {
    this.$store.dispatch('schedule/getScheduleList')
  }
}
</script>

<style scoped>
.filter .right {
  float: right;
}
</style>
@@ -1,6 +1,10 @@
 const puppeteer = require('puppeteer');
 const MongoClient = require('mongodb').MongoClient;

+const MONGO_HOST = process.env.MONGO_HOST;
+const MONGO_PORT = process.env.MONGO_PORT;
+const MONGO_DB = process.env.MONGO_DB;
+
 (async () => {
     // browser
     const browser = await (puppeteer.launch({
@@ -53,8 +57,8 @@ const MongoClient = require('mongodb').MongoClient;
     });

     // open database connection
-    const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
-    let db = await client.db('crawlab_test');
+    const client = await MongoClient.connect(`mongodb://${MONGO_HOST}:${MONGO_PORT}`);
+    let db = await client.db(MONGO_DB);
     const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
     const taskId = process.env.CRAWLAB_TASK_ID;
     const col = db.collection(colName);
@@ -9,9 +9,8 @@ import os
 from pymongo import MongoClient

 MONGO_HOST = os.environ['MONGO_HOST']
-MONGO_PORT = os.environ['MONGO_PORT']
+MONGO_PORT = int(os.environ['MONGO_PORT'])
 MONGO_DB = os.environ['MONGO_DB']
-print(MONGO_HOST)


 class JuejinPipeline(object):
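The int() conversion in this hunk is the actual fix: os.environ values are always strings, and pymongo's MongoClient requires an integer port (it raises TypeError otherwise). A quick illustration with example values:

import os
from pymongo import MongoClient

os.environ.setdefault('MONGO_PORT', '27017')   # env values are strings
port = int(os.environ['MONGO_PORT'])           # '27017' -> 27017
# the client constructor is lazy, so this runs without a live server
client = MongoClient(host=os.environ.get('MONGO_HOST', 'localhost'), port=port)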
spiders/realestate/realestate/__init__.py (new file, empty)
spiders/realestate/realestate/items.py (new file, 37 lines)
@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class RealEstateItem(scrapy.Item):
    # _id
    _id = scrapy.Field()

    # task_id
    task_id = scrapy.Field()

    # property name
    name = scrapy.Field()

    # url
    url = scrapy.Field()

    # category
    type = scrapy.Field()

    # price (in units of 10k CNY)
    price = scrapy.Field()

    # size
    size = scrapy.Field()

    # neighborhood
    region = scrapy.Field()

    # city
    city = scrapy.Field()
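RealEstateItem behaves like a dict with a fixed set of allowed keys. A brief sketch of filling it by hand, assuming the project package is importable; all field values here are illustrative:

from realestate.items import RealEstateItem

item = RealEstateItem(
    name='Example Estate',            # illustrative value
    url='https://cq.lianjia.com/ershoufang/0.html',  # illustrative value
    type='secondhand',
    price='320',                      # price in units of 10k CNY
    region='Example Neighborhood',    # illustrative value
)
item['task_id'] = 'demo-task'         # items also support dict-style assignment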
spiders/realestate/realestate/middlewares.py (new file, 103 lines)
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class RealestateSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RealestateDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
spiders/realestate/realestate/pipelines.py (new file, 25 lines)
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os

from pymongo import MongoClient

MONGO_HOST = os.environ['MONGO_HOST']
MONGO_PORT = int(os.environ['MONGO_PORT'])
MONGO_DB = os.environ['MONGO_DB']


class MongoPipeline(object):
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    db = mongo[MONGO_DB]
    col_name = os.environ.get('CRAWLAB_COLLECTION')
    col = db[col_name]

    def process_item(self, item, spider):
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        self.col.save(item)
        return item
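One caveat: Collection.save() was deprecated in PyMongo 3.0 and removed in 4.0. On current drivers, process_item could perform the same upsert roughly like this (a sketch reusing the class setup above):

import os

class MongoPipeline(object):
    # ... same MongoClient / db / col class attributes as above ...

    def process_item(self, item, spider):
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        doc = dict(item)  # scrapy.Item converts cleanly to a plain dict
        if doc.get('_id') is not None:
            # upsert: replace the stored document, or insert it if missing
            self.col.replace_one({'_id': doc['_id']}, doc, upsert=True)
        else:
            self.col.insert_one(doc)
        return item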
spiders/realestate/realestate/settings.py (new file, 89 lines)
@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-

# Scrapy settings for realestate project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'realestate'

SPIDER_MODULES = ['realestate.spiders']
NEWSPIDER_MODULE = 'realestate.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'realestate (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'realestate.middlewares.RealestateSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'realestate.middlewares.RealestateDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'realestate.pipelines.MongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
spiders/realestate/realestate/spiders/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
spiders/realestate/realestate/spiders/lianjia.py (new file, 31 lines)
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
import scrapy

from realestate.items import RealEstateItem


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://cq.lianjia.com/ershoufang/']

    def start_requests(self):
        for i in range(100):
            url = 'https://cq.lianjia.com/ershoufang/pg%s' % i
            yield scrapy.Request(url=url)

    def parse(self, response):
        for item in response.css('.sellListContent > li'):
            yield RealEstateItem(
                name=item.css('.title > a::text').extract_first(),
                url=item.css('.title > a::attr("href")').extract_first(),
                type='secondhand',
                price=item.css('.totalPrice > span::text').extract_first(),
                region=item.css('.houseInfo > a::text').extract_first(),
                size=item.css('.houseInfo::text').extract_first().split(' | ')[2]
            )

        # pagination
        # a_next = response.css('.house-lst-page-box > a')[-1]
        # href = a_next.css('a::attr("href")')
        # yield scrapy.Response(url='https://cq.lianjia.com' + href)
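Two notes on this spider: range(100) starts at pg0, while Lianjia's listing pages start at pg1, so range(1, 101) is likely intended; and the commented-out pagination would not work as written, since a callback must yield scrapy.Request objects, not scrapy.Response. A hypothetical variant with working next-page following, under the same markup assumptions:

import scrapy

class LianjiaSpiderPaged(scrapy.Spider):
    """Hypothetical variant of LianjiaSpider that follows the pager."""
    name = 'lianjia_paged'  # hypothetical name
    allowed_domains = ['lianjia.com']
    start_urls = ['https://cq.lianjia.com/ershoufang/']

    def parse(self, response):
        # ... yield RealEstateItem(...) for each listing, as above ...
        links = response.css('.house-lst-page-box > a::attr("href")')
        if links:
            # the pager's last <a> points at the next page
            next_href = links[-1].extract()
            yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse)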
spiders/realestate/scrapy.cfg (new file, 11 lines)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = realestate.settings

[deploy]
#url = http://localhost:6800/
project = realestate
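The pipeline reads its MongoDB target from the environment, and db[col_name] fails with a TypeError if CRAWLAB_COLLECTION is unset, so a local run needs all of these variables. A sketch of a local launch; the collection name and task id are hypothetical, and the working directory is assumed to be the repository root:

import os
import subprocess

os.environ.update({
    'MONGO_HOST': 'localhost',
    'MONGO_PORT': '27017',
    'MONGO_DB': 'crawlab_test',                  # database name used elsewhere in this commit
    'CRAWLAB_COLLECTION': 'results_realestate',  # hypothetical collection name
    'CRAWLAB_TASK_ID': 'local-test',             # hypothetical task id
})
# run the spider from the scrapy project directory
subprocess.run(['scrapy', 'crawl', 'lianjia'], check=True, cwd='spiders/realestate')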