PySpider 爬猫途鹰网排名景点并保存至 MongoDB

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-10-30 18:01:28
# Project: Maotu

from pyspider.libs.base_handler import *
import pymongo

class Handler(BaseHandler):
    """PySpider handler that crawls Hong Kong attractions on tripadvisor.cn.

    Flow: ``on_start`` seeds the attraction listing page, ``index_page``
    queues every attraction link plus the next listing page, ``detail_page``
    extracts the fields of one attraction, and ``on_result`` stores each
    extracted record in the ``MaoTU_HongKong`` collection of the ``maotu``
    MongoDB database.
    """

    crawl_config = {
    }

    # Created once at class-definition time and shared by all instances.
    client = pymongo.MongoClient('localhost')
    db = client['maotu']

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the Hong Kong attraction listing page."""
        url = 'https://www.tripadvisor.cn/Attractions-g294217-Activities-Hong_Kong.html'
        self.crawl(url, callback=self.index_page, validate_cert=False, fetch_type='js')

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every attraction on a listing page, then the next page.

        Args:
            response: the fetched listing page; ``response.doc`` is queried
                with CSS selectors.
        """
        for each in response.doc('.attraction_element .listing_title > a').items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False, fetch_type='js')

        # When no '.nav.next' link matches, attr.href is None; there is
        # nothing further to queue in that case.
        nextlink = response.doc('.nav.next').attr.href
        if nextlink:
            self.crawl(nextlink, callback=self.index_page, validate_cert=False, fetch_type='js')

    @config(priority=2)
    def detail_page(self, response):
        """Extract the fields of a single attraction page.

        Args:
            response: the fetched attraction detail page.

        Returns:
            dict: the keys ``name``, ``rank``, ``location``, ``view``,
            ``score``, ``kf`` and ``phone``, each mapped to a plain string
            (empty when the selector matches nothing).
        """
        # NOTE(review): '.h1' selects elements with class "h1", not the <h1>
        # tag — confirm against the page markup.
        name = response.doc('.h1').text()
        rank = response.doc('b > span').text()
        location = response.doc('.headerBL > div').text()
        # The slices below presumably trim fixed-width labels around the
        # values (last 3 chars, first 5 chars, last 4 chars) — TODO confirm
        # against the live page text.
        view = response.doc('.seeAllReviews').text()[:-3]
        score = response.doc('.overallRating').text()
        kfsj = response.doc('.headerBL .header_detail').text()[5:]
        phone = response.doc('.contact > .phone > div').text()[0:-4]
        return {
            "name": name,
            "rank": rank,
            "location": location,
            "view": view,
            "score": score,
            "kf": kfsj,
            "phone": phone,
        }

    def on_result(self, result):
        """Persist a callback's result, skipping empty ones.

        ``index_page`` and ``on_start`` return nothing, so only non-empty
        results (the dicts built by ``detail_page``) are saved.
        """
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert one record into the ``MaoTU_HongKong`` collection.

        Args:
            result: the document (dict) to store.
        """
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        if self.db['MaoTU_HongKong'].insert_one(result).inserted_id is not None:
            print('savinf to mongo', result)

本站由 VITAN 使用 Stellar 主题创建。
本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议,转载请注明出处。