1.启动pyspider
2.新建一个项目
3.代码
4. 注意事项:网站的网址和页面结构都已经变了,代码里的 URL 和 CSS 选择器需要按当前页面重新确认
5. 存储到 MongoDB。
注意:这个地方我错了三次。
6. 在 Tableau 中做可视化时,才发现第 1、2 处错误。
修正之后就能可视化了。本次实验还是个半成品,后期会补充。
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-10-22 17:07:31
# Project: TripAdvisor
from pyspider.libs.base_handler import *
import pymongo
class Handler(BaseHandler):
    """pyspider handler that crawls TripAdvisor attraction pages for London.

    Flow: ``on_start`` seeds one listing page, ``index_page`` follows every
    attraction link on it plus the "next" pagination link, ``detail_page``
    extracts a few text fields, and ``on_result`` stores each non-empty
    result in the local MongoDB database ``trip``.
    """

    crawl_config = {
    }

    # Connection and database are created once, when the class is defined,
    # and shared by every callback.
    client = pymongo.MongoClient('localhost')
    db = client['trip']

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the London attractions listing page."""
        self.crawl('https://www.tripadvisor.cn/Attractions-g186338-Activities-c47-oa30-London_England.html#FILTERED_LIST', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every attraction detail page and the next listing page.

        Args:
            response: pyspider response for a listing page; ``response.doc``
                is queried with CSS selectors.
        """
        # Match every listing entry by id prefix. The original selector was
        # pinned to the single id "#ATTR_ENTRY_194299", so the loop could
        # never follow more than one attraction per page.
        for each in response.doc('[id^="ATTR_ENTRY_"] > div.attraction_clarity_cell > div > div > div.listing_info > div.listing_title > a').items():
            self.crawl(each.attr.href, callback=self.detail_page)

        next_url = response.doc('#FILTERED_LIST > div.al_border.deckTools.btm > div > div > a.nav.next.rndBtn.ui_button.primary.taLnk ').attr.href
        # The last page has no "next" link, so the href is None there;
        # only schedule a follow-up crawl when a URL was actually found.
        if next_url:
            self.crawl(next_url, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the fields of a single attraction page.

        Args:
            response: pyspider response for an attraction detail page.

        Returns:
            dict with the keys ``url``, ``name``, ``phone``, ``durating`` and
            ``score``; every value except ``url`` is the text of the element
            matched by the corresponding selector.
        """
        url = response.url
        name = response.doc('#taplc_trip_planner_breadcrumbs_0 > ul > li:nth-child(6)').text()
        phone = response.doc('#taplc_location_detail_contact_card_ar_responsive_0 > div.contactInfo > div.contact > div.contactType.phone.is-hidden-mobile > div').text()
        durating = response.doc('#taplc_location_detail_reviews_card_0 > div.section.rating > a.seeAllReviews').text()
        score = response.doc('#taplc_location_detail_reviews_card_0 > div.section.rating > span').text()
        return {
            "url": url,
            "name": name,
            "phone": phone,
            "durating": durating,
            "score": score,
        }

    def on_result(self, result):
        """Store a callback result in MongoDB; falsy results are ignored."""
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert ``result`` into the ``lodon`` collection and log it.

        Args:
            result: dict produced by ``detail_page``.
        """
        # insert_one replaces Collection.insert, which is deprecated since
        # PyMongo 3.0 and removed in 4.0. It raises on failure, so reaching
        # the print means the document was written.
        # NOTE(review): the collection name 'lodon' looks like a typo for
        # 'london'; it is kept so existing data stays in the same place.
        self.db['lodon'].insert_one(result)
        print('saved to mongo', result)