可怕的pyspider猫途鹰

1.启动pyspider

2.新建一个项目

3.代码

4. 注意事项:网站的网址和页面结构都已经变了,代码里的 URL 和 CSS 选择器需要按当前页面重新获取

5.存储到MongoDB

注意这个地方我错了三次

6.在Tableau中可视化时才发现了前面的错误1、2

之后就能可视化了,本次实验是个半成品。后期会补充。

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-10-22 17:07:31
# Project: TripAdvisor

from pyspider.libs.base_handler import *
import pymongo

class Handler(BaseHandler):
    """pyspider handler that scrapes TripAdvisor London attractions into MongoDB.

    Flow: ``on_start`` seeds one listing page, ``index_page`` follows every
    attraction link plus the "next page" link, ``detail_page`` extracts the
    fields of one attraction, and ``on_result`` stores each non-empty result
    through ``save_to_mongo``.
    """

    crawl_config = {
    }

    # Connection and database handle are class attributes, so they are created
    # once when the class body is executed and shared by all handler calls.
    client = pymongo.MongoClient('localhost')
    db = client['trip']

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the first attractions listing page."""
        self.crawl('https://www.tripadvisor.cn/Attractions-g186338-Activities-c47-oa30-London_England.html#FILTERED_LIST', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every attraction on a listing page, then the next listing page.

        :param response: pyspider response for a listing page.
        """
        # Match every listing entry by id prefix. The previous selector was
        # anchored on a single hard-coded entry id (#ATTR_ENTRY_194299), so at
        # most one attraction per page could ever be followed.
        entry_links = response.doc(
            '[id^="ATTR_ENTRY_"] > div.attraction_clarity_cell > div > div'
            ' > div.listing_info > div.listing_title > a'
        )
        for each in entry_links.items():
            self.crawl(each.attr.href, callback=self.detail_page)

        next_url = response.doc('#FILTERED_LIST > div.al_border.deckTools.btm > div > div > a.nav.next.rndBtn.ui_button.primary.taLnk').attr.href
        # The lookup yields None when no "next" link matches (e.g. on the last
        # page); do not hand a missing URL to the scheduler.
        if next_url:
            self.crawl(next_url, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the fields of a single attraction page.

        :param response: pyspider response for an attraction detail page.
        :return: dict with the keys ``url``, ``name``, ``phone``, ``durating``
            and ``score``; every value except ``url`` is the text of the
            element matched by the corresponding CSS selector.
        """
        url = response.url
        name = response.doc('#taplc_trip_planner_breadcrumbs_0 > ul > li:nth-child(6)').text()
        phone = response.doc('#taplc_location_detail_contact_card_ar_responsive_0 > div.contactInfo > div.contact > div.contactType.phone.is-hidden-mobile > div').text()
        # Text of the "see all reviews" link in the rating section.
        durating = response.doc('#taplc_location_detail_reviews_card_0 > div.section.rating > a.seeAllReviews').text()
        score = response.doc('#taplc_location_detail_reviews_card_0 > div.section.rating > span').text()

        return {
            "url": url,
            "name": name,
            "phone": phone,
            "durating": durating,
            "score": score,
        }

    def on_result(self, result):
        """Persist a callback result; empty results (e.g. ``None``) are ignored."""
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert one result document into the ``lodon`` collection.

        :param result: dict produced by ``detail_page``; the driver adds an
            ``_id`` key to it on insert.
        """
        # NOTE(review): 'lodon' looks like a typo for 'london'; it is kept
        # unchanged so new documents land in the same collection as old ones.
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4. Failures raise instead of
        # returning a falsy value.
        self.db['lodon'].insert_one(result)
        print('saved to mongo', result)
        

猜你喜欢

转载自blog.csdn.net/OYY_90/article/details/83278755
今日推荐