携程景区

一.主题式网络爬虫设计方案

1.主题式网络爬虫名称:关于携程景区信息

2.主题式网络爬虫爬取的内容:景区福利评级与热度

3.设计方案概述:

实现思路:爬取网站内容,之后分析提取需要的数据,进行数据清洗,之后数据可视化,并计算不同比率的相关系数

技术难点:因为用的是json分析,所以需要通过查找的方式获取数据

二、主题页面的结构特征分析

1.主题页面的结构与特征分析:通过解析发现数据都通过json传输,解析请求参数后循环获取数据

2.

在57行找到基本参数

 

 

 

三、网络爬虫程序设计

1.数据爬取与采集

import json
import time
import pandas as pd
from lxml import etree
import requests

# Ctrip ticket-spot search endpoint (JSON POST API).
url = "https://sec-m.ctrip.com/restapi/soa2/12530/json/ticketSpotSearch"

# Accumulates one record per scenic spot across all result pages.
infos = []

# Pool of desktop User-Agent strings intended for rotation.
# NOTE(review): this list is never referenced -- `headers` below hard-codes
# a single User-Agent; either rotate with random.choice(ua) or delete it.
ua = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
      "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]

# Request headers that mimic a real browser session so the site's
# anti-scraping checks accept the POSTs.
headers = {

    "Host": "sec-m.ctrip.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://piao.ctrip.com/dest/u-%E5%A4%A9%E6%B4%A5/s-tickets/",
    "Content-Type": "application/json",
    "cookieorigin": "https://piao.ctrip.com",
    "Origin": "https://piao.ctrip.com",
    # BUG FIX: the hard-coded "Content-Length": "423" entry was removed.
    # The JSON payload length varies with the page index, and requests
    # computes the correct Content-Length itself; a stale fixed value can
    # make the server truncate or reject the body.
    "Connection": "keep-alive",
    # NOTE(review): this Cookie was captured from one browser session and
    # will expire -- refresh it when the API starts rejecting requests.
    "Cookie": "_abtest_userid=6f592f35-e449-4bc0-b564-d00f956f84ff; _bfa=1.1582097534632.ml3vt.1.1585454134475.1585471635679.48.511.228032; Session=SmartLinkCode=U155952&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=&SmartLinkLanguage=zh; _RF1=116.132.81.181; _RSG=PTaQiACpsx2FxjposHE578; _RDG=2841b6791758af2d7c11b9d9aaab2910e8; _RGUID=2ef6ccc2-ac6a-4759-81c5-391db2cedecf; MKT_CKID=1582097537594.5es03.inf0; _jzqco=%7C%7C%7C%7C1585471664213%7C1.961816114.1582097537604.1585471638835.1585471664156.1585471638835.1585471664156.undefined.0.0.206.206; __zpspc=9.51.1585471638.1585471664.2%233%7Cbzclk.baidu.com%7C%7C%7C%7C%23; _ga=GA1.2.427107527.1582097538; HotelCityID=12split%E6%B5%8E%E5%8D%97splitNanjingsplit2020-3-18split2020-03-22split0; appFloatCnt=93; nfes_isSupportWebP=1; StartCity_Pkg=PkgStartCity=346; FlightIntl=Search=[%22HKG|%E9%A6%99%E6%B8%AF(HKG)|58|HKG|480%22%2C%22BJS|%E5%8C%97%E4%BA%AC(BJS)|1|BJS|480%22%2C%222020-03-02%22]; AHeadUserInfo=VipGrade=0&VipGradeName=%C6%D5%CD%A8%BB%E1%D4%B1&UserName=&NoReadMessageCount=1; UUID=04BDE45396F048A5BD505942DCF4AA0E; IsPersonalizedLogin=F; MKT_Pagesource=PC; Union=OUID=index&AllianceID=4897&SID=155952&SourceID=&createtime=1585471639&Expires=1586076438757; GUID=09031158111425303949; __utma=1.427107527.1582097538.1584616142.1584619463.2; __utmz=1.1584619463.2.2.utmcsr=ctrip.com|utmccn=(referral)|utmcmd=referral|utmcct=/; gad_city=b6a29287793c7ffa817a1ee3a6776529; MKT_CKID_LMT=1585454137863; _gid=GA1.2.198710281.1585454139; _bfs=1.4; _gat=1; _bfi=p1%3D10650032348%26p2%3D10650034475%26v1%3D511%26v2%3D510",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    "TE": "Trailers",

}

# One request payload per result page (pages 1-99).  Everything except
# "pidx" is constant boilerplate captured from the browser's network panel.
das = []
for i in range(1,100):
    # IDIOM FIX: the original wrote int("{}".format(i)) for the page
    # index, a pointless int -> str -> int round trip; i is already an int.
    das.append({"pageid":10650032348,"searchtype":2,"keyword":"天津","sort":1,"pidx":i,"psize":20,"reltype":1,"excepts":[],"filters":[],"imagesize":"C_200_130","contentType":"json","head":{"appid":"100013776","cid":"09031158111425303949","ctok":"","cver":"1.0","lang":"01","sid":"8888","syscode":"09","auth":"","extension":[{"name":"protocal","value":"https"}]},"ver":"7.10.3.0319180000"})

# Iterate over the 99 prepared payloads, one POST per result page.
for k,da in enumerate(das):
    print(k+1)          # progress: 1-based page number
    time.sleep(10)      # throttle to avoid tripping the site's rate limiting

    # POST the JSON payload and decode the raw response bytes to text.
    resp = requests.post(url, headers=headers, data=json.dumps(da)).content.decode()

    # Parse the response text into a Python dict.
    js_info = json.loads(resp)

    # The scenic-spot records live under data.viewspots.
    # NOTE(review): the body of this loop continues below the "信息处理"
    # heading -- the blog paste split one script into fragments.
    c = js_info['data']['viewspots']

信息处理

 # For each scenic-spot record on this page, extract the fields we keep.
    for i in c:

        # Star ("A-level") rating if present, else an empty string.
        xj = i['star'] if i['star'] else ""
        # Temporary set used to de-duplicate tag names.
        temp = set()

        # Comment count if present, else an empty string.
        pls = i['commentCount'] if i['commentCount'] else ""

        # Collect each record's first tag name; the set removes duplicates.
        for tg in i['taginfos']:
            temp.add(tg['tags'][0]['tname'])

        # Append the structured record to the module-level infos list.
        # NOTE(review): str(temp)[1:-1] strips the outer braces of the
        # set's repr, e.g. "{'a', 'b'}" -> "'a', 'b'"; for an EMPTY set it
        # yields "et(" (sliced from "set()"), which downstream analysis
        # code has to filter out.  ','.join(temp) would be cleaner.
        infos.append({
            '景点名称':i['name'],
            '星级':xj,
            '评分':i['cmtscore'],
            '推荐度':i['cmttag'],
            '简介':i['feature'],
            '标签':str(temp)[1:-1],
            '评论数':pls,
            'id':i['id']


        })


# Output file name for the scraped spot list.
fileName = "景点基本信息.csv"


# Build a pandas DataFrame from the collected records.
data = pd.DataFrame(infos)

# Column order for the CSV file.
csv_headers = ['景点名称', '星级', '评分', '推荐度', '简介', '标签','评论数','id']
# utf_8_sig adds a BOM so Excel opens the Chinese text correctly;
# mode='w' overwrites any previous run's file.
data.to_csv(fileName, header=csv_headers, index=False, mode='w', encoding='utf_8_sig')

景点评论信息读取

import json
import random
import time
from pprint import pprint
import pandas as pd

import requests

import csv

from xlrd import open_workbook
import matplotlib.pyplot as plt
from pylab import mpl


# Use the SimHei font so matplotlib can render Chinese labels.
mpl.rcParams['font.sans-serif'] = ['SimHei']

ids = []    # scenic-spot ids (CSV column index 7)
names = []  # scenic-spot names (CSV column index 0)
# Read the previously scraped spot list, skipping the header row.
with open("信息/测试.csv", "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    for k, i in enumerate(reader):
        if k == 0:
            continue
        ids.append(i[7])  # column index 7 is the 'id' field (8th column)
        names.append(i[0])

print(ids[10:20])

print(names)

# Ctrip comment-list endpoint (JSON POST API).
url = "https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList"
# NOTE(review): this User-Agent pool is never used -- the request headers
# built below hard-code a single User-Agent.
ua = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
      "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]



# Outer loop: one pass per scenic spot (first 20 ids only).
for k_n,id_ in enumerate(ids[0:20]):
    infos = []
    print("",k_n+1,"")
    print(id_)
    time.sleep(1)
    try:
        for i in range(1,150):
            # Randomised delay to stay under the site's rate limiting.
            time.sleep(random.randint(1,5))
            # NOTE(review): these headers are identical on every iteration
            # and could be built once outside both loops.
            headers = {

                "Host": "sec-m.ctrip.com",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
                "Accept": "*/*",
                "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
                "Accept-Encoding": "gzip, deflate, br",
                "Referer": "https://piao.ctrip.com/dest/t62366.html",
                "Content-Type": "application/json",
                "cookieorigin": "https://piao.ctrip.com",
                "Origin": "https://piao.ctrip.com",
                # NOTE(review): hard-coded Content-Length is fragile --
                # requests computes the real body length itself.
                "Content-Length": "301",
                "Connection": "keep-alive",
                # NOTE(review): the Cookie literal below was split across a
                # physical line break in the blog paste -- as written this
                # is a SyntaxError; rejoin it onto one line (the "整理"
                # section later in the file has the same code).
                "Cookie": "_RSG=IAKbfurzgT67orE1HidVdB; _RGUID=676af75e-2995-4292-b9df-9ec4413b5c87; _RDG=28082a428bdd5028b729a37e7358dc92a0; _ga=GA1.2.991982228.1577102675; MKT_CKID=1577102676203.kutn3.1xoq; _abtest_userid=6226bfd3-af4b-4b3e-9944-1481b77ec4e1; GUID=09031037111575175981; nfes_isSupportWebP=1; AHeadUserInfo=VipGrade=0&VipGradeName=%C6%D5%CD%A8%BB%E1%D4%B1&UserName=&NoReadMessageCount=1; UUID=2EB823AD79CF4F9782A3B6F1A4BC785A; IsPersonalizedLogin=F; MKT_Pagesource=PC; FlightIntl=Search=[%22HKG|%E9%A6%99%E6%B8%AF(HKG)|58|HKG|480%22%2C%22BJS|%E5%8C%97%E4%BA%AC(BJS)|1|BJS|480%22%2C%222020-03-02%22]; StartCity_Pkg=PkgStartCity=4; HotelCityID=28split%E5%A4%AA%E5%8E%9FsplitChengdusplit2020-3-8split2020-03-09split0; cticket=BE530BE5BB7E7811F220525E8C33169DD87C02F75B700B8245C86572EE2C02A2; ticket_ctrip=bJ9RlCHVwlu1ZjyusRi+ypZ7X2r4+yojzPTkvQyq9vImDzFP2o8THKDAyqgI/WdJuBe/PrX8z0uYhEi1ZEdd4z11qlPjrfQUSVGcfWg/4uN12MIM0gt3isFtN5F0BxIdZatidUi2UsVxx0i60xSK44YC5SJQx5m3ECqGoByW6jwfN/he0qPZxyQ1SaAJHlFcmtMH3529rMYVqLXi/AqJ6gIef4VItVL6I9SWudNZ04LL+pRNDTXIu0vl5ikJi29mpk0wo3EPysbqVbIg9DAlADh8vboQ2wJqh5AHiykS/CY=; DUID=u=6DAFC943576FCF25C099C08E47501D5F&v=0; IsNonUser=u=6DAFC943576FCF25C099C08E47501D5F&v=0; Session=SmartLinkCode=U155952&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=&SmartLinkLanguage=zh; __utma=1.991982228.1577102675.1584941982.1585295364.5; __utmz=1.1585295364.5.5.utmcsr=piao.ctrip.com|utmccn=(referral)|utmcmd=referral|utmcct=/dest/u-%E6%9D%AD%E5%B7%9E/s-tickets/; _RF1=116.132.81.181; _gid=GA1.2.754446494.1585453219; Union=OUID=index&AllianceID=4897&SID=155952&SourceID=&createtime=1585557042&Expires=1586161842447; MKT_CKID_LMT=1585557042481; gad_city=b6a29287793c7ffa817a1ee3a6776529; _bfs=1.8; _bfa=1.1577102671675.3iqikw.1.1585533737761.1585557037493.106.937.10650016817; _bfi=p1%3D10650000804%26p2%3D10650032348%26v1%3D937%26v2%3D936; 
_jzqco=%7C%7C%7C%7C1585557042635%7C1.2086882401.1577102676198.1585557078796.1585557097675.1585557078796.1585557097675.undefined.0.0.378.378; __zpspc=9.96.1585557042.1585557097.4%232%7Csp0.baidu.com%7C%7C%7C%25E6%2590%25BA%25E7%25A8%258B%7C%23; appFloatCnt=70",
                "Pragma": "no-cache",
                "Cache-Control": "no-cache",
                "TE": "Trailers"

            }

            # Request body: spot id + 1-based page number.
            da = {"viewid":id_,"pagenum":i,"head":{"appid":"100013776"}}

            resp = requests.post(url, headers=headers, data=json.dumps(da)).content.decode()

            js_info = json.loads(resp)

            # Comment records for this page.
            # NOTE(review): the loop body continues below the "数据清洗"
            # heading -- the paste split this script into fragments.
            c = js_info['data']['comments']
          
   

数据清洗

  # NOTE(review): this fragment lost its indentation in the blog paste --
  # "if c:" should align under "c = ..." inside the inner for loop (the
  # "整理" section later in the file has the intact version).
  if c:
                for k,info in enumerate(c):
                    # Skip comments shorter than 5 characters -- too short
                    # to carry signal (data cleaning at the source).
                    if len(info['content']) < 5:
                        continue
                    infos.append({
                        '评分':info['score'],
                        '时间':info['date'],
                        '内容':info['content']
                    })
            else:
                # An empty page means we ran past the last comment page.
                break
    # NOTE(review): bare "except:" also swallows KeyboardInterrupt and
    # SystemExit; "except Exception:" would be safer.
    except:
        print("外层错误"*30)
        continue

数据写入

 # Write this spot's comments to ./new_pl/<spot name>.csv.
    fileName = "./new_pl/{}.csv".format(names[k_n])


    data = pd.DataFrame(infos)
    # One CSV per scenic spot.  NOTE(review): the original comment claimed
    # 'a+' append mode, but mode='w' below overwrites any existing file.

    csv_headers = ['评分', '时间', '内容']
    data.to_csv(fileName, header=csv_headers, index=False, mode='w', encoding='utf_8_sig')

整理

import json
import random
import time
from pprint import pprint
import pandas as pd

import requests

import csv

from xlrd import open_workbook
import matplotlib.pyplot as plt
from pylab import mpl


# Use the SimHei font so matplotlib can render Chinese labels.
mpl.rcParams['font.sans-serif'] = ['SimHei']

ids = []    # scenic-spot ids (CSV column index 7)
names = []  # scenic-spot names (CSV column index 0)
# Read the previously scraped spot list, skipping the header row.
with open("信息/测试.csv", "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    for k, i in enumerate(reader):
        if k == 0:
            continue
        ids.append(i[7])  # column index 7 is the 'id' field (8th column)
        names.append(i[0])

print(ids[10:20])

print(names)

# Ctrip comment-list endpoint (JSON POST API).
url = "https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList"
# NOTE(review): this User-Agent pool is never used -- the request headers
# built below hard-code a single User-Agent.
ua = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
      "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]



# Crawl up to 149 comment pages for each of the first 20 scenic spots and
# write one CSV of (score, date, content) per spot into ./new_pl/.
for k_n, id_ in enumerate(ids[0:20]):
    infos = []                # comment records for the current spot
    print("", k_n + 1, "")    # progress: 1-based spot counter
    print(id_)
    time.sleep(1)

    # The headers never change between pages, so build them once per spot
    # (the original rebuilt this dict on every page request).  The original
    # also hard-coded "Content-Length": "301", which is wrong for most
    # payloads -- requests computes the correct length itself, so that
    # entry was dropped.
    # NOTE(review): the Cookie was captured from one browser session and
    # will expire; refresh it when the API starts rejecting requests.
    headers = {
        "Host": "sec-m.ctrip.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://piao.ctrip.com/dest/t62366.html",
        "Content-Type": "application/json",
        "cookieorigin": "https://piao.ctrip.com",
        "Origin": "https://piao.ctrip.com",
        "Connection": "keep-alive",
        # BUG FIX: in the original the Cookie literal was split across a
        # physical line break (a paste artefact), which is a SyntaxError;
        # it is rejoined onto a single line here.
        "Cookie": "_RSG=IAKbfurzgT67orE1HidVdB; _RGUID=676af75e-2995-4292-b9df-9ec4413b5c87; _RDG=28082a428bdd5028b729a37e7358dc92a0; _ga=GA1.2.991982228.1577102675; MKT_CKID=1577102676203.kutn3.1xoq; _abtest_userid=6226bfd3-af4b-4b3e-9944-1481b77ec4e1; GUID=09031037111575175981; nfes_isSupportWebP=1; AHeadUserInfo=VipGrade=0&VipGradeName=%C6%D5%CD%A8%BB%E1%D4%B1&UserName=&NoReadMessageCount=1; UUID=2EB823AD79CF4F9782A3B6F1A4BC785A; IsPersonalizedLogin=F; MKT_Pagesource=PC; FlightIntl=Search=[%22HKG|%E9%A6%99%E6%B8%AF(HKG)|58|HKG|480%22%2C%22BJS|%E5%8C%97%E4%BA%AC(BJS)|1|BJS|480%22%2C%222020-03-02%22]; StartCity_Pkg=PkgStartCity=4; HotelCityID=28split%E5%A4%AA%E5%8E%9FsplitChengdusplit2020-3-8split2020-03-09split0; cticket=BE530BE5BB7E7811F220525E8C33169DD87C02F75B700B8245C86572EE2C02A2; ticket_ctrip=bJ9RlCHVwlu1ZjyusRi+ypZ7X2r4+yojzPTkvQyq9vImDzFP2o8THKDAyqgI/WdJuBe/PrX8z0uYhEi1ZEdd4z11qlPjrfQUSVGcfWg/4uN12MIM0gt3isFtN5F0BxIdZatidUi2UsVxx0i60xSK44YC5SJQx5m3ECqGoByW6jwfN/he0qPZxyQ1SaAJHlFcmtMH3529rMYVqLXi/AqJ6gIef4VItVL6I9SWudNZ04LL+pRNDTXIu0vl5ikJi29mpk0wo3EPysbqVbIg9DAlADh8vboQ2wJqh5AHiykS/CY=; DUID=u=6DAFC943576FCF25C099C08E47501D5F&v=0; IsNonUser=u=6DAFC943576FCF25C099C08E47501D5F&v=0; Session=SmartLinkCode=U155952&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=&SmartLinkLanguage=zh; __utma=1.991982228.1577102675.1584941982.1585295364.5; __utmz=1.1585295364.5.5.utmcsr=piao.ctrip.com|utmccn=(referral)|utmcmd=referral|utmcct=/dest/u-%E6%9D%AD%E5%B7%9E/s-tickets/; _RF1=116.132.81.181; _gid=GA1.2.754446494.1585453219; Union=OUID=index&AllianceID=4897&SID=155952&SourceID=&createtime=1585557042&Expires=1586161842447; MKT_CKID_LMT=1585557042481; gad_city=b6a29287793c7ffa817a1ee3a6776529; _bfs=1.8; _bfa=1.1577102671675.3iqikw.1.1585533737761.1585557037493.106.937.10650016817; _bfi=p1%3D10650000804%26p2%3D10650032348%26v1%3D937%26v2%3D936; _jzqco=%7C%7C%7C%7C1585557042635%7C1.2086882401.1577102676198.1585557078796.1585557097675.1585557078796.1585557097675.undefined.0.0.378.378; __zpspc=9.96.1585557042.1585557097.4%232%7Csp0.baidu.com%7C%7C%7C%25E6%2590%25BA%25E7%25A8%258B%7C%23; appFloatCnt=70",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "TE": "Trailers",
    }

    try:
        for i in range(1, 150):
            # Randomised delay to stay under the site's rate limiting.
            time.sleep(random.randint(1, 5))

            # Request body: spot id + 1-based page number.
            da = {"viewid": id_, "pagenum": i, "head": {"appid": "100013776"}}

            resp = requests.post(url, headers=headers, data=json.dumps(da)).content.decode()
            js_info = json.loads(resp)

            c = js_info['data']['comments']
            if not c:
                # An empty page means we ran past the last comment page.
                break
            for info in c:
                # Comments shorter than 5 characters carry no signal;
                # dropping them here cleans the data at the source.
                if len(info['content']) < 5:
                    continue
                infos.append({
                    '评分': info['score'],
                    '时间': info['date'],
                    '内容': info['content']
                })
    except Exception:
        # BUG FIX: narrowed from a bare "except:", which also swallowed
        # KeyboardInterrupt/SystemExit.  Behaviour otherwise unchanged:
        # log and move on to the next spot.
        print("外层错误"*30)
        continue

    # One CSV per scenic spot; mode='w' overwrites a previous run's file.
    fileName = "./new_pl/{}.csv".format(names[k_n])

    data = pd.DataFrame(infos)
    csv_headers = ['评分', '时间', '内容']
    data.to_csv(fileName, header=csv_headers, index=False, mode='w', encoding='utf_8_sig')

数据可视化分析

标签热度

import csv

import xlrd

import matplotlib.pyplot as plt

from pylab import mpl

# Use the SimHei font so matplotlib can render Chinese labels.
mpl.rcParams['font.sans-serif'] = ['SimHei']

new_list = []       # all individual tag strings, cleaned

time_list = []      # raw tag-list strings, one per spot
# Read the scraped spot list, skipping the header row.
with open("景点基本信息.csv", "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)

    for k, i in enumerate(reader):

        if k == 0:
            continue

        time_list.append(i[5])  # column index 5 is the 标签 (tags) field

# Each row's tag field looks like "'甲', '乙'" (a set repr minus its
# braces), so split on commas and strip the quotes and spaces that the
# scraper's str(set) formatting left behind.
for i in time_list:
    temp_list = i.split(',')

    for i2 in temp_list:
        new_list.append(i2.replace("'", "").replace(" ",""))






# Count how often each tag occurs.  "榜单" is boilerplate, and "et(" is an
# artefact of str() applied to an EMPTY set during scraping ("set()" with
# the first and last characters sliced off), so both are skipped.
data_dict = {}
for tag in new_list:
    if tag == "榜单" or tag == "et(":
        continue
    # IDIOM: dict.get collapses the original if/else first-seen branches.
    data_dict[tag] = data_dict.get(tag, 0) + 1

# Keep the 20 most frequent tags, most common first.
new_data = sorted(data_dict.items(), key=lambda item: item[1], reverse=True)[0:20]

x = []  # tag labels
y = []  # tag frequencies

for i in new_data:
    x.append(i[0])
    y.append(i[1])

# Wide figure so 20 tag labels fit along the x axis.
plt.figure(figsize=(33,6))
plt.bar(x, y, align='center', width=0.8)
# Write the count above each bar.
for k,i in enumerate(y):
    plt.text(k,i,i)
# NOTE(review): the x axis holds TAGS, not spot names, so the title and
# xlabel text below are misleading.  "./images/" must already exist.
plt.title('景点热度图示')
plt.ylabel('热度')
plt.xlabel('景点名称')
plt.savefig("./images/标签.png")

 

A级景区比例

from fractions import Fraction

import matplotlib.pyplot as plt
from pylab import mpl
import csv

# 声明编码
# Use the SimHei font so matplotlib can render Chinese labels.
mpl.rcParams['font.sans-serif'] = ['SimHei']

temp = []   # star-rating value per spot (may be empty for unrated spots)
# Read the scraped spot list, skipping the header row.
with open("景点基本信息.csv", "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)

    for k, i in enumerate(reader):

        if k == 0:
            continue
        # Column index 1 is the 星级 (star rating) field.
        temp.append(i[1])

# Count occurrences of each rating, skipping unrated spots.
data_dict = {}
for k,i in enumerate(temp):
    if i == "":
        continue
    if i not in data_dict:
        data_dict[i] = 1
    else:
        data_dict[i] += 1

print(data_dict)
#
# print(data_dict)

# Sort (rating, count) pairs by count, descending -> list of tuples.

data_dict = sorted(data_dict.items(), key=lambda item: item[1], reverse=True)
print(data_dict)

nums = []           # occurrence count per rating
vles = []           # pie labels, e.g. "5A"

for i in data_dict:
    vles.append(i[0]+"A")
    nums.append(i[1])

# BUG FIX: the original divided by len(nums) -- the number of DISTINCT
# ratings -- instead of the total number of rated spots, so the computed
# "fractions" did not sum to 1.  plt.pie normalises its input, which
# masked the error in the rendered chart, but the printed values were
# wrong.  Divide by the true total instead.
sum_nums = sum(nums)

squares = [x/sum_nums for x in nums]
print(vles)
print(squares)

# aspect=1 keeps the pie circular.
plt.axes(aspect=1)
plt.pie(x=squares, labels=vles, autopct='%3.1f %%')
plt.savefig("./images/A级景区占比饼状图示.png")

月份热度(五大道为例)

import matplotlib.pyplot as plt

from pylab import mpl

# Use the SimHei font so matplotlib can render Chinese labels.
mpl.rcParams['font.sans-serif'] = ['SimHei']

file_name = "五大道"       # scenic-spot name; selects the per-spot CSV

time_list = []      # comment timestamps (CSV column index 1)
# Read this spot's comment file, skipping the header row.
# NOTE(review): this section relies on "import csv" from an earlier
# section of the file -- run standalone it would raise NameError.
with open("./new_pl/{}.csv".format(file_name), "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)

    for k, i in enumerate(reader):

        if k == 0:
            continue

        time_list.append(i[1])


# Bucket the comments by month number.
time_dict = {}
for i in time_list:
    # Characters 5:7 of a "YYYY-MM-DD" string are the month; the '/'
    # strip presumably handles dates written as "YYYY/M/D" -- TODO
    # confirm both formats occur in the scraped data.
    new_i = i[5:7].replace('/','')
    if new_i not in time_dict:
        time_dict[new_i] = 1
    else:
        time_dict[new_i] += 1


# Sort the buckets by month number, ascending.
new_data = sorted(time_dict.items(), key=lambda item: int(item[0]), reverse=False)


month = []  # month labels (x axis)
nums = []   # comment counts per month (y axis)

for i in new_data:
    month.append(i[0])
    nums.append(i[1])


plt.plot(month, nums, linewidth=2,)  # line chart: comments per month
plt.title("景区月份热度图示", fontsize=24)  # chart title
plt.xlabel("月份", fontsize=14)  # x axis: month
plt.ylabel("数量", fontsize=14)  # y axis: number of comments
plt.tick_params(axis='both', labelsize=14)  # tick label size on both axes
plt.savefig("./images/五大道淡旺季.png")  # save to disk (not shown interactively)

 

医护情况

from fractions import Fraction

import matplotlib.pyplot as plt
from pylab import mpl
import csv

# 声明编码
# Use the SimHei font so matplotlib can render Chinese labels.
mpl.rcParams['font.sans-serif'] = ['SimHei']


desc = []   # feature/intro text per spot (CSV column index 4)

# Read the scraped spot list, skipping the header row.
with open("景点基本信息.csv", "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)

    for k, i in enumerate(reader):

        if k == 0:
            continue
        # Column index 4 is the 简介 (feature) field.
        desc.append(i[4])

# Count how many top spots advertise free entry for medical staff.
# Only the first 100 rows are considered: lower-ranked tiny spots are
# unmanaged and out of scope for this analysis.
sample = desc[0:100]

t = 0   # spots whose feature text is the free-for-medics banner
f = 0   # everything else

for i in sample:
    # NOTE(review): exact string comparison -- if the banner is embedded
    # inside a longer feature text this undercounts; consider
    # ("医护人员免费" in i) instead.  TODO confirm against the data.
    if i == "医护人员免费":
        t += 1
    else:
        f += 1


vles = ['医护免费景区','其他']

# BUG FIX: the original hard-coded sum_nums = 100; if the CSV holds fewer
# than 100 rows the computed fractions were wrong.  Use the actual sample
# size (identical result when exactly 100 rows are present).
sum_nums = len(sample)

squares = [x/sum_nums for x in [t,f]]

# aspect=1 keeps the pie circular.
plt.axes(aspect=1)
plt.pie(x=squares, labels=vles, autopct='%3.1f %%')
plt.savefig("./images/医护福利占比饼状图示.png")

五、总结

1.经过对数据的分析和可视化,从几个图形分析得出:人们更愿意就近游玩,8-10月为旅游旺季,4A景点占据主流,过半景点没有医护免费政策。

2.小结:在这次对旅游景点要素分析中,我从中学会了很多知识。很多次都卡在一个点上,绞尽脑汁去想解决问题的办法,通过观看b站的视频,百度搜索去找寻答案,求助于热心网友。认识到了编程的丰富内容,对于旅游要注意的几个内容有了了解。深刻意识到了难者不会会者不难,熟能生巧,要多做多练的道理。

猜你喜欢

转载自www.cnblogs.com/hyq1/p/12696760.html