Crawler in practice: a web crawler plus data analysis. A Chongqing electrical engineer's article walks you through all the tourist attractions in Chongqing

Winter vacation is here, and I want to go out and have some fun. As a born-and-raised Chongqing power-grid guy, I'll use a Python crawler plus data analysis to show you which places in Chongqing are worth visiting.

First, here is the final map of the scenic spots' location distribution.

Data source: Qunar (去哪儿)

Website: Qunar travel, Chongqing (piao.qunar.com)

We use the requests library to fetch the JSON data.

Part 1: Crawler

Finding the data: a quick test

import requests

keyword = "重庆"
page = 1  # fetch the first page

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
url = f'http://piao.qunar.com/ticket/list.json?keyword={keyword}&region=&from=mpl_search_suggest&page={page}'
res = requests.request("GET", url, headers=headers)

try:
    res_json = res.json()
    data = res_json['data']
    print(data)
except Exception:
    pass  # ignore responses that are not valid JSON

The result:

The JSON response parses into a Python dictionary, and we need to find the keys we are interested in. Searching through the result shows that the key we want is sightList.
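To locate such a key yourself, you can inspect the parsed dictionary level by level. A minimal sketch, reusing the res_json object from the snippet above:

print(res_json.keys())          # the top-level keys of the response
print(res_json['data'].keys())  # the keys under 'data'; sightList is among them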

So we can modify the code as follows:

import requests

keyword = "重庆"
page = 1

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
url = f'http://piao.qunar.com/ticket/list.json?keyword={keyword}&region=&from=mpl_search_suggest&page={page}'
res = requests.request("GET", url, headers=headers)

res_json = res.json()
sightLists = res_json['data']['sightList']  # sightList holds the attraction records
for sight in sightLists:
    print(sight)
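Before picking out fields, it helps to pretty-print one record to see everything it contains. A small sketch using the standard-library json module on the sightLists from above:

import json

# Pretty-print the first attraction record to see all available fields
print(json.dumps(sightLists[0], ensure_ascii=False, indent=2))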

Next we extract the individual fields; modify the code as follows:

import requests

keyword = "重庆"
page = 1  # fetch the first page

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
url = f'http://piao.qunar.com/ticket/list.json?keyword={keyword}&region=&from=mpl_search_suggest&page={page}'
res = requests.request("GET", url, headers=headers)

res_json = res.json()
sightLists = res_json['data']['sightList']  # sightList holds the attraction records
for sight in sightLists:
    name = (sight['sightName'] if 'sightName' in sight.keys() else None)  # name
    districts = (sight['districts'] if 'districts' in sight.keys() else None)  # address
    star = (sight['star'] if 'star' in sight.keys() else None)  # star level
    qunarPrice = (sight['qunarPrice'] if 'qunarPrice' in sight.keys() else None)  # lowest price
    saleCount = (sight['saleCount'] if 'saleCount' in sight.keys() else None)  # number of purchases
    score = (sight['score'] if 'score' in sight.keys() else None)  # rating
    point = (sight['point'] if 'point' in sight.keys() else None)  # coordinates
    intro = (sight['intro'] if 'intro' in sight.keys() else None)  # introduction
    print('名称:{0},地址:{1},星级:{2},价格:{3},saleCount:{4},评分:{5},坐标:{6},介绍:{7}'.format(name, districts, star, qunarPrice, saleCount, score, point, intro))
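As an aside, the conditional-expression pattern used above can be written more compactly with dict.get, which returns None for missing keys by default. The same extraction, sketched for two of the fields:

# dict.get returns None when the key is absent, so this is equivalent
name = sight.get('sightName')       # name
districts = sight.get('districts')  # address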

Now we need to write the data into a table (a CSV file).

import requests
import pandas as pd
import numpy as np

keyword = "重庆"
page = 1  # fetch the first page

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
url = f'http://piao.qunar.com/ticket/list.json?keyword={keyword}&region=&from=mpl_search_suggest&page={page}'
res = requests.request("GET", url, headers=headers)

res_json = res.json()
sightLists = res_json['data']['sightList']  # sightList holds the attraction records
for sight in sightLists:
    name = (sight['sightName'] if 'sightName' in sight.keys() else None)  # name
    districts = (sight['districts'] if 'districts' in sight.keys() else None)  # address
    star = (sight['star'] if 'star' in sight.keys() else None)  # star level
    qunarPrice = (sight['qunarPrice'] if 'qunarPrice' in sight.keys() else None)  # lowest price
    saleCount = (sight['saleCount'] if 'saleCount' in sight.keys() else None)  # number of purchases
    score = (sight['score'] if 'score' in sight.keys() else None)  # rating
    point = (sight['point'] if 'point' in sight.keys() else None)  # coordinates
    intro = (sight['intro'] if 'intro' in sight.keys() else None)  # introduction

    # Wrap the fields into a one-row DataFrame and append it to the CSV
    shuju = np.array((name, districts, star, qunarPrice, saleCount, score, point, intro))
    shuju = shuju.reshape(-1, 8)
    shuju = pd.DataFrame(shuju, columns=['名称', '地址', '星级', '最低价格', '购买人数', '评分', '坐标位置', '介绍'])

    shuju.to_csv('重庆景点数据.csv', mode='a+', index=False, header=False)  # mode='a+' appends
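One caveat: mode='a+' appends on every run, so executing the script twice leaves duplicate rows in the CSV (the analysis later drops duplicates, which partially masks this). If you want a clean file per crawl, delete the old one first; a small sketch, using the same file name as above:

import os

# Remove any previous output so a rerun does not accumulate duplicate rows
if os.path.exists('重庆景点数据.csv'):
    os.remove('重庆景点数据.csv')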

Multi-page crawling

Taking one page of data as the example, we have sorted out the rough code above; now we need to crawl all the pages.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author: yudengwu 余登武
# @Date  : 2021/1/30
import requests
import pandas as pd
import numpy as np
import random
from time import sleep


def get_data(keyword, page):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
    url = f'http://piao.qunar.com/ticket/list.json?keyword={keyword}&region=&from=mpl_search_suggest&page={page}'
    res = requests.request("GET", url, headers=headers)
    sleep(random.uniform(1, 2))  # pause between requests to stay polite
    try:
        res_json = res.json()
        sightLists = res_json['data']['sightList']  # sightList holds the attraction records
        for sight in sightLists:
            name = (sight['sightName'] if 'sightName' in sight.keys() else None)  # name
            districts = (sight['districts'] if 'districts' in sight.keys() else None)  # address
            star = (sight['star'] if 'star' in sight.keys() else None)  # star level
            qunarPrice = (sight['qunarPrice'] if 'qunarPrice' in sight.keys() else None)  # lowest price
            saleCount = (sight['saleCount'] if 'saleCount' in sight.keys() else None)  # number of purchases
            score = (sight['score'] if 'score' in sight.keys() else None)  # rating
            point = (sight['point'] if 'point' in sight.keys() else None)  # coordinates
            intro = (sight['intro'] if 'intro' in sight.keys() else None)  # introduction

            shuju = np.array((name, districts, star, qunarPrice, saleCount, score, point, intro))
            shuju = shuju.reshape(-1, 8)
            shuju = pd.DataFrame(shuju, columns=['名称', '地址', '星级', '最低价格', '购买人数', '评分', '坐标位置', '介绍'])

            shuju.to_csv('重庆景点数据.csv', mode='a+', index=False, header=False)  # mode='a+' appends
    except Exception:
        pass  # skip pages that fail to parse (e.g. past the last page)


if __name__ == '__main__':
    keyword = "重庆"
    for page in range(1, 75):  # controls how many pages to crawl
        print(f"Fetching page {page}")
        sleep(random.uniform(1, 2))
        get_data(keyword, page)
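The bare except/pass in get_data silently swallows any failure, including timeouts and anti-crawler pages. If you prefer the crawler to fail loudly, you could harden the request line; a hedged sketch, not the original code:

# Sketch: add a timeout and raise on HTTP errors instead of failing silently
res = requests.get(url, headers=headers, timeout=10)
res.raise_for_status()  # raises HTTPError for 4xx/5xx responses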

More than 1,000 rows of data. There are so many interesting places in Chongqing!

Part 2: Data Analysis

We crawled the data above; now let's analyze it.

1. Read the data

import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can display Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

df = pd.read_csv('重庆景点数据.csv', header=None,
                 names=['名称', '地址', '星级', '最低价格', '购买人数', '评分', '坐标位置', '介绍'])
df = df.drop_duplicates()  # drop duplicate rows; 470 rows remain
print(df.head())

After removing duplicates, we find there are 470 scenic spots in Chongqing.

2. Price analysis of the scenic spots
Highest prices: Top 20

df_qunarPrice = df.pivot_table(index='名称', values='最低价格')
df_qunarPrice.sort_values('最低价格', inplace=True, ascending=False)  # descending
# print(df_qunarPrice[:20])  # top 20 highest prices
df_qunarPrice[:20].plot(kind='barh')
plt.title('价格最高Top20')
plt.show()
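A note on pivot_table: its default aggregation is the mean, so if a name appeared more than once the prices would be averaged. Assuming each name is unique after deduplication, an equivalent and more direct form is:

# Equivalent without pivot_table: index by name and sort the price column
top20 = df.set_index('名称')['最低价格'].sort_values(ascending=False)[:20]
top20.plot(kind='barh')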

Lowest prices: Top 20

df_qunarPrice = df.pivot_table(index='名称', values='最低价格')
df_qunarPrice.sort_values('最低价格', inplace=True, ascending=True)  # ascending
# print(df_qunarPrice[:20])  # top 20 lowest prices
df_qunarPrice[:20].plot(kind='barh')
plt.title('最低Top20')
plt.show()

3. Rating analysis of the scenic spots
Highest ratings: Top 20

# Top 20 attractions by rating
df_score = df.pivot_table(index='名称', values='评分')
df_score.sort_values('评分', inplace=True, ascending=False)
df_score[:20].plot(kind='barh', color='red')  # barh: horizontal bar chart
plt.title('评分最高Top20')
plt.show()

Lowest ratings: Top 20

df_score = df.pivot_table(index='名称', values='评分')
df_score.sort_values('评分', inplace=True, ascending=True)
df_score[:20].plot(kind='barh', color='red')  # barh: horizontal bar chart
plt.title('评分最低Top20')
plt.show()

Some attractions have no rating (perhaps the site simply has not collected ratings for these places yet...).
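To check how many attractions lack a rating, one quick line (an assumption: missing scores are parsed as NaN when the CSV is read):

print(df['评分'].isna().sum())  # number of attractions with no rating, assuming NaN for missing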

4. Monthly sales analysis
Highest monthly sales: Top 20

df_saleCount = df.pivot_table(index='名称', values='购买人数')
df_saleCount.sort_values('购买人数', inplace=True, ascending=False)
df_saleCount[:20].plot(kind='barh', color='green')  # barh: horizontal bar chart
plt.title('月销售额最高Top20')
plt.show()

Lowest monthly sales: Top 20 (the site may simply have no sales data for these places, or perhaps they are free)

5. Attraction level distribution

from pyecharts.charts import *
from pyecharts import options as opts
from pyecharts.globals import ThemeType

df_star = df["星级"].value_counts()
df_star = df_star.sort_values(ascending=False)
print(df_star)
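The pyecharts imports above can also be used to turn this distribution into a pie chart. A hedged sketch (the title string and output file name are my own):

# Sketch: visualize the star-level distribution with pyecharts
pie = (
    Pie(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add('', [(str(k), int(v)) for k, v in df_star.items()])
    .set_global_opts(title_opts=opts.TitleOpts(title='景点星级分布'))
    .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}: {c}'))
)
pie.render('star.html')  # writes an interactive chart to star.html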

Find the names of the scenic spots that have a star rating, i.e. 3 stars and above:

print(df[df["星级"]!='无'].sort_values("星级",ascending=False)['名称'])

Only part of the output is shown here; there are too many to list.

6. Mapping the attraction locations
First, save the data to a file with the coordinates split into longitude and latitude:

df["lon"] = df["坐标位置"].str.split(",",expand=True)[0]#经度
df["lat"] = df["坐标位置"].str.split(",",expand=True)[1]#纬度
df.to_csv("data重庆.csv")
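If any rows have an empty 坐标位置, the split produces NaN longitude and latitude and the map step below will choke on them. A safe line to add before the split (an assumption, in case the crawl returned rows without coordinates):

df = df.dropna(subset=['坐标位置'])  # keep only rows that actually have coordinates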

Draw a map

import pandas as pd

stations = pd.read_csv('data重庆.csv', delimiter=',')

from pyecharts.charts import Geo
from pyecharts import options
from pyecharts.globals import GeoType

g = Geo().add_schema(maptype="重庆")

# Register each attraction's coordinates with the chart
for i in stations.index:
    s = stations.iloc[i]
    g.add_coordinate(s['名称'], s['lon'], s['lat'])  # place name, longitude, latitude

# Give every point the value 1
data_pair = [(stations.iloc[i]['名称'], 1) for i in stations.index]

# Draw the chart
g.add('', data_pair, type_=GeoType.EFFECT_SCATTER, symbol_size=2)
g.set_series_opts(label_opts=options.LabelOpts(is_show=False))
g.set_global_opts(title_opts=options.TitleOpts(title="重庆景点分布图by-yudengwu"))

# Save the result to an HTML file
result = g.render('stations.html')
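render returns the path of the generated file; open stations.html in a browser to explore the interactive map. For convenience, you can open it straight from Python:

import os
import webbrowser

# Open the rendered map in the default browser
webbrowser.open('file://' + os.path.realpath(result))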

Most of the fun places are concentrated in the main urban area.


Author: Yudengwu (余登武), electrical engineering. Writing is not easy; if you think this article is good, please give it a like before you leave.


Source: blog.csdn.net/kobeyu652453/article/details/113417002