The winter vacation has come, and I want to play. As an authentic Chongqing power grid brother, I want to use python crawler + data analysis to tell you what places in Chongqing are fun.
First, here is the resulting map of the scenic spots' location distribution.
Data source: Qunar (qunar.com) travel site
Website: Qunar — Chongqing attractions
Use request to request json data
Part 1: Crawler
Data search: small test
import requests

# Smoke test: request the first page of Qunar's ticket-search JSON API
# and dump the raw 'data' payload so we can inspect its structure.
keyword = "重庆"
page = 1  # fetch only the first page for this test
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
# NOTE: the article showed '®ion=' in the URL -- an HTML-entity mojibake
# of '&region=' ('&reg' rendered as the ® sign). Restored here.
url = f'http://piao.qunar.com/ticket/list.json?keyword={keyword}&region=&from=mpl_search_suggest&page={page}'
res = requests.request("GET", url, headers=headers)
try:
    res_json = res.json()
    data = res_json['data']
    print(data)
except (ValueError, KeyError) as e:  # invalid JSON or missing 'data' key
    print(f"failed to parse response: {e}")
The result:
The data format returned by json is dictionary type, and we need to find the keywords I am interested in.
Search results
We found that we are interested in sightList
So you can modify the code as
import requests

# Fetch page 1 and print each record in 'sightList' (the scenic-spot list).
keyword = "重庆"
page = 1
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
# '&region=' restored; the article's '®ion=' was an HTML-entity mojibake.
url = f'http://piao.qunar.com/ticket/list.json?keyword={keyword}&region=&from=mpl_search_suggest&page={page}'
res = requests.request("GET", url, headers=headers)
res_json = res.json()
sightLists = res_json['data']['sightList']  # sightList holds the scenic-spot records
for sight in sightLists:
    print(sight)
Extract the information again and modify the code as
import requests
import pandas as pd  # kept for parity with the article; unused in this snippet

# Fetch page 1 and print the fields of interest for every attraction.
keyword = "重庆"
page = 1  # first page only
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
# '&region=' restored; the article's '®ion=' was an HTML-entity mojibake.
url = f'http://piao.qunar.com/ticket/list.json?keyword={keyword}&region=&from=mpl_search_suggest&page={page}'
res = requests.request("GET", url, headers=headers)
res_json = res.json()
sightLists = res_json['data']['sightList']  # sightList holds the scenic-spot records
for sight in sightLists:
    # dict.get(key) returns None for missing keys -- identical to the original
    # `sight[k] if k in sight.keys() else None` pattern, in a single lookup.
    name = sight.get('sightName')         # name
    districts = sight.get('districts')    # address
    star = sight.get('star')              # star rating
    qunarPrice = sight.get('qunarPrice')  # lowest price
    saleCount = sight.get('saleCount')    # number of buyers
    score = sight.get('score')            # rating score
    point = sight.get('point')            # coordinates
    intro = sight.get('intro')            # description
    print('名称:{0},地址:{1},星级:{2},价格:{3},saleCount:{4},评分:{5},坐标:{6},介绍:{7}'.format(name,districts,star,qunarPrice,saleCount,score,point,intro))
We need to write data into the table.
import requests
import pandas as pd
import numpy as np  # kept for parity with the article; no longer needed below

# Fetch page 1 and append all attractions to a CSV file.
keyword = "重庆"
page = 1  # first page only
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
# '&region=' restored; the article's '®ion=' was an HTML-entity mojibake.
url = f'http://piao.qunar.com/ticket/list.json?keyword={keyword}&region=&from=mpl_search_suggest&page={page}'
res = requests.request("GET", url, headers=headers)
res_json = res.json()
sightLists = res_json['data']['sightList']  # sightList holds the scenic-spot records
# Collect one tuple per attraction, then write a single DataFrame --
# replaces the original per-row np.array(...).reshape(-1, 8) round-trip.
rows = []
for sight in sightLists:
    rows.append((
        sight.get('sightName'),   # name
        sight.get('districts'),   # address
        sight.get('star'),        # star rating
        sight.get('qunarPrice'),  # lowest price
        sight.get('saleCount'),   # number of buyers
        sight.get('score'),       # rating score
        sight.get('point'),       # coordinates
        sight.get('intro'),       # description
    ))
shuju = pd.DataFrame(rows, columns=['名称', '地址', '星级', '最低价格', '购买人数', '评分', '坐标位置', '介绍'])
shuju.to_csv('重庆景点数据.csv', mode='a+', index=False, header=False)  # mode='a+' appends
Multi-page crawling
Taking one page of data as an example, we have sorted out the rough code above; now we need to crawl multiple pages.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author: yudengwu 余登武
# @Date : 2021/1/30
import requests
import pandas as pd
import numpy as np  # kept for parity with the article; no longer needed below
import random
from time import sleep


def get_data(keyword, page):
    """Fetch one result page for *keyword* from Qunar's ticket-search API
    and append the attractions to '重庆景点数据.csv'.

    A page whose response cannot be parsed (bad JSON, missing keys) is
    skipped with a message instead of silently passing.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
    # '&region=' restored; the article's '®ion=' was an HTML-entity mojibake.
    url = f'http://piao.qunar.com/ticket/list.json?keyword={keyword}&region=&from=mpl_search_suggest&page={page}'
    res = requests.request("GET", url, headers=headers)
    sleep(random.uniform(1, 2))  # polite delay so we don't hammer the server
    try:
        sightLists = res.json()['data']['sightList']  # sightList holds the records
    except (ValueError, KeyError, TypeError) as e:  # bad JSON / unexpected structure
        print(f"page {page}: no usable data ({e})")
        return
    # One tuple per attraction; missing keys become None via dict.get.
    rows = [(
        sight.get('sightName'),   # name
        sight.get('districts'),   # address
        sight.get('star'),        # star rating
        sight.get('qunarPrice'),  # lowest price
        sight.get('saleCount'),   # number of buyers
        sight.get('score'),       # rating score
        sight.get('point'),       # coordinates
        sight.get('intro'),       # description
    ) for sight in sightLists]
    shuju = pd.DataFrame(rows, columns=['名称', '地址', '星级', '最低价格', '购买人数', '评分', '坐标位置', '介绍'])
    shuju.to_csv('重庆景点数据.csv', mode='a+', index=False, header=False)  # mode='a+' appends


if __name__ == '__main__':
    keyword = "重庆"
    for page in range(1, 75):  # number of pages to crawl
        print(f"正在提取第{page}页")
        sleep(random.uniform(1, 2))
        get_data(keyword, page)
More than 1,000 pieces of data, so many interesting things in Chongqing
Part 2: Data Analysis
We crawled the data earlier, now let’s analyze it.
1. Read the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly with SimHei
# The crawler wrote the CSV without a header row, so supply the column names here.
# (Dropped the redundant list([...]) wrapper around the list literal.)
df = pd.read_csv('重庆景点数据.csv', header=None,
                 names=['名称', '地址', '星级', '最低价格', '购买人数', '评分', '坐标位置', '介绍'])
df = df.drop_duplicates()  # repeated crawler runs append duplicates; 470 rows remain
print(df.head())
After removing the duplicate data, we get that there are 470 scenic spots in Chongqing
2. Analysis of the price of scenic spots
Top 20
# Top-20 most expensive attractions as a horizontal bar chart.
price_by_name = df.pivot_table(index='名称', values='最低价格')
most_expensive = price_by_name.sort_values('最低价格', ascending=False).head(20)
most_expensive.plot(kind='barh')
plt.title('价格最高Top20')
plt.show()
Lowest Top20
# Top-20 cheapest attractions as a horizontal bar chart.
price_by_name = df.pivot_table(index='名称', values='最低价格')
cheapest = price_by_name.sort_values('最低价格', ascending=True).head(20)
cheapest.plot(kind='barh')
plt.title('最低Top20')
plt.show()
3. Analysis of scenic spots scores
Top 20 with the highest scores
# Top-20 highest-rated attractions (horizontal red bars).
score_by_name = df.pivot_table(index='名称', values='评分')
best_rated = score_by_name.sort_values('评分', ascending=False).head(20)
best_rated.plot(kind='barh', color='red')  # barh = horizontal bar chart
plt.title('评分最高Top20')
plt.show()
The lowest rated Top20
# Top-20 lowest-rated attractions (horizontal red bars).
score_by_name = df.pivot_table(index='名称', values='评分')
worst_rated = score_by_name.sort_values('评分', ascending=True).head(20)
worst_rated.plot(kind='barh', color='red')  # barh = horizontal bar chart
plt.title('评分最低Top20')
plt.show()
No rating (maybe the site has not yet included the rating of this place...)
4. Monthly sales analysis
Top 20
# Top-20 attractions by number of buyers (horizontal green bars).
sales_by_name = df.pivot_table(index='名称', values='购买人数')
best_sellers = sales_by_name.sort_values('购买人数', ascending=False).head(20)
best_sellers.plot(kind='barh', color='green')  # barh = horizontal bar chart
plt.title('月销售额最高Top20')
plt.show()
Lowest Top20 (may not include the data of this place, maybe this place is free)
5. Attraction level distribution
from pyecharts.charts import *
from pyecharts import options as opts
from pyecharts.globals import ThemeType

# Count how many attractions fall into each star level, most common first.
df_star = df["星级"].value_counts().sort_values(ascending=False)
print(df_star)
Find the names of scenic spots with ratings, that is, 3 stars and above
# Names of rated attractions (star level not '无'), highest star level first.
rated = df[df["星级"] != '无'].sort_values("星级", ascending=False)
print(rated['名称'])
Show some pictures, too many
6. Map drawing of scenic spots address
first save the text file
# Split the "lon,lat" coordinate strings into separate columns for the map step.
# Split once and reuse both parts (the original called str.split twice).
lon_lat = df["坐标位置"].str.split(",", expand=True)
df["lon"] = lon_lat[0]  # longitude
df["lat"] = lon_lat[1]  # latitude
df.to_csv("data重庆.csv")  # persist (with the default index) for the map script
Draw a map
import pandas as pd

stations = pd.read_csv('data重庆.csv', delimiter=',')

from pyecharts.charts import Geo
from pyecharts import options
from pyecharts.globals import GeoType

# Base map of Chongqing; every attraction is registered as a named coordinate.
g = Geo().add_schema(maptype="重庆")
for idx in stations.index:
    row = stations.iloc[idx]
    g.add_coordinate(row['名称'], row['lon'], row['lat'])  # name, longitude, latitude
# Each point carries a dummy value of 1 -- only its position matters here.
data_pair = [(stations.iloc[idx]['名称'], 1) for idx in stations.index]
# Scatter layer with small animated markers.
g.add('', data_pair, type_=GeoType.EFFECT_SCATTER, symbol_size=2)
g.set_series_opts(label_opts=options.LabelOpts(is_show=False))  # hide per-point labels
g.set_global_opts(title_opts=options.TitleOpts(title="重庆景点分布图by-yudengwu"))
# Write the interactive map to an HTML file.
result = g.render('stations.html')
Much more fun in the main city
Author: Electrical - Yudeng Wu. Writing is not easy, if you think it’s good, please give it a thumbs up and leave.