from lxml import etree
import urllib
import urllib.request
import xlwt
import pandas as pd
from pyecharts import Geo
import matplotlib.pyplot as plt
import matplotlib as mpl
def getpage(url):
req=urllib.request.Request(url)
req.add_header('User-Agent') #添加自己的用户代理
data=urllib.request.urlopen(req).read().decode("gbk")
return data
def getdata(data):
AQI=[]
start=0
html=etree.HTML(data)
infor=html.xpath('//li[@id!="tr-fixed"]//text()') #利用xpath解析路径
while True:
if start<len(infor):
AQI.append(infor[start:start+5])
start=start+5
else:
break
return AQI
#写入excel表格
def writeExcel(AQI):
f=xlwt.Workbook()
sheet1=f.add_sheet('The AQI',cell_overwrite_ok=True)
rowTitle=['order','city','province','AQI','quality']
for i in range(len(rowTitle)):
sheet1.write(0,i,rowTitle[i])
for j in range(len(AQI)):
for k in range(len(AQI[j])):
sheet1.write(j+1,k,AQI[j][k])
f.save("E:\\python\\aqi.xls")
if __name__=="__main__":
url="http://tianqi.2345.com/air-rank.htm"
data=getpage(url)
AQI=getdata(data)
writeExcel(AQI)
city=[];value=[]
fbook=pd.DataFrame(pd.read_excel("E:\\python\\aqi.xls",0))
for each in fbook['city']:
city.append(str(each))
for each in fbook['AQI']:
value.append(each)
for order,quality in zip(fbook['order'],fbook['quality']):
if quality=="中度污染":
index=order-1
break
geo = Geo("全国空气质量指数", "Data from AQI", title_color="#fff", width=1000, height=600, \
background_color='#404a59')
geo.add("空气质量指数", city,value, visual_range=[1,60], maptype='china', type='effectScatter', \
visual_text_color="#fff",effect_scale=5,symbol_size=15, is_visualmap=True,is_random=True,is_roam=False)
geo.render(path="全国空气质量指数.html")
fig=plt.figure()
ax=fig.add_subplot(111)
rects = ax.bar(range(len(city)-index),value[index:],tick_label=city[index:],color='r')
plt.grid()
plt.xlabel('城市')
plt.ylabel('空气质量指数')
plt.title("中度污染、重度污染、严重污染城市")
mpl.rcParams["font.sans-serif"] = ["KaiTi"]
mpl.rcParams["axes.unicode_minus"] = False
plt.show()
爬取到的大数据如下所示: