"""Download historical daily weather records and save them to an Excel file.

Monthly JavaScript data files for location code 52866 are fetched from
tianqi.2345.com, the per-day fields are extracted with regular expressions,
and every row is written to ``西宁.xls``.
"""
import re  # regular-expression operations

import requests
import xlwt

# Base address of the monthly history files.
START_URL = "http://tianqi.2345.com/t/wea_history/js/"

# Header row; it becomes the first row of the spreadsheet.
HEADER = ["日期", "最高气温", "最低气温", "天气", "风向", "风力",
          "空气质量指数", "空气质量评价", "空气质量等级"]

# (year, months) pairs to download, in order. 2018 only covers January-April.
_YEAR_MONTHS = (
    (2016, range(1, 13)),
    (2017, range(1, 13)),
    (2018, range(1, 5)),
)

# One pattern per output column, in column order: date, high temperature,
# low temperature, weather, wind direction, wind force, AQI, AQI description,
# AQI level.
_FIELD_PATTERNS = (
    r"ymd:'(.*?)',",
    r"bWendu:'(.*?)℃',",
    r"yWendu:'(.*?)℃',",
    r"tianqi:'(.*?)',",
    r"fengxiang:'(.*?)',",
    r",fengli:'(.*?)'",
    r"aqi:'(.*?)',",
    r"aqiInfo:'(.*?)',",
    r",aqiLevel:'(.*?)'",
)


def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    The response encoding is replaced by the one detected from the content
    before decoding.

    Args:
        url: Address to download.

    Returns:
        The decoded body, or ``None`` when the request fails (connection
        error, timeout, or an HTTP error status). The failure message is
        printed.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # switch to the detected encoding
        return r.text
    except requests.RequestException as e:
        print(str(e))
        return None


def parsePage(ilt, html):
    """Extract the daily records found in *html* and append them to *ilt*.

    Each appended row is a list of nine strings in the order of
    ``_FIELD_PATTERNS``.

    Args:
        ilt: List that receives one row per day; it is modified in place.
        html: Text of one monthly data file, or ``None`` if the download
            failed.
    """
    if html is None:
        # A failed download is reported the same way as a parse failure.
        print("异常")
        return

    columns = [re.findall(pattern, html) for pattern in _FIELD_PATTERNS]
    # Every day has a date, so the date column drives the iteration.
    day_count = len(columns[0])
    try:
        for i in range(day_count):
            ilt.append([column[i] for column in columns])
    except IndexError:
        # Some field had fewer matches than there are dates; rows collected
        # before the gap are kept and the rest of this page is skipped.
        print("异常")


def printGoodsList(ilt):
    """Write the rows in *ilt* to sheet ``sheet1`` of ``西宁.xls``.

    Args:
        ilt: Sequence of rows; ``ilt[i][j]`` is written to row ``i``,
            column ``j``.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("sheet1")
    for row_idx, row in enumerate(ilt):
        for col_idx, value in enumerate(row):
            sheet.write(row_idx, col_idx, value)
    workbook.save("西宁.xls")


def _month_url(year, month):
    """Return the data-file URL for the given year and month."""
    if year == 2016 and month in (1, 2):
        # These two months use a different layout: no month directory and no
        # zero padding.
        # NOTE(review): this yields a double slash after "js" because
        # START_URL already ends with "/"; kept as in the original — confirm
        # against the site.
        return START_URL + "/52866_" + str(year) + str(month) + ".js"
    stamp = "{}{:02d}".format(year, month)
    return START_URL + stamp + "/52866_" + stamp + ".js"


def main():
    """Download every configured month and save all rows to the spreadsheet."""
    rows = [list(HEADER)]
    for year, months in _YEAR_MONTHS:
        for month in months:
            html = getHTMLText(_month_url(year, month))
            # parsePage appends to ``rows`` in place.
            parsePage(rows, html)
    printGoodsList(rows)


if __name__ == "__main__":
    main()
# 欢迎大家来指教 (Feedback and suggestions are welcome.)