"""Download monthly weather-history pages and export the daily records to .xls."""

import re  # Regular expression operations

import requests
import xlwt

# One pattern per output column, in output order. The date pattern comes first
# because every day is expected to carry a date.
_FIELD_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"ymd:'(.*?)',",
        r"bWendu:'(.*?)℃',",
        r"yWendu:'(.*?)℃',",
        r"tianqi:'(.*?)',",
        # The original pattern was padded with spaces, unlike its siblings.
        r"fengxiang:'(.*?)',",
        r",fengli:'(.*?)'",
        r"aqi:'(.*?)',",
        r"aqiInfo:'(.*?)',",
        # The original pattern was padded with spaces, unlike its siblings.
        r",aqiLevel:'(.*?)'",
    )
)


def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    The request uses a 30 second timeout. A request failure (including a
    non-2xx status) is printed and ``None`` is returned instead of raising.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode the body with the encoding detected from its content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        print(str(e))
        return None


def parsePage(ilt, html):
    """Extract the daily records from *html* and append them to *ilt*.

    Each appended row is a list of nine strings: date, maximum temperature,
    minimum temperature, weather, wind direction, wind force, air quality
    index, air quality evaluation and air quality level.

    If *html* is not text (for example ``None`` after a failed download), or
    if some field occurs fewer times than the date field, ``" exception "`` is
    printed. Rows that could be assembled completely are still appended.
    """
    try:
        columns = [pattern.findall(html) for pattern in _FIELD_PATTERNS]
    except TypeError:
        print(" exception ")
        return

    # zip() stops at the shortest column, so only complete rows are stored.
    ilt.extend(list(row) for row in zip(*columns))

    # The date column is the benchmark: any shorter column means at least one
    # day is missing a field.
    expected = len(columns[0])
    if any(len(column) < expected for column in columns[1:]):
        print(" exception ")


def printGoodsList(ilt):
    """Write the rows of *ilt* to the first sheet of a new .xls workbook.

    Row ``i`` / column ``j`` of the sheet receives ``ilt[i][j]``.
    """
    workbook = xlwt.Workbook()
    sheet1 = workbook.add_sheet("sheet1")
    for row_index, row in enumerate(ilt):
        for col_index, value in enumerate(row):
            sheet1.write(row_index, col_index, value)
    # NOTE(review): the file name literally contains a leading and a trailing
    # space; kept unchanged so the output path stays the same - confirm intent.
    workbook.save(" Xining.xls ")


def _month_url(start_url, year, month):
    """Return the data URL for one month of one year."""
    if year == 2016 and month < 3:
        # January and February 2016 use a different format: no month
        # directory and no zero padding.
        # NOTE(review): start_url already ends with "/", so this yields a
        # double slash; kept exactly as in the original - verify against the
        # site.
        return start_url + "/52866_" + str(year) + str(month) + ".js"
    stamp = "{}{:02d}".format(year, month)
    return "{0}{1}/52866_{1}.js".format(start_url, stamp)


def main():
    """Download 2016 through April 2018 month by month and save the table."""
    start_url = "http://tianqi.2345.com/t/wea_history/js/"
    list0 = [[" date ", " maximum temperature ", " minimum temperature ",
              " weather ", " wind direction ", " wind force ",
              " air quality index ", " air quality evaluation ",
              " air quality level "]]

    for year in range(2016, 2019):  # Left closed and right open
        # Only January-April are requested for 2018.
        last_month = 4 if year == 2018 else 12
        try:
            for month in range(1, last_month + 1):
                html = getHTMLText(_month_url(start_url, year, month))
                # parsePage appends to list0 in place.
                parsePage(list0, html)
        except Exception as e:
            # Best effort: report the problem and move on to the next year.
            print(str(e))

    printGoodsList(list0)


if __name__ == "__main__":
    main()
# Suggestions and feedback are welcome.