Crawling the historical weather data of Xining with Python

import re

import requests
import xlwt




def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Returns None when the request fails; the error is printed rather than
    raised so the caller's month-by-month scraping loop can keep going.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx into an exception so it is reported
        # The server's declared charset is unreliable; trust the content sniffer.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        print(str(e))
        return None


def parsePage(ilt, html):
    """Extract per-day weather records from the 2345.com JS payload.

    Appends one list per day to *ilt* (mutated in place):
    [date, max_temp, min_temp, weather, wind_direction, wind_level,
     aqi, aqi_info, aqi_level].
    """
    try:
        ymd = re.findall(r"ymd:'(.*?)',", html)
        high = re.findall(r"bWendu:'(.*?)℃',", html)
        low = re.findall(r"yWendu:'(.*?)℃',", html)
        tianqi = re.findall(r"tianqi:'(.*?)',", html)
        # NOTE: the original patterns for fengxiang/aqiLevel contained stray
        # spaces (e.g. " fengxiang:'(.*?)', ") and could never match; fixed.
        fengxiang = re.findall(r"fengxiang:'(.*?)',", html)
        fengli = re.findall(r",fengli:'(.*?)'", html)
        aqi = re.findall(r"aqi:'(.*?)',", html)
        aqiInfo = re.findall(r"aqiInfo:'(.*?)',", html)
        aqiLevel = re.findall(r",aqiLevel:'(.*?)'", html)
        # A date exists for every day, so ymd is the traversal benchmark.
        for i in range(len(ymd)):
            ilt.append([ymd[i], high[i], low[i], tianqi[i], fengxiang[i],
                        fengli[i], aqi[i], aqiInfo[i], aqiLevel[i]])
    except IndexError as e:
        # The other field lists can be shorter than ymd when a day lacks data.
        print("exception:", e)


def printGoodsList(ilt):
    """Write the collected rows to Xining.xls, one cell per field.

    *ilt* is a list of equal-purpose row lists (header row included by the
    caller); row/column order in the sheet mirrors the list order.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("sheet1")
    for row_idx, record in enumerate(ilt):
        for col_idx, value in enumerate(record):
            sheet.write(row_idx, col_idx, value)
    # The original saved to " Xining.xls " (padded spaces from the scrape);
    # the intended filename is clearly "Xining.xls".
    workbook.save("Xining.xls")



if __name__ == "__main__":
    start_url = "http://tianqi.2345.com/t/wea_history/js/"
    list0 = []
    list0.append([ " date " , " maximum temperature " , " minimum temperature " , " weather " , " wind direction " , " wind force " , " air quality index " , " air quality evaluation " , " air quality level " ])
     #According to observation, it is found that the format of some dates is a bit strange, so they are processed separately 
    for i in range(2016, 2019):   #Left closed and right open 
        try :
            if i == 2018:
                for j in range(1, 5):   # January-April 
                    url = start_url + str(i) + " 0 " + str(j) + " /52866_ " + str(i) + " 0 " + str(j ) + " .js " 
                    html = getHTMLText(url)
                    # print(html) #The URL is correct and can be accessed successfully parsePage 
                    (list0, html) #When   the function is called, the value of the list changes with the changes in the function 
            elif i == 2016 :
                 for j in range(1, 13):   # 1-12月
                    if j in range(3, 10):
                        url = start_url + str(i) + "0" + str(j) + "/52866_" + str(i) + "0" + str(j) + ".js"
                    elif j in range(1, 3):
                        url = start_url + "/52866_" + str(i) + str(j) + ".js"
                    else:
                        url = start_url + str(i) + str(j) + "/52866_" + str(i) + str(j) + ".js"
                    html = getHTMLText(url)
                    #print(html)
                    parsePage(list0, html)
            else:
                for j in range(1, 13):
                    if j in range(1, 10):
                        url = start_url + str(i) + "0" + str(j) + "/52866_" + str(i) + "0" + str(j) + ".js"
                    else:
                        url = start_url + str(i) + str(j) + "/52866_" + str(i) + str(j) + ".js"
                    html = getHTMLText(url)
                   # print(html)
                    parsePage(list0, html)
        except:
            continue
    #print(len(list0))
    printGoodsList(list0)

Feedback and suggestions are welcome.

Related articles

Origin http://43.154.161.224:23101/article/api/json?id=324687780&siteId=291194637