(Reposted) Introduction to Python web crawlers

Original article: http://blog.csdn.net/bo_wen_/article/details/50868339

 

Step 1: Install 2 packages

            requests and beautifulsoup4 (e.g. pip install requests beautifulsoup4)

 

Step 2: Copy the following code and run it

    

import requests
import csv
import random
import time
import socket
import http.client
# import urllib.request
from bs4 import BeautifulSoup

def get_content(url, data=None):
    """Download *url* and return the response body as text.

    The request is retried without limit: when one of the exceptions handled
    below is raised, a numbered diagnostic is printed, the function sleeps
    for a randomly chosen number of seconds and then tries again.

    Args:
        url: Address of the page to fetch.
        data: Unused by this implementation; kept so that existing callers
            passing it continue to work.

    Returns:
        ``rep.text`` of the successful response, with the response encoding
        forced to UTF-8.
    """
    # Browser-like request headers sent with every attempt.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235',
    }
    # The timeout is drawn once per call and reused for every retry.
    timeout = random.choice(range(80, 180))
    while True:
        try:
            rep = requests.get(url, headers=header, timeout=timeout)
            # Force UTF-8 so that rep.text is decoded consistently.
            rep.encoding = 'utf-8'
            break
        # NOTE(review): requests raises its own exception classes; confirm
        # that the handlers below cover the failures seen in practice.
        except socket.timeout as e:
            print('3:', e)
            time.sleep(random.choice(range(8, 15)))

        except socket.error as e:
            print('4:', e)
            time.sleep(random.choice(range(20, 60)))

        except http.client.BadStatusLine as e:
            print('5:', e)
            time.sleep(random.choice(range(30, 80)))

        except http.client.IncompleteRead as e:
            print('6:', e)
            time.sleep(random.choice(range(5, 15)))

    return rep.text
def get_data(html_text):
    """Extract the forecast rows from the HTML of a weather page.

    The function locates the ``<div id="7d">`` element inside the document
    body and reads every ``<li>`` of the first ``<ul>`` found in it.

    Args:
        html_text: HTML source of the page.

    Returns:
        A list containing one ``[date, weather, highest, lowest]`` list per
        ``<li>``. ``highest`` is ``None`` when the entry has no ``<span>``
        in its second ``<p>``; the ``℃`` sign is stripped from both
        temperatures.
    """
    final = []
    bs = BeautifulSoup(html_text, "html.parser")  # parse the document
    body = bs.body  # the <body> element
    data = body.find('div', {'id': '7d'})  # the div whose id is "7d"
    ul = data.find('ul')  # first <ul> inside that div
    li = ul.find_all('li')  # one <li> per forecast entry

    for day in li:
        temp = []
        date = day.find('h1').string  # the <h1> holds the date
        temp.append(date)

        inf = day.find_all('p')  # all <p> tags of this entry
        temp.append(inf[0].string)  # first <p>: weather conditions

        # The highest temperature may be absent (per the original author,
        # this happens in the evening), so only clean it up when present.
        highest_tag = inf[1].find('span')
        if highest_tag is None:
            temperature_highest = None
        else:
            temperature_highest = highest_tag.string
            temperature_highest = temperature_highest.replace('℃', '')

        # The lowest temperature is followed by a ℃ sign; remove it.
        temperature_lowest = inf[1].find('i').string
        temperature_lowest = temperature_lowest.replace('℃', '')

        temp.append(temperature_highest)
        temp.append(temperature_lowest)
        final.append(temp)

    return final

def write_data(data, name):
    """Append the rows in *data* to the CSV file called *name*.

    The file is opened in append mode, so repeated calls add rows after the
    existing content. Characters that cannot be encoded are dropped
    (``errors='ignore'``).

    Args:
        data: Iterable of rows, each row an iterable of cell values.
        name: Path of the CSV file to append to.
    """
    # newline='' lets the csv module control line endings itself.
    with open(name, mode='a', newline='', errors='ignore') as handle:
        csv.writer(handle).writerows(data)

if __name__ == '__main__':
    # Download the forecast page, parse it and append the rows to weather.csv.
    url = 'http://www.weather.com.cn/weather/101190401.shtml'
    html = get_content(url)
    result = get_data(html)
    write_data(result, 'weather.csv')

 

Step 3: The result is as follows:

23rd (today) partly cloudy 19 12
24th (tomorrow) partly cloudy 20 12
25th (the day after tomorrow) partly cloudy 21 14
26th (Thursday) partly cloudy 21 14
27th (Friday) partly cloudy 22 14
28th (Saturday) partly cloudy 21 15
29th (Sunday) Cloudy to clear 21 11

 

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=326170779&siteId=291194637