urllib exercise

# coding=utf-8
"""
Parse the proxy addresses on https://www.kuaidaili.com/free/
and test whether each one is usable.
"""
import re
import time
import urllib.error
import urllib.request


def downHtml(url, retry=3):
    """
    Request the URL and download the page source; if the request fails,
    retry up to three times.
    :param url: request URL
    :param retry: number of retries left
    :return: web page source code, or None if all retries fail
    """
    try:
        request = urllib.request.Request(url)
        # Fetch the page source
        html = urllib.request.urlopen(request).read().decode()
    except urllib.error.URLError as e:
        print('Request exception:', e.reason)
        if retry > 0:
            time.sleep(2)  # retry after two seconds
            # return the recursive call, otherwise the retried result is lost
            return downHtml(url, retry=retry - 1)
        return None
    else:
        return html


def getProxy(html):
    """
    Use a regular expression to match all proxies in the page source.
    :param html: web page source code
    :return: list of (ip, port) tuples
    """
    proxies = re.findall(
        r'<td data-title="IP">(.*?)</td>.*?<td data-title="PORT">(.*?)</td>',
        html, re.S)
    return proxies


def isAbleToUse(ips):
    """
    Send a test request through the proxy to determine whether it is usable.
    :param ips: a matched (ip, port) tuple
    :return: None
    """
    # Test website that echoes the requesting IP
    url = "http://httpbin.org/ip"
    # Build the proxy mapping
    proxy = {'http': '{}:{}'.format(ips[0], ips[1]),
             'https': '{}:{}'.format(ips[0], ips[1])}
    # Create the proxy handler
    proxies = urllib.request.ProxyHandler(proxy)
    # Create an opener that routes requests through the proxy
    opener = urllib.request.build_opener(proxies, urllib.request.HTTPHandler)
    # Optional here, since opener.open() is called directly below
    urllib.request.install_opener(opener)
    try:
        # Use a timeout so a dead proxy does not block indefinitely
        data = opener.open(url, timeout=5).read().decode()
        print(data)
    except Exception as e:
        print(e)
    else:
        print('{}:{}'.format(ips[0], ips[1]), 'is usable!')


if __name__ == '__main__':
    url = "https://www.kuaidaili.com/free/"
    # Get the page source
    html = downHtml(url)
    if html:
        # Parse the proxies from the source
        proxies = getProxy(html)
        # Test whether each proxy is usable
        for proxy in proxies:
            isAbleToUse(proxy)
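To see what getProxy extracts, here is a quick check against a hand-written HTML fragment. The markup and the IP/port values below are illustrative assumptions that merely mimic the <td data-title="..."> pattern the regex expects; they are not data taken from the site:

# A minimal sketch: made-up table rows in the shape getProxy() parses.
sample_html = '''
<tr>
  <td data-title="IP">1.2.3.4</td>
  <td data-title="PORT">8080</td>
</tr>
<tr>
  <td data-title="IP">5.6.7.8</td>
  <td data-title="PORT">3128</td>
</tr>
'''

print(getProxy(sample_html))
# Expected: [('1.2.3.4', '8080'), ('5.6.7.8', '3128')]

Note that if the site ever changes its markup, re.findall simply returns an empty list with no error, so printing the parsed result before testing the proxies is a cheap sanity check.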

 
