Python Web Scraping: User Agents and Proxy IPs

1. Usage with urllib2:
# Example 1:
# Exception handling, with a limit on the number of retries
# A time-based delay between retries could also be added (see the sketch below)
from __future__ import print_function
import urllib2

def download(url, num_retries=2):
    print("Downloading:", url)
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, "code") and 500 <= e.code < 600:
                # Retry when the server responds with a 5XX status code
                return download(url, num_retries - 1)
    return html
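
The comment above mentions adding a time interval; here is a minimal sketch of that idea (the delay parameter and the time.sleep call are additions for illustration, not part of the original post):

# Sketch: the same retry logic, with a delay before each retry (illustrative).
from __future__ import print_function
import time
import urllib2

def download_with_delay(url, num_retries=2, delay=1):
    print("Downloading:", url)
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0 and hasattr(e, "code") and 500 <= e.code < 600:
            time.sleep(delay)  # wait before retrying a 5XX response
            return download_with_delay(url, num_retries - 1, delay)
    return html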

# Example 2: builds on Example 1 by adding a user agent
from __future__ import print_function
import urllib2

def download(url, user_agent="wswp", num_retries=2):
    print("Downloading:", url)
    headers = {"User-Agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, "code") and 500 <= e.code < 600:
                # Retry when the server responds with a 5XX status code
                return download(url, user_agent, num_retries - 1)
    return html
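
Some sites block the default Python-urllib user agent, which is why the function accepts a custom string ("wswp" by default). A quick usage sketch that rotates among browser-like strings (the strings themselves and the random choice are illustrative additions):

# Usage sketch: pick a random user agent per request (illustrative).
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6)",
]

html = download("http://example.com", user_agent=random.choice(USER_AGENTS))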

# Example 3: proxy support
from __future__ import print_function
import urllib2
import urlparse

def download(url, user_agent="wswp", proxy=None, num_retries=2):
    print("Downloading:", url)
    headers = {"User-Agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        # Map the URL's scheme (http/https) to the proxy address
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        # Open through the opener so the proxy handler is actually used
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, "code") and 500 <= e.code < 600:
                # Retry when the server responds with a 5XX status code
                return download(url, user_agent, proxy, num_retries - 1)
    return html
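
Example usage (the proxy address is a placeholder):

# Route the request through a local HTTP proxy (address is illustrative).
html = download("http://example.com", user_agent="wswp", proxy="http://127.0.0.1:8080")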

2. Usage with requests:

import requests

# The user-agent string and proxy addresses below are examples
headers = {"User-Agent": "wswp"}
proxies = {
    "http": "http://127.0.0.1:9999",
    "https": "http://127.0.0.1:8888",
}
response = requests.get("https://www.baidu.com", headers=headers, proxies=proxies)
print(response.text)
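
requests can also reproduce the retry-on-5XX behavior from section 1 by mounting urllib3's Retry on a Session. A minimal sketch (the retry counts, status list, and backoff value are illustrative choices):

# Sketch: retry 5XX responses in requests via urllib3's Retry (illustrative values).
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(total=2, status_forcelist=[500, 502, 503, 504], backoff_factor=1)
session.mount("http://", HTTPAdapter(max_retries=retry))
session.mount("https://", HTTPAdapter(max_retries=retry))
response = session.get("https://www.baidu.com", headers={"User-Agent": "wswp"})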

For more on using requests, see: https://www.cnblogs.com/zhaof/p/6915127.html

Reprinted from blog.csdn.net/weixin_41601173/article/details/80019778