When writing a crawler, a common way to avoid getting blocked for hitting a site too often is to route requests through a proxy.
First, the basic proxy usage:
from urllib.request import Request, ProxyHandler, build_opener
from fake_useragent import UserAgent

url = "http://httpbin.org/get"
headers = {
    "User-Agent": UserAgent().chrome  # random Chrome User-Agent
}
request = Request(url, headers=headers)
# Paid proxies take credentials in the address:
# handler = ProxyHandler({"http": "username:password@ip:port"})
handler = ProxyHandler({"http": "119.101.113.58:9999"})  # a free proxy; swap in a live one
opener = build_opener(handler)
response = opener.open(request, timeout=5)  # time out instead of hanging on a dead proxy
print(response.read().decode())
Hmm, with free proxies you usually have to try a few before one works. The commented-out line is the paid-account form, which is far more reliable; for everyday use, a handful of free ones is enough.
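Since httpbin.org/get echoes back the requesting IP in its "origin" field, you can confirm a proxy is actually in effect (rather than silently bypassed) by checking that the origin is the proxy's address, not your own. A small sketch for cycling through free candidates; the addresses below are placeholders:

import json
from urllib.request import Request, ProxyHandler, build_opener
from fake_useragent import UserAgent

candidates = ["119.101.113.58:9999", "119.101.115.46:9999"]  # placeholder addresses

for addr in candidates:
    opener = build_opener(ProxyHandler({"http": addr}))
    req = Request("http://httpbin.org/get", headers={"User-Agent": UserAgent().chrome})
    try:
        body = opener.open(req, timeout=5).read().decode()
        print(addr, "->", json.loads(body)["origin"])  # the proxy's IP if it worked
        break  # stop at the first working proxy
    except Exception as e:  # free proxies fail in many ways; catch broadly
        print(addr, "failed:", e)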
I'm just getting started with crawlers, so this is nothing fancy, but some free proxies simply don't work, so I wrote a class to filter out the usable ones; the code is below. The verify() method originally had a puzzling bug: tested on its own it behaved correctly, but when called from here it judged every scraped proxy as usable and wrote them all into ip_ok.txt. The most likely culprit is that ProxyHandler only applies a proxy whose dict key matches the scheme of the request URL, so checking an "https"-type proxy against the http:// test URL silently skipped the proxy; the request went out directly and always succeeded.
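A quick demonstration of that pitfall (10.255.255.1 is a deliberately unroutable placeholder address):

from urllib.request import ProxyHandler, build_opener

# The handler below only applies to https:// URLs, so the http:// request
# never touches the (unreachable) proxy -- it goes out directly and succeeds.
handler = ProxyHandler({"https": "10.255.255.1:3128"})
opener = build_opener(handler)
response = opener.open("http://httpbin.org/get", timeout=5)
print(response.getcode())  # 200, even though the proxy is dead

With the test URL's scheme matched to the proxy type, plus a timeout so dead proxies fail fast instead of hanging, here is the fixed class: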
import time
from urllib.request import Request, ProxyHandler, build_opener

import requests
from fake_useragent import UserAgent
from lxml import etree


# IP proxy pool
class ProxyPool(object):

    @staticmethod
    def get_url(initial_url):
        # Build the listing-page URLs of the high-anonymity proxy site.
        url_list = []
        for i in range(1, 4):  # number of pages to scrape
            url_list.append(initial_url + str(i))
        return url_list

    @staticmethod
    def get_content(page_url):
        # Fetch one listing page with a random User-Agent.
        response = requests.get(page_url, headers={"User-Agent": UserAgent().random})
        if response.status_code == 200:
            time.sleep(2)  # throttle between pages to keep the request rate low
            return response.text
        print(response.status_code, page_url)
        return "error"

    @staticmethod
    def get_info(html):
        # Parse type, IP and port out of a listing page.
        if html == "error":
            print("not an html page")
            return "error"
        e = etree.HTML(html)
        ips = e.xpath('//tbody/tr/td[1]/text()')
        ports = e.xpath('//tbody/tr/td[2]/text()')
        types = e.xpath('//tbody/tr/td[4]/text()')
        # Proxies scraped here are of unknown quality; verify() filters them later.
        with open("ip_port.txt", "a") as f:
            for type_, ip, port in zip(types, ips, ports):
                f.write("%s,%s:%s\n" % (type_.lower(), ip, port))

    # Test whether a proxy is actually usable.
    @staticmethod
    def verify(type_, ip_port):
        # The test URL's scheme must match the proxy type: ProxyHandler only
        # routes requests whose scheme appears as a key in its dict, so testing
        # an "https" proxy against an http:// URL would bypass the proxy and
        # always succeed (the original bug).
        test_url = "%s://httpbin.org/get" % type_
        request = Request(test_url, headers={"User-Agent": UserAgent().random})
        proxy = {type_: ip_port}
        print("testing proxy", proxy)
        opener = build_opener(ProxyHandler(proxy))
        try:
            # The timeout matters: dead proxies otherwise hang indefinitely.
            response = opener.open(request, timeout=5)
            if response.read():
                print("proxy test ok ==>", proxy)
                # Save working proxies in the same "type,ip:port" format.
                with open("ip_ok.txt", "a") as f2:
                    f2.write("%s,%s\n" % (type_, ip_port))
            else:
                print("proxy test failure!")
        except Exception as e:  # free proxies fail in many ways (timeouts, resets, bad responses)
            print("proxy test failed:", e)


if __name__ == '__main__':
    url = "https://www.kuaidaili.com/free/inha/"
    pool = ProxyPool()
    for page_url in pool.get_url(url):
        pool.get_info(pool.get_content(page_url))
    with open("ip_port.txt", "r") as f3:
        for line in f3:
            # Each line looks like "http,119.101.113.58:9999".
            type_, ip_port = line.strip().split(",")
            pool.verify(type_, ip_port)
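Once ip_ok.txt has some entries, a verified proxy can be plugged straight into requests. A minimal sketch, assuming the file uses the "type,ip:port" format written above:

import requests

with open("ip_ok.txt") as f:
    type_, ip_port = f.readline().strip().split(",")

response = requests.get(
    "%s://httpbin.org/get" % type_,
    proxies={type_: "%s://%s" % (type_, ip_port)},  # requests also matches proxies by URL scheme
    timeout=5,
)
print(response.json()["origin"])  # should report the proxy's IP, not yours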