1. getIp method: scrapes free proxy IPs from http://www.xicidaili.com/nn/ and stores them in a CSV file.
2. getProxy method: reads the IPs and ports back out of the file, assembles each pair into a proxy dict, and collects them in a list.
3. test method: checks whether a proxy is usable by requesting Baidu through it with a 2-second timeout.
4. Main block: times the checks run one by one ("singleprocess needs ...") and again with a multiprocessing pool ("multiprocess needs ..."). Since this machine has 4 cores, the pooled run finishes in roughly a quarter of the sequential time. A short sketch of how the resulting pool of working proxies might be used follows the full code below.
import requests
from bs4 import BeautifulSoup
import csv
import time
from multiprocessing import Pool


def getIp(numpage):
    # Scrape numpage pages of the free-proxy list and write "ip,port" rows to ips.csv.
    with open('ips.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        time.sleep(10)  # crude pause before hitting the site
        url = 'http://www.xicidaili.com/nn/'
        headers = {'User-agent': 'IP'}
        for i in range(1, numpage + 1):
            real_url = url + str(i)
            response = requests.get(real_url, headers=headers)
            bs = BeautifulSoup(response.text, 'lxml')
            trs = bs.find_all('tr')
            for items in trs:
                tds = items.find_all('td')
                try:
                    # Column 1 holds the IP, column 2 the port; header rows have
                    # no <td> cells and raise IndexError, so they are skipped.
                    writer.writerow([tds[1].text, tds[2].text])
                except IndexError:
                    pass


def getProxy():
    # Read ips.csv back and assemble proxy dicts in the format requests expects.
    Proxy = []
    with open('ips.csv', 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            try:
                proxy = {'http': 'http://' + row[0] + ':' + row[1]}
            except IndexError:
                continue
            Proxy.append(proxy)
    return Proxy


def test(proxy):
    # Return the proxy if Baidu answers through it within 2 seconds, otherwise None.
    try:
        response = requests.get('http://www.baidu.com', proxies=proxy, timeout=2)
        if response:
            return proxy
    except requests.RequestException:
        pass


if __name__ == '__main__':
    getIp(1)
    proxy = getProxy()

    # Test every proxy sequentially.
    IPPool1 = []
    time1 = time.time()
    for item in proxy:
        IPPool1.append(test(item))
    time2 = time.time()
    print('singleprocess needs ' + str(time2 - time1) + ' s')

    # Repeat the test with a process pool (one worker per CPU core by default).
    pool = Pool()
    IPPool2 = []
    temp = []
    time3 = time.time()
    for item in proxy:
        temp.append(pool.apply_async(test, args=(item,)))
    pool.close()
    pool.join()
    for item in temp:
        IPPool2.append(item.get())
    time4 = time.time()
    print('multiprocess needs ' + str(time4 - time3) + ' s')
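
Both IPPool1 and IPPool2 keep a None entry for every proxy that failed the check. A minimal sketch of how the result might be consumed, assuming it is appended at the end of the __main__ block above so that IPPool2 and proxy are still in scope (the filtering step and the target URL http://httpbin.org/ip are illustrative, not part of the original script):

# Hypothetical follow-up: keep only the proxies that passed the check.
working = [p for p in IPPool2 if p is not None]
print('%d of %d proxies are usable' % (len(working), len(proxy)))

# Route a request through the first usable proxy, if there is one.
if working:
    r = requests.get('http://httpbin.org/ip', proxies=working[0], timeout=5)
    print(r.text)

Since the check is I/O-bound (mostly waiting on network timeouts), a thread pool such as multiprocessing.dummy.Pool would likely perform similarly with the same apply_async calls; that is an alternative to, not part of, the code above.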
5. Results: