西刺代理用多进程爬取

运用多进程检测西刺代理网站上的免费代理，筛选出其中可用的代理

import requests
from lxml import etree
def daili(queue, pages=5):
    """Scrape proxy addresses from xicidaili.com and put them on ``queue``.

    Listing pages ``1..pages`` of ``http://www.xicidaili.com/nn/`` are
    fetched one after another. For every table row, the text of the 2nd and
    3rd cells is combined into ``'http://<cell2>:<cell3>'`` and put on the
    queue as a string.

    Args:
        queue: Any object with a ``put(item)`` method (the script passes a
            ``multiprocessing.Queue``).
        pages: Number of listing pages to fetch, starting at page 1.
            Defaults to 5, matching the original "crawl 5 pages" intent
            (the previous ``range(1, 5)`` only fetched 4).

    Returns:
        None. Results are delivered through ``queue`` only.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    for page in range(1, pages + 1):
        url = 'http://www.xicidaili.com/nn/%d' % page
        try:
            # Bound the wait: without a timeout a stalled server would hang
            # this producer (and anything waiting on its queue) forever.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            # One unreachable page should not abort the remaining pages.
            continue

        page_text = response.text
        if not page_text.strip():
            # Nothing to parse on an empty body.
            continue

        tree = etree.HTML(page_text)
        hosts = tree.xpath('//table/tr/td[2]/text()')
        ports = tree.xpath('//table/tr/td[3]/text()')
        # zip() stops at the shorter list, so a row with a missing cell can
        # no longer raise IndexError the way index-based pairing could.
        for host, port in zip(hosts, ports):
            queue.put('http://' + host + ':' + port)
def check_proxy(proxy_one):
    """Check whether one HTTP proxy can fetch a Baidu search page.

    A GET request for ``http://www.baidu.com/s?wd=ip`` is sent through
    ``proxy_one`` with a 3-second timeout and the outcome is printed:
    one message for a 200 response, another for any other status code, and
    a third when the request raises.

    Args:
        proxy_one: Proxy URL string, e.g. ``'http://1.2.3.4:8080'``. It is
            used for the ``'http'`` scheme only.

    Returns:
        None. The result is reported on stdout.
    """
    url = 'http://www.baidu.com/s?wd=ip'
    proxies = {
        'http': proxy_one
    }
    try:
        response = requests.get(url, proxies=proxies, timeout=3)
    except Exception:
        # Scraped proxy strings are untrusted and can fail in many ways
        # (refused connection, timeout, malformed address), so the check is
        # deliberately best-effort. Unlike a bare ``except:``, this still
        # lets KeyboardInterrupt/SystemExit stop a worker process.
        print('代理不可用')
        return

    if response.status_code == 200:
        print('代理%s是可以使用的' % proxy_one)
    else:
        print('代理是使用超时了')
# Script entry point: one producer process scrapes proxies into a queue while
# a small worker pool validates them concurrently.
if __name__ == '__main__':
    import multiprocessing
    from queue import Empty  # raised by multiprocessing.Queue.get on timeout

    MAX_PROXIES = 500   # upper bound on how many proxies get checked
    WORKER_COUNT = 3    # size of the validation pool

    queue = multiprocessing.Queue()
    p = multiprocessing.Process(target=daili, args=(queue,))
    p.start()
    pool = multiprocessing.Pool(WORKER_COUNT)

    submitted = 0
    while submitted < MAX_PROXIES:
        try:
            # Wait with a timeout instead of blocking indefinitely: if the
            # crawler yields fewer than MAX_PROXIES items, an untimed get()
            # would hang the script forever.
            var = queue.get(timeout=1)
        except Empty:
            if p.is_alive():
                # Producer is still crawling; keep waiting for more items.
                continue
            # Producer has exited. Pick up anything it put on the queue
            # between our timeout and the liveness check, then stop.
            try:
                var = queue.get_nowait()
            except Empty:
                break
        pool.apply_async(check_proxy, (var,))
        submitted += 1

    # No more work will be submitted; wait for pending checks, then reap
    # the producer process.
    pool.close()
    pool.join()
    p.join()

猜你喜欢

转载自blog.csdn.net/chengjintao1121/article/details/82024589
今日推荐