python爬虫-爬取代理IP并通过多线程快速验证(这个验证没跑通)
scrapy爬虫代理——利用crawlera神器,无需再寻找代理IP
第一个用了BeautifulSoup,第二个用了PyQuery
优代理网站的样式:
自己代码:
#coding=UTF-8
"""Scrape free HTTP/HTTPS proxy addresses from public proxy-list sites.

Results are appended to ``proxy.txt`` as one ``protocol=ip:port`` line each.
Two sources are supported: xicidaili.com (parsed with BeautifulSoup) and
youdaili.net (parsed with PyQuery).
"""
import urllib.request
import chardet
from bs4 import BeautifulSoup
from pyquery import PyQuery as pyq

# Output file for harvested proxies; closed in the __main__ block below.
of = open('proxy.txt', 'w')


class ProxyIp:
    """Collector of HTTP/HTTPS proxies from public proxy-list pages."""

    def xiciProxy(self):
        """Scrape pages 1-9 of xicidaili.com's high-anonymity ('nn') list.

        Writes each HTTP/HTTPS entry to the module-level file ``of`` and
        echoes it to stdout. Pages that fail to download or that do not
        contain the expected ``ip_list`` table are skipped.
        """
        for page in range(1, 10):
            url = 'http://www.xicidaili.com/nn/%s' % page
            print(url)
            #user_agent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                #'Accept-Encoding':'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJTE3YTI3ZjgyYzE4NGVhMjhmZTVjMjRiOTVhMmE2YWFhBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMWxCYURmRS9DRHpmZ3hNeTFUVFVCcURCeFhSeXQyWG5qbTFsblFIM0Y2R2M9BjsARg%3D%3D--088e944b5bfb2e7d5c2547822a205693aeb68b0c; Hm_lvt_0cf76c77469e965d2957f0553e6ecf59=1491037294,1491448130; Hm_lpvt_0cf76c77469e965d2957f0553e6ecf59=1491449617',
                'Host': 'www.xicidaili.com',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'
            }
            request = urllib.request.Request(url, None, headers)
            try:
                response = urllib.request.urlopen(request)
                content = str(response.read(), encoding='UTF-8')
            except urllib.request.URLError as e:
                # Only HTTPError (a URLError subclass) carries .code; the
                # original accessed e.code unconditionally and then fell
                # through to use an undefined 'content'.
                print(e.reason, getattr(e, 'code', None))
                continue
            # Explicit parser avoids BeautifulSoup's "no parser specified"
            # warning and parser-dependent output.
            soup = BeautifulSoup(content, 'html.parser')
            table = soup.find('table', {"id": "ip_list"})
            if table is None:
                continue  # error/captcha page without the proxy table
            for tr in table.findAll('tr')[1:]:  # [0] is the header row
                tds = tr.findAll('td')
                ip = tds[1].text.strip()
                port = tds[2].text.strip()
                protocol = tds[5].text.strip()
                if protocol in ('HTTP', 'HTTPS'):
                    of.write('%s=%s:%s\n' % (protocol.lower(), ip, port))
                    print('%s://%s:%s' % (protocol.lower(), ip, port))

    def youDailiProxy(self):
        """Scrape a youdaili.net proxy-list article, parsed with PyQuery.

        Each <p> in the article body is expected to look like
        ``ip:port@PROTOCOL#location``; lines not matching that shape are
        skipped. Matching HTTP/HTTPS entries are written to ``of``.
        """
        for page in range(1, 2):
            if page == 1:
                url = 'http://www.youdaili.net/Daili/guonei/36718.html'
            else:
                url = 'http://www.youdaili.net/Daili/guonei/36718_%s.html' % page
            print(url)
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                #'Accept-Encoding':'gzip, deflate, sdch'
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Cookie': 'Hm_lvt_f8bdd88d72441a9ad0f8c82db3113a84=1491557308; Hm_lpvt_f8bdd88d72441a9ad0f8c82db3113a84=1491557384',
                'Host': 'www.youdaili.net',
                'If-Modified-Since': 'Sun, 02 Apr 2017 05:58:18 GMT',
                'If-None-Match': 'W/"58e092fa-9770"',
                'Referer': 'http://www.youdaili.net/Daili/guonei/36718_2.html',
                # Header values must be str; the original passed int 1,
                # which http.client rejects with a TypeError.
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'
            }
            request = urllib.request.Request(url, None, headers)
            try:
                response = urllib.request.urlopen(request)
                content = response.read()
            except urllib.request.URLError as e:
                print(e.reason)
                continue  # original fell through and decoded undefined 'content'
            content = bytes.decode(content)
            jq = pyq(content)
            p = jq(".content").find("p")
            for tmp in p.items():
                arr = tmp.text().split("#")
                arr2 = arr[0].split("@")
                if len(arr2) != 2:
                    continue  # not an "ip:port@PROTOCOL" line
                protocol = arr2[1]
                ipAndPort = arr2[0]
                print(protocol, ipAndPort)
                if protocol == 'HTTP' or protocol == 'HTTPS':
                    of.write('%s=%s\n' % (protocol.lower(), ipAndPort))
                    print('%s://%s' % (protocol.lower(), ipAndPort))


if __name__ == '__main__':
    # Guarded so importing this module no longer triggers network scraping.
    proxy = ProxyIp()
    #proxy.xiciProxy()
    proxy.youDailiProxy()
    of.close()  # flush collected proxies to disk
。。。