从网站爬取代理ip,这个用平常的爬取网站代码即可。
测试代理ip,最简便的方法有两种,一种是用requests
# Check a proxy by issuing a real HTTP request through it.
import requests

try:
    # timeout keeps a dead proxy from hanging the request forever
    requests.get('http://wenshu.court.gov.cn/',
                 proxies={"http": "http://121.31.154.12:8123"},
                 timeout=10)
except requests.RequestException:  # narrow: only network/HTTP failures
    print('connect failed')
else:
    print('success')
一种是telnet,这里注意的是telnetlib这个库,这个库是python内置的,不用下载
# Check a proxy by opening a raw TCP connection (telnetlib is stdlib).
import telnetlib

try:
    telnetlib.Telnet('127.0.0.1', port='80', timeout=20)
except OSError:  # narrow: connection refused / timed out / unreachable
    print('connect failed')
else:
    print('success')
原文参考:这里
详尽的一个爬取代码
import requests
import bs4
import re
import time
import xlwt
import telnetlib
from bs4 import BeautifulSoup
from urllib import request
from urllib.request import urlopen
from my_fake_useragent import UserAgent
def getHtmlData(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Raises requests.HTTPError on a 4xx/5xx response so callers get a
    clear failure instead of silently parsing an error page, and
    requests.RequestException on network problems / timeout.
    """
    r = requests.get(url, timeout=30)
    r.raise_for_status()  # fail loudly on HTTP errors instead of returning the error page
    # force UTF-8: the target pages are UTF-8 but may not declare it in headers
    r.encoding = "utf-8"
    return r.text
def parsePage(ilt, html):
    """Extract proxy IPs from *html* and append them to *ilt* in place.

    Expects the page to contain a table whose <tr> rows each hold the
    IP address in their first child cell; the first row is assumed to
    be the header and is skipped.  Returns "404" when parsing fails
    (best-effort: callers currently ignore the return value).
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        rows = soup.find_all('tr')
        # rows[0] is the table header; data rows follow
        for row in rows[1:]:
            ip = row.contents[0].string
            ilt.append(ip)
    except Exception:  # was a bare except; keep best-effort behavior but
        return "404"   # stop swallowing KeyboardInterrupt/SystemExit
def testIp(SipList,iplist):
testurl="http://txt.go.sohu.com/ip/soip"
print(len(iplist))
for i in range(len(iplist)):
everyIp=iplist[i]
print(everyIp)
try:
telnetlib.Telnet(everyIp,port='80',timeout=20)
print('success')
SipList.append(everyIp)
except:
print('连接失败')
def main():
    """Scrape one page of free proxies, probe each, and print the first
    working one (or a notice when none work)."""
    start_url = 'https://proxy.horocn.com/free-proxy.html?page=BM'
    ipList = []
    SipList = []
    html = getHtmlData(start_url)
    parsePage(ipList, html)
    testIp(SipList, ipList)
    # guard: original did SipList[0] and raised IndexError when empty
    if SipList:
        print(SipList[0])
    else:
        print('no working proxy found')


if __name__ == '__main__':  # don't run the crawl when imported as a module
    main()