构建ip代理池

最近需要一些知网资料，但在爬取知网时，一旦访问次数过多就会被踢掉，所以去找了一下ip代理的相关资料。资料有很多，我借鉴了一些思路，加到了自己的代码中(不是照搬)，如有侵权，请联系删除。

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import socket
import urllib2
import urllib

# Browser-like User-Agent string sent with every request to the proxy listing.
User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
# Request headers shared by the scraping code below.
header = {'User-Agent': User_Agent}

def getIP(totalPage):
    """Scrape candidate proxies from the xicidaili listing pages.

    Pages 1 through ``totalPage`` (inclusive) are fetched. Every table row
    after the first row on a page (the first row is skipped, as in the
    original indexing) contributes one entry built from its second and
    third ``<td>`` cells.

    Args:
        totalPage: Number of listing pages to fetch. Values below 1 fetch
            nothing.

    Returns:
        A list of ``"<second cell>\\t<third cell>"`` strings, which the
        rest of this script treats as ``"<ip>\\t<port>"``. Pages that fail
        to download or parse, and rows lacking the expected cells, are
        skipped silently (best-effort scraping).
    """
    url = 'http://www.xicidaili.com/nn/'
    ipList = []
    # +1 so that the last requested page is included.
    for page in range(1, totalPage + 1):
        strUrl = url + str(page)
        try:
            req = urllib2.Request(strUrl, headers=header)
            res = urllib2.urlopen(req).read()
            soup = BeautifulSoup(res)
        except Exception:
            # Best effort: a page that cannot be fetched or parsed is skipped.
            # ``Exception`` (not a bare except) lets Ctrl-C still abort the run.
            continue

        rows = soup.find_all('tr')
        for row in rows[1:]:
            # Author's note: both find_all and findAll worked in testing;
            # reportedly this can differ between BeautifulSoup versions.
            cells = row.find_all("td")
            try:
                # Note the attribute is ``contents`` (a list), not ``content``.
                ip_temp = cells[1].contents[0] + "\t" + cells[2].contents[0]
            except (IndexError, TypeError):
                # Row lacks the expected cells or text; skip only this row
                # instead of dropping the rest of the page.
                continue
            ipList.append(ip_temp)
    return ipList

def getUsableIP(list):
    """Probe each candidate proxy against CNKI and keep the ones that respond.

    Side effects: sets the process-wide default socket timeout to 3 seconds
    and prints one line per working proxy.

    Args:
        list: Iterable of ``"<ip>\\t<port>"`` strings (the format produced
            by ``getIP``). The parameter name shadows the builtin but is
            kept so existing callers keep working.

    Returns:
        A list of proxy dicts, one per working proxy, each mapping both
        ``'http'`` and ``'https'`` to ``"http://<ip>:<port>"``. Entries that
        are malformed or whose probe fails are skipped silently.
    """
    url = 'http://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCDB'
    IpList = []
    socket.setdefaulttimeout(3)
    for entry in list:
        try:
            fields = entry.strip().split('\t')
            proxyIp = 'http://' + fields[0] + ':' + fields[1]
            # urllib selects the proxy by the scheme of the requested URL.
            # The probe URL is http://, so an 'http' key is required for the
            # request to go through the proxy at all; 'https' is kept so the
            # returned dicts still carry the key earlier callers relied on.
            proxy = {'http': proxyIp, 'https': proxyIp}
            # The ``proxies=`` argument is only supported by urllib; with
            # urllib2 one would build an opener with a ProxyHandler instead.
            response = urllib.urlopen(url, proxies=proxy)
            try:
                response.read()
            finally:
                response.close()
            # proxyIp is unicode; it is encoded to UTF-8 first so it can be
            # concatenated with the byte-string literal and printed.
            message = '可用的代理地址:' + proxyIp.encode('utf-8')
            print(message)
            IpList.append(proxy)
        except Exception:
            # Best effort: malformed entries and dead proxies are skipped.
            continue
    return IpList

# Script driver: collect candidate proxies with getIP(3), then probe them.
# getUsableIP prints each working proxy; its return value is not used here.
# The result is not bound to the name ``list`` so the builtin stays usable
# in the rest of the module.
candidateProxies = getIP(3)
getUsableIP(candidateProxies)

#综合起来，后期可以在规模较大的爬虫中构造一个代理ip池，不断地给主程序提供可用的代理ip

猜你喜欢