建立IP代理池存入MongoDB数据库

代码用的Python3.6,抓取xici免费代理,检测放入数据库中,为以后爬虫做准备。

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import time
import random
import pymongo
from multiprocessing import Pool  # 多线程


# ----------------------------------------------------------------------------------------------------------------------
# 返回一个随机的请求头 headers
def getheaders():
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    UserAgent = random.choice(user_agent_list)
    headers = {'User-Agent': UserAgent}
    return headers


class Getproxy(object):
    def __init__(self):
        self.headers = getheaders()  # 定制请求头
        # {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
        self.url = 'http://www.xicidaili.com/wt/'
        # {'1': 'http://www.xicidaili.com/nt/',  # xicidaili国内普通代理
        #  '2': 'http://www.xicidaili.com/nn/',  # xicidaili国内高匿代理
        #  '3': 'http://www.xicidaili.com/wn/',  # xicidaili国内https代理
        #  '4': 'http://www.xicidaili.com/wt/'}  # xicidaili国外http代理
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['xici']
        self.xiciipinfo = self.xici['xiciipinfo']
        # self.removeip = '127.0.0.1' #第一次运行会检测该变量,因为下面只有检测失败了才会赋值

    def getip(self, num):
        # 爬西祠所有代理,更新放入数据库
        url = self.url + str(num)
        wb_data = requests.get(url, headers=self.headers)
        html = etree.HTML(wb_data.text)
        ips = html.xpath('//tr[@class="odd"]/td[2]/text()')
        ports = html.xpath('//tr[@class="odd"]/td[3]/text()')
        protocols = html.xpath('//tr[@class="odd"]/td[6]/text()')
        areas = html.xpath('//tr[@class="odd"]/td[4]/a/text()')
        for ip, port, protocol, area in zip(ips, ports, protocols, areas):
            data = {
                'ip': ip,
                'port': port,
                'protocol': protocol,
                'area': area,
            }
            print(data)
            # self.xiciipinfo.insert_one(data)
            # if self.removeip != ip: #此处加一个判断,如果是下面检测过的不可用的ip,就不更新进入数据库,可以节省下面的检测时间
            self.xiciipinfo.update({'ip': ip}, {'$set': data}, True)

    def count(self, num):
        for i in range(1, num):
            self.getip(i)
            time.sleep(2)

    def dbclose(self):
        self.client.close()

    def getiplist(self):
        # 将数据库内数据整理放入列表
        ips = self.xiciipinfo.find()
        proxylist = []
        for i in ips:
            b = "http" + "://" + i['ip'] + ":" + i['port']
            proxies = {"http": b}
            # print proxies
            proxylist.append(proxies)
        # print proxylist
        return proxylist

    def iptest(self, proxy):
        # 检测ip,并更新进入数据库,删掉不可用的ip
        ip = proxy['http'][7:].split(':')[0]
        try:
            requests.get('http://wenshu.court.gov.cn/', proxies=proxy, timeout=6)
        except:
            print('field...............>>>>>>>>>>>>>>>>>>>>>>>>')
            # self.removeip = ip #赋值给类属性
            self.xiciipinfo.remove({'ip': ip})  # 用remove方法,将符合条件的删掉
            print('remove it now.....{}'.format(ip))
        else:
            print('<<<<<<<<<<<<<<<<<.............success')

            print(proxy)


if __name__ == '__main__':
    pool = Pool()
    proxy = Getproxy()
    proxy.count(2)
    iplist = proxy.getiplist()
    map(proxy.iptest, iplist)
    proxy.dbclose()

发布了67 篇原创文章 · 获赞 111 · 访问量 32万+

猜你喜欢

转载自blog.csdn.net/shengshengshiwo/article/details/90649343