The code targets Python 3.6. It scrapes xici's (西刺) free proxy list, stores the entries in MongoDB, then checks each proxy and removes the ones that fail, leaving a pool ready for later crawlers.
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import time
import random
import pymongo
from multiprocessing.dummy import Pool  # thread pool; proxy checks are I/O-bound and one MongoClient can be shared across threads
# ----------------------------------------------------------------------------------------------------------------------
# return a headers dict with a randomly chosen User-Agent
def getheaders():
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    UserAgent = random.choice(user_agent_list)
    headers = {'User-Agent': UserAgent}
    return headers


class Getproxy(object):
    def __init__(self):
        self.headers = getheaders()  # randomized request headers
        # e.g. {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
        self.url = 'http://www.xicidaili.com/wt/'
        # {'1': 'http://www.xicidaili.com/nt/',  # xicidaili domestic regular proxies
        #  '2': 'http://www.xicidaili.com/nn/',  # xicidaili domestic high-anonymity proxies
        #  '3': 'http://www.xicidaili.com/wn/',  # xicidaili domestic HTTPS proxies
        #  '4': 'http://www.xicidaili.com/wt/'}  # xicidaili foreign HTTP proxies
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['xici']
        self.xiciipinfo = self.xici['xiciipinfo']
        # self.removeip = '127.0.0.1'  # would be checked on the first run, since below it is only assigned after a failed check

    def getip(self, num):
        # crawl one page of xici proxies and upsert them into the database
        url = self.url + str(num)
        wb_data = requests.get(url, headers=self.headers)
        html = etree.HTML(wb_data.text)
        # note: only table rows with class="odd" are matched; rows without that class are skipped
        ips = html.xpath('//tr[@class="odd"]/td[2]/text()')
        ports = html.xpath('//tr[@class="odd"]/td[3]/text()')
        protocols = html.xpath('//tr[@class="odd"]/td[6]/text()')
        areas = html.xpath('//tr[@class="odd"]/td[4]/a/text()')
        for ip, port, protocol, area in zip(ips, ports, protocols, areas):
            data = {
                'ip': ip,
                'port': port,
                'protocol': protocol,
                'area': area,
            }
            print(data)
            # self.xiciipinfo.insert_one(data)
            # if self.removeip != ip:  # optional check: skip IPs that already failed the test below, to save re-checking time
            self.xiciipinfo.update_one({'ip': ip}, {'$set': data}, upsert=True)

    def count(self, num):
        # crawl pages 1 .. num-1, pausing between requests
        for i in range(1, num):
            self.getip(i)
            time.sleep(2)

    def dbclose(self):
        self.client.close()

    def getiplist(self):
        # read the stored proxies and build a list of requests-style proxy dicts
        ips = self.xiciipinfo.find()
        proxylist = []
        for i in ips:
            # every entry is formatted as an http proxy; the stored protocol field is not used here
            b = "http" + "://" + i['ip'] + ":" + i['port']
            proxies = {"http": b}
            # print(proxies)
            proxylist.append(proxies)
        # print(proxylist)
        return proxylist

    def iptest(self, proxy):
        # test the proxy and delete it from the database if it is unusable
        ip = proxy['http'][7:].split(':')[0]  # strip the leading "http://" and the port
        try:
            requests.get('http://wenshu.court.gov.cn/', proxies=proxy, timeout=6)
        except requests.RequestException:
            print('failed...............>>>>>>>>>>>>>>>>>>>>>>>>')
            # self.removeip = ip  # store on the instance (pairs with the commented-out check in getip)
            self.xiciipinfo.delete_one({'ip': ip})  # delete the matching record
            print('remove it now.....{}'.format(ip))
        else:
            print('<<<<<<<<<<<<<<<<<.............success')
            print(proxy)


if __name__ == '__main__':
    pool = Pool()  # thread pool from multiprocessing.dummy
    proxy = Getproxy()
    proxy.count(2)
    iplist = proxy.getiplist()
    pool.map(proxy.iptest, iplist)  # built-in map() is lazy in Python 3, so run the checks through the pool
    pool.close()
    pool.join()
    proxy.dbclose()
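
For the crawlers this pool is meant to feed, one possible pattern is to pull a random entry back out of the same collection and hand it to requests via the proxies argument. The sketch below assumes the database and collection names used above; the target URL is only a placeholder.

# hypothetical consumer: pick one stored proxy at random and use it for a request
import random
import requests
import pymongo

client = pymongo.MongoClient('localhost', 27017)
records = list(client['xici']['xiciipinfo'].find())
if records:
    record = random.choice(records)
    proxies = {'http': 'http://{}:{}'.format(record['ip'], record['port'])}
    resp = requests.get('http://example.com/', proxies=proxies, timeout=6)  # placeholder URL
    print(resp.status_code)
client.close()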