项目需要收集微信的服务器 IP。微信客户端本身也是通过 HTTP 请求获取 IP 地址的,因此参考网上资料写了个脚本:通过代理使用不同地区的 IP 发送同样的请求,然后将返回的 IP 地址存入数据库:
import urllib.request
import random
import zlib
from bs4 import BeautifulSoup
import sqlite3
import os
import requests
import time
import random
class KuaiDaili(object):
    """Walk the kuaidaili.com free-proxy listings and probe through each proxy.

    Every proxy row found on a listing page is handed to ``ScratchIp``,
    which performs the actual request through that proxy.
    """

    def get_html(self, page):
        """Download one listing page and return it as text.

        Odd page numbers are fetched from the ``/free/inha/`` listing and
        even page numbers from the ``/free/intr/`` listing.

        Args:
            page: Integer page number; it is interpolated into the URL.

        Returns:
            The response body decoded as UTF-8.
        """
        # Browser-like headers so the site treats the script as a browser.
        header = {
            "Host": "www.kuaidaili.com",
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }
        section = "inha" if page % 2 == 1 else "intr"
        target_url = "http://www.kuaidaili.com/free/%s/%s" % (section, page)
        html = requests.get(url=target_url, headers=header, timeout=30).content
        return html.decode("utf-8")

    def ip_list(self, html):
        """Parse a listing page and probe every proxy row found in it.

        Args:
            html: Page markup as returned by ``get_html``.

        Pages without a ``<div id="list">`` element and rows with fewer
        than four cells are skipped instead of raising.
        """
        soup = BeautifulSoup(html, 'lxml')
        container = soup.find('div', id='list')
        if container is None:
            # A page without the listing container would otherwise raise
            # AttributeError on None and abort the whole batch.
            LthLog("no proxy table found in page")
            return

        # The first <tr> is the table header, not data.
        for tr in container.find_all('tr')[1:]:
            list_td = tr.find_all('td')
            if len(list_td) < 4:
                continue  # malformed row: ip/port/anonymity/type are required
            # Cells are, in order: ip, port, anonymity level, protocol type.
            # Surrounding whitespace is stripped so it cannot end up inside
            # the proxy address.
            ip = list_td[0].get_text(strip=True)
            port = list_td[1].get_text(strip=True)
            anonymous = list_td[2].get_text(strip=True)
            LthLog(anonymous)
            types = list_td[3].get_text(strip=True)
            ScratchIp(ip, port, types)

    def batch_insert(self, page=200):
        """Fetch and process listing pages ``1`` through ``page - 1``.

        Args:
            page: Exclusive upper bound of the page numbers to visit.
        """
        for i in range(1, page):
            self.ip_list(self.get_html(i))

    def query(self):
        """Print ``<col0>:<col1>`` for every row of table ``ip`` in ``ip.db``.

        A final line reports the number of rows. The database file is
        looked up relative to the current working directory.
        """
        conn = sqlite3.connect('ip.db')
        try:
            # fetchall() materialises the rows; a cursor has no len().
            rows = conn.execute('select * from ip;').fetchall()
        finally:
            conn.close()
        for row in rows:
            print(str(row[0]) + ":" + str(row[1]))
        print('rows length:%s' % len(rows))
def CreateTable(tableName):
    """Create ``<tableName>.db`` with a single-column ``ip`` table.

    The column ``ip`` is the primary key, so each stored value is unique.
    The call is idempotent: an existing table is left untouched.

    Args:
        tableName: Database file name without the ``.db`` suffix.
    """
    conn = sqlite3.connect(tableName + '.db')
    try:
        conn.execute(
            '''CREATE TABLE IF NOT EXISTS ip(ip VARCHAR(20) PRIMARY KEY)
            ''')
        conn.commit()
    finally:
        # Release the file handle even when the statement fails.
        conn.close()
def InsertTable(tableName, list_tr):
    """Store the string form of each element of ``list_tr`` in ``<tableName>.db``.

    The database is created through ``CreateTable`` when the file does not
    exist yet. Values are written with ``REPLACE``, so duplicates collapse
    onto one row.

    Args:
        tableName: Database file name without the ``.db`` suffix.
        list_tr: Sequence of values; each is converted with ``str()``.
    """
    # NOTE(review): the first element is skipped, as in the original code.
    # That matches a table header row, but the caller in this file passes
    # <ip> tags, so the first address is dropped -- confirm this is intended.
    # NOTE(review): str() of a BeautifulSoup tag is its markup, not only its
    # text -- confirm that storing the markup is intended.
    rows = [(str(tr),) for tr in list(list_tr)[1:]]
    if not rows:
        return  # nothing to store; do not create an empty database file

    db_path = tableName + '.db'
    if not os.path.exists(db_path):
        CreateTable(tableName)

    conn = sqlite3.connect(db_path)
    try:
        # Bound parameters keep quotes or column-like text in a value from
        # changing the meaning of the statement.
        conn.executemany("REPLACE INTO ip(ip) VALUES (?)", rows)
        conn.commit()
    finally:
        conn.close()
def ReadHtml(html, response):
    """Undo the HTTP content coding applied to a response body.

    Args:
        html: Raw body bytes as read from ``response``.
        response: Object whose ``info().get('Content-Encoding')`` yields the
            content coding, or ``None`` when the header is absent.

    Returns:
        The decompressed bytes for ``gzip``/``x-gzip`` and ``deflate``;
        the input unchanged for any other or missing coding.

    Raises:
        zlib.error: if the body does not match the declared coding.
    """
    # Content-coding tokens are case-insensitive, so normalise before comparing.
    encoding = (response.info().get('Content-Encoding') or '').strip().lower()
    if encoding in ('gzip', 'x-gzip'):
        # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
        return zlib.decompress(html, 16 + zlib.MAX_WBITS)
    if encoding == 'deflate':
        try:
            # Try a raw deflate stream (no zlib header) first ...
            return zlib.decompress(html, -zlib.MAX_WBITS)
        except zlib.error:
            # ... then fall back to a zlib-wrapped stream.
            return zlib.decompress(html)
    return html
def LthLog(logfile):
    """Write ``logfile`` to standard output, followed by a newline.

    Despite its name the argument is the message (any printable object),
    not a file.
    """
    message = logfile
    print(message)
def ScratchIp(ip, port, types):
    """Query the WeChat DNS endpoint through one proxy and store the result.

    The XML response is searched for ``<domain name="...">`` elements. For
    each one, its ``<ip>`` children are passed to ``InsertTable``, using the
    domain name with ``.`` replaced by ``x`` as the table (file) name.

    Args:
        ip: Proxy host.
        port: Proxy port, as a string.
        types: Proxy protocol label (e.g. ``"HTTP"``); lower-cased and used
            as the scheme key of the proxy mapping.

    Any failure while fetching or decoding the response prints ``error``
    and returns, so one bad proxy does not abort the caller's loop.
    """
    time.sleep(1)  # throttle: at most one probe per second

    # Build the mapping directly. The original assembled a string from
    # scraped text and ran eval() on it, which would execute whatever the
    # proxy listing page put into those cells.
    proxy = {types.lower(): ip + ":" + port}
    # NOTE(review): the target URL is http://, and urllib applies a proxy only
    # for the scheme it is registered under, so an "https" entry would leave
    # this request un-proxied -- confirm intended.
    LthLog(proxy)
    opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy))

    try:
        with opener.open("http://dns.weixin.qq.com/cgi-bin/micromsg-bin/newgetdns", timeout=2) as response:
            html = ReadHtml(response.read(), response)
        htmls = html.decode()
    except Exception:
        # Deliberately broad: free proxies fail in many ways (refused,
        # timed out, garbage payload). Unlike a bare ``except`` this still
        # lets KeyboardInterrupt / SystemExit propagate.
        print("error")
        return

    soup = BeautifulSoup(htmls, 'xml')
    for iptable in soup.findAll('domain', {"name": True}):
        # Use the tag already in hand instead of searching the document
        # again by name, which always returned the first match.
        # NOTE(review): the domain name comes from the response and becomes a
        # file name via InsertTable -- confirm it cannot contain path separators.
        TableName = iptable.attrs['name'].replace('.', 'x')
        InsertTable(TableName, iptable.find_all('ip'))
    LthLog("insert ok")
if __name__ == '__main__':
    # Script entry point: crawl the proxy listings with the default page
    # range and probe through every proxy found.
    KuaiDaili().batch_insert()