python 代理 爬取不同ip段的微信服务器地址

项目需要收集微信的服务器 IP。微信之前也是通过 HTTP 请求获取 IP 地址的。参考网上资料写了个脚本:通过代理使用不同地区的 IP 发送同样的请求,然后将返回的 IP 地址存入数据库:

import urllib.request
import random
import zlib
from bs4 import BeautifulSoup
import sqlite3
import os
import requests
import time
import random


            
class KuaiDaili(object):
    """Scrape the free proxy listings on kuaidaili.com.

    Each listing row (ip, port, anonymity, protocol) is handed to
    ScratchIp(), which probes WeChat's DNS endpoint through that proxy.
    """

    def get_html(self, page):
        """Fetch one listing page and return its HTML decoded as UTF-8.

        Odd page numbers hit /free/inha/ (high-anonymity list), even ones
        /free/intr/ (transparent list), spreading requests over both.
        """
        header = {  # browser-like headers so the site serves the page
            "Host": "www.kuaidaili.com",
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }
        # One URL template instead of two duplicated request branches.
        listing = "inha" if page % 2 == 1 else "intr"
        url = "http://www.kuaidaili.com/free/%s/%s" % (listing, page)
        html = requests.get(url=url, headers=header, timeout=30).content
        return html.decode("utf-8")

    def ip_list(self, html):
        """Parse a listing page and feed every data row to ScratchIp()."""
        soup = BeautifulSoup(html, 'lxml')  # lxml: faster/stricter parser
        # All <tr> rows inside <div id="list">; row 0 is the table header.
        list_tr = soup.find('div', id='list').find_all('tr')
        for tr in list_tr[1:]:
            # td order on the page: ip, port, anonymity, type, (location,
            # speed, verify time — unused here).
            list_td = tr.find_all('td')
            ip = list_td[0].get_text()
            port = list_td[1].get_text()
            anonymous = list_td[2].get_text()
            LthLog(anonymous)
            types = list_td[3].get_text()
            ScratchIp(ip, port, types)

    def batch_insert(self, page=200):
        """Scrape listing pages 1..page-1 and probe every proxy found."""
        for i in range(1, page):
            self.ip_list(self.get_html(i))

    def query(self):
        """Print every row of ip.db as "col0:col1" plus a row count."""
        conn = sqlite3.connect('ip.db')
        try:
            cursor = conn.cursor()
            # fetchall() gives a list that supports len(); a bare cursor
            # does not, so the original row-count line raised TypeError.
            rows = cursor.execute('select * from ip;').fetchall()
            for row in rows:
                # NOTE(review): CreateTable defines a single 'ip' column, so
                # row[1] assumes a wider schema — confirm before relying on it.
                print(str(row[0]) + ":" + str(row[1]))
            # Was print('...%s', len(rows)) — printed the tuple instead of
            # formatting, and len() on the cursor would have raised anyway.
            print('rows length:%s' % len(rows))
        finally:
            conn.close()  # read-only path: nothing to commit



def CreateTable(tableName):
    """Create the SQLite file '<tableName>.db' with a single table.

    Schema: ip(ip VARCHAR(20) PRIMARY KEY) — one row per unique server IP.
    Raises sqlite3.OperationalError if the table already exists.
    """
    conn = sqlite3.connect(tableName + '.db')
    try:
        # try/finally so the connection is released even if DDL fails
        # (the original leaked the handle on error).
        conn.execute('CREATE TABLE ip(ip VARCHAR(20) PRIMARY KEY)')
        conn.commit()
    finally:
        conn.close()

def InsertTable(tableName, list_tr):
    """Store every entry of list_tr (minus the header at index 0) in
    '<tableName>.db', creating the database on first use.

    Each entry is stored as str(entry); REPLACE keeps ip values unique.
    """
    rows = list_tr[1:]  # index 0 is the table header, not data
    if not rows:
        return  # nothing to insert; don't create an empty database
    # Check/create the database once, not once per row as before.
    if not os.path.exists(tableName + '.db'):
        CreateTable(tableName)
    conn = sqlite3.connect(tableName + '.db')
    try:
        cursor = conn.cursor()
        for tr in rows:
            # Parameterized query instead of string-concatenated SQL:
            # scraped markup containing quotes would break (or inject
            # into) the original statement.
            cursor.execute("REPLACE INTO ip(ip) VALUES (?)", (str(tr),))
        conn.commit()  # single commit for the whole batch
    finally:
        conn.close()
    
def ReadHtml(html, response):
    """Return the response body *html*, decompressed if the server's
    Content-Encoding header says it is gzip- or deflate-compressed."""
    encoding = response.info().get('Content-Encoding')
    if encoding == 'gzip':
        # 16 + MAX_WBITS tells zlib to expect a gzip wrapper.
        return zlib.decompress(html, 16 + zlib.MAX_WBITS)
    if encoding == 'deflate':
        # Some servers send raw deflate streams, others zlib-wrapped
        # ones; try raw first and fall back to the wrapped form.
        try:
            return zlib.decompress(html, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(html)
    return html  # no (known) compression: pass the bytes through


def LthLog(logfile):
    """Crude logging shim: echo the given value to stdout."""
    print(str(logfile))

def ScratchIp(ip, port, types):
    """Probe WeChat's DNS endpoint through the given proxy and store the
    server IPs it returns.

    ip/port/types come straight from a kuaidaili listing row; *types* is
    the protocol name (e.g. "HTTP"). Each <domain name="..."> element of
    the XML response becomes one SQLite database (dots in the domain
    replaced by 'x') holding that domain's <ip> entries.
    """
    time.sleep(1)  # throttle so we don't hammer the endpoint
    # Build the proxy mapping directly instead of eval()-ing a string
    # assembled from scraped data — eval on untrusted input can execute
    # arbitrary code.
    proxy = {types.lower(): ip + ":" + port}
    httpproxy_handler = urllib.request.ProxyHandler(proxy)
    LthLog(proxy)
    opener = urllib.request.build_opener(httpproxy_handler)
    try:
        response = opener.open(
            "http://dns.weixin.qq.com/cgi-bin/micromsg-bin/newgetdns",
            timeout=2)
    except Exception:  # narrowed from bare except; a dead proxy is just skipped
        print("error")
        return

    html = response.read()
    html = ReadHtml(html, response)  # undo gzip/deflate if present
    htmls = bytes.decode(html)
    soup = BeautifulSoup(htmls, 'xml')
    # Every <domain> that carries a name attribute lists server IPs.
    nameTags = soup.findAll('domain', {"name": True})
    for iptable in nameTags:
        list_tr = soup.find('domain', attrs={'name': iptable.attrs['name']}).find_all('ip')
        # '.' is not usable in the db filename convention here; map to 'x'.
        TableName = iptable.attrs['name'].replace('.', 'x')
        InsertTable(TableName, list_tr)

    LthLog("insert ok")
   



if __name__ == '__main__':
    # Entry point: walk the proxy listings and probe WeChat through each one.
    KuaiDaili().batch_insert()
    
发布了40 篇原创文章 · 获赞 22 · 访问量 4万+

猜你喜欢

转载自blog.csdn.net/liutianheng654/article/details/86219687