【Python 微博爬虫】Python 实现微博爬虫

前言—功能：这个程序可以按自定义的时间间隔爬取指定用户的微博动态，并在被监控的用户发布新微博时调用短信接口提醒使用者（因为微博本身就有“特别关注”这个功能，所以这个实战也仅仅只是出于兴趣）

一、简介：

这个微博爬虫是基于一个比较古老的微博版本的，那个时候的微博还可以直接爬网页来获取用户的微博内容，网址是：https://weibo.cn

二、准备阶段：

首先，爬取的时候需要带上 cookie，所以应该先登录自己的账户来获取 cookie，登录网址为：https://passport.weibo.cn/signin/login 【注】微博账号可以申请一个小号，因为如果被发现的话，微博是只封账号、不封 IP 的，所以注册个小号是比较保险的
获取 cookie ：按 F12 打开控制台，输入 document.cookie 这时，控制台就会打出此时的 cookie
获取目标用户的 UID ：只需要搜索目标用户的用户名即可，然后选择需要监控的页面，将此时的浏览器地址栏里面的地址复制一下即可，里面已经带有了 UID

三、模拟请求 header

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0'
          ,'Cookie':'此处填入一开始从浏览器获得的cookie'}

以上是一个改造过的 header：其中的 User-Agent 用来模拟浏览器，防止被发现是机器人在获取网页；Cookie 则带入个人登录信息，从而跳过登录

四、

决定好要爬取几个人的信息：

这里我决定的是爬取两个人的信息，所以我定义了两个数组。

latested_msg=["",""]#记录每个人最新的时间
latested_content=["",""]#记录每个最新微博，防止时间改变，重复发短信

创建本地文件：
首先我的思路是这个样子的，因为我的电脑不可能一直都处于开机状态，所以每次我启动这个程序，用来存历史信息的数组在初始化的时候都会初始化为空，然后微博上爬下来的内容相对于空来说是种更新状态，而不论是不是真的更新了，所以我就创建了一个本地文件，每当用户更新的时候，重写一下文件，每当我重启程序的时候，都会读取这个本地文件来初始化历史信息数组，这样就可以防止短信浪费，毕竟短信也是需要花钱的。
```
#这个本地文件需要注意格式，请读者注意一下，体会一下代码是如何写入的
        if not os.path.exists(dir_path):  # 创建文件夹
            os.mkdir(dir_path)
        path = dir_path + TXT_path
        with open(path, "w") as f:  # 写入更新数据，重写每一个user
            for i in range(0,len(latested_msg)):
                f.write(user[i]+"\n")#写入用户
                f.write(latested_msg[i]+"\n")#写入最新的微博时间
                f.write(latested_content[i]+"\n")#写入最新的微博内容
```

五、处理短信接口
这里的短信接口我使用的是榛子云科技所提供的短信接口，这个个人用户只能使用应用名称为【测试】的应用，不过不影响使用，像其他的如：阿里、腾讯等等，如果发送自定义短信内容的，都需要认证，这就很烦，还好找到了这个，不过这个得花钱，初始免费短信只有1条，后来我又充值了10块共270条短信，网址：http://sms_developer.zhenzikj.com/zhenzisms_user/login.html

六、遇到的问题

也许是微博为了防止爬虫，爬取下来的网页代码并不能直接用 Beautiful Soup 整体解析，因为它在中间插入了好多 </html> 标签，这就引起了解析错误
解决办法：观察微博网页结构，采用 split() 函数将每个微博分割开来，最后单独处理所获得的第 0 个元素

七、完整代码 + 解释

import bs4
from bs4 import BeautifulSoup
import time
from requests import Session
import os

"""发送实体类"""
import urllib.request
import urllib.parse
import ssl


class ZhenziSmsClient(object):
    """Small HTTP client for the Zhenzi SMS service.

    Every call is a form-encoded POST to an endpoint below ``apiUrl`` that
    carries the application credentials together with the call-specific
    fields.

    NOTE(review): TLS certificate verification is disabled for these
    requests, exactly as in the original code. The difference is that the
    unverified context is now passed per request instead of being installed
    as the process-wide default, so other HTTPS users in the same process
    keep their normal verification.
    """

    def __init__(self, apiUrl, appId, appSecret):
        """Store the service base URL and the application credentials.

        Args:
            apiUrl: Base URL of the service, without a trailing slash.
            appId: Application id sent with every request.
            appSecret: Application secret sent with every request.
        """
        self.apiUrl = apiUrl
        self.appId = appId
        self.appSecret = appSecret

    def _post(self, path, fields=None):
        """POST the credentials plus ``fields`` to ``apiUrl + path``.

        Args:
            path: Endpoint path starting with ``/``.
            fields: Optional mapping of extra form fields, appended after
                the credentials in their own order.

        Returns:
            The raw response body as ``bytes``.

        Raises:
            urllib.error.URLError: If the request cannot be completed.
        """
        payload = {
            'appId': self.appId,
            'appSecret': self.appSecret,
        }
        if fields:
            payload.update(fields)
        body = urllib.parse.urlencode(payload).encode('utf-8')
        req = urllib.request.Request(self.apiUrl + path, data=body)
        # Scoped to this request only; see the class docstring.
        context = ssl._create_unverified_context()
        # The context manager closes the response (and its socket) even
        # when reading fails.
        with urllib.request.urlopen(req, context=context) as res_data:
            return res_data.read()

    def send(self, number, message, messageId=''):
        """Send ``message`` to the phone ``number``.

        Args:
            number: Recipient phone number.
            message: Text of the SMS.
            messageId: Optional caller-chosen id for the message.

        Returns:
            The response body decoded as UTF-8 text.
        """
        res = self._post('/sms/send.do', {
            'message': message,
            'number': number,
            'messageId': messageId,
        })
        return res.decode('utf-8')

    def balance(self):
        """Query the account balance endpoint.

        Returns:
            The raw response body as ``bytes`` (not decoded).
        """
        return self._post('/account/balance.do')

    def findSmsByMessageId(self, messageId):
        """Look up a previously sent SMS by its ``messageId``.

        Returns:
            The raw response body as ``bytes`` (not decoded).
        """
        return self._post('/smslog/findSmsByMessageId.do',
                          {'messageId': messageId})
"""结束"""

# Shared monitoring state, read and updated by the functions below.
isBreakDown = False  # True after a failed fetch, reset by the next successful one
name = ""  # display name parsed from the most recently fetched page
latested_msg = [""] * 2  # per monitored user: time text of the newest post seen
latested_content = [""] * 2  # per monitored user: text of that post, so a changed time label alone does not trigger another SMS
# Request headers: a browser-like User-Agent plus the logged-in session cookie.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0',
    'Cookie': '填入所获取到的Cookie',
}
          
def getHtml(url,op,user):
    """Fetch one monitored page, print its newest posts and notify on a new one.

    The page is cut into per-post chunks on the ``<div class="s"></div>``
    separator and each chunk is parsed on its own. Only the first chunk
    that carries a ``span.ct`` time tag is considered as "the newest post":
    when its time text marks it as posted today and both the time text and
    the content differ from the remembered values for slot ``op``, the
    state is updated, ``changeLog(user)`` is called and ``send`` is invoked.

    Args:
        url: Address of the page to fetch.
        op: Index of this user in ``latested_msg`` / ``latested_content``.
        user: List of all monitored page addresses; forwarded to
            ``changeLog``.

    Side effects:
        Updates the globals ``isBreakDown`` and ``name``, mutates
        ``latested_msg`` / ``latested_content``, and prints progress.
        Any failure is reported by setting ``isBreakDown`` to True rather
        than by raising, so the polling loop keeps running.
    """
    global isBreakDown
    global name
    code=None
    separator='<div class="s"></div>'
    post_marker="<div class=\"c\" id="
    try:
        # Close the session when done, and bound the wait so a stalled
        # connection cannot block the polling loop forever.
        with Session() as s:
            r=s.get(url,headers=header,timeout=10)
        code=r.status_code
        r.raise_for_status()
        r.encoding="UTF-8"
        # The request succeeded: announce recovery if the last one failed.
        if isBreakDown:
            print("<>网络连接成功，继续监控...")
        isBreakDown=False

        name1=BeautifulSoup(r.text,"html.parser").find("div",class_="ut")
        if isinstance(name1,bs4.element.Tag):
            name=name1.text.split("/")[0].replace(" ","_")
            print(name) # print the user name
        # Everything after the last separator is page footer, not a post.
        Items=r.text.split(separator)[:-1]

        isFirst = True
        for index,item in enumerate(Items):
            if index==0:
                # The first chunk also holds the page header; keep only the
                # part starting at the first post container.
                items=item.split(post_marker)
                if len(items)<2:
                    continue # no post container in this chunk
                item=post_marker+items[1]
            item=BeautifulSoup(item,"html.parser")
            span=item.find("span",class_="ct")
            if isinstance(span,bs4.element.Tag):
                posted_today="今天" in span.text or "分钟前" in span.text or "刚刚" in span.text
                if isFirst and posted_today:
                    # Text before the like counter, stored as the repr of
                    # its UTF-8 bytes (per the original author's note, to
                    # keep emoji out of the saved text).
                    # NOTE(review): the paired replace() calls look
                    # identical here; one of each pair may originally have
                    # targeted a non-breaking space - confirm.
                    itemText=str(item.text.replace("  ","_").replace(" ","_").split("赞")[0].encode("utf-8"))
                    text=span.text.replace(" ","_").replace(" ","_")
                    if latested_msg[op]!=text and latested_content[op]!=itemText:
                        print("微博更新,内容为：{}".format(item.text))
                        # Update both remembered values together, persist
                        # them, then notify.
                        latested_msg[op]=text
                        latested_content[op]=itemText
                        changeLog(user)
                        send(name,latested_msg[op])
                # Only the first time-tagged chunk is ever compared.
                isFirst=False
            if index <5:
                print("最新第{}条微博，内容是:{}".format(index+1,item.text))
        print()
    except Exception:
        # Exception (not a bare except) so Ctrl+C still stops the program.
        isBreakDown=True
        print("<>网络连接中断，尝试重新连接中...\n--错误代码【{}】".format(code))

def send(name,time):
    """Announce that a notification is being sent.

    As written, this only prints a fixed message; the actual SMS call is
    the commented-out template below, which must be filled in with real
    credentials and a phone number before it is enabled.

    Args:
        name: Display name of the user whose page changed (used only by
            the commented-out template).
        time: Time text of the new post (used only by the commented-out
            template). Inside this function the parameter hides the
            ``time`` module.
    """
    print("开始发送")
    # try:
    #     """SMS is sent through the Zhenzi Tech API (http://smsow.zhenzikj.com/); 10 CNY was paid for 270 messages in total. See the developer documentation on the official site for usage details."""
    #     client = ZhenziSmsClient("https://sms_developer.zhenzikj.com","AppId", "AppSecret")
    #     result = client.send('发送的电话号码', '注意!用户:{},微博已更新,时间:{},详情微博查看'.format(name,time))
    #     # print(result)
    #     # print(client.balance())
    # except:
    #     print("<>发送短信异常")

def init(log_path=None):
    """Restore the remembered newest-post state from the local log file.

    The log consists of three-line records: a page address (recognised by
    containing ``https://``), the time text, and the content text. Records
    are assigned to ``latested_msg`` / ``latested_content`` in the order
    they appear. Lines consumed as time/content are not scanned again, so
    a saved post whose text contains a link is not mistaken for the start
    of a new record.

    Args:
        log_path: Optional path of the log file. When omitted, the
            built-in Windows location is used and its folder is created
            if missing.

    Prints ``init failed`` when the file cannot be read (for example on
    the very first run); the state lists are left untouched in that case.
    A truncated final record and records beyond the capacity of the state
    lists are ignored.
    """
    dir_path=r'D:\Wei_Bo_Jian_Kong'
    TXT_path=r'\log.txt'
    try:
        if log_path is None:
            if not os.path.exists(dir_path): # create the log folder
                os.mkdir(dir_path)
            log_path=dir_path+TXT_path
        # Default encoding on purpose: it must match what changeLog used
        # when writing the file.
        with open(log_path,"r") as f:
            lines=f.read().split("\n")
    except (OSError, UnicodeError):
        print("init failed")
        return

    capacity=min(len(latested_msg),len(latested_content))
    who=0
    pos=0
    while pos+2<len(lines) and who<capacity:
        if "https://" in lines[pos]:
            latested_msg[who]=lines[pos+1]
            latested_content[who]=lines[pos+2]
            who+=1
            pos+=3 # skip the two lines just consumed
        else:
            pos+=1

def changeLog(user, log_path=None):
    """Rewrite the local log file with the current remembered state.

    One three-line record is written per monitored user: the page address
    from ``user``, then the matching entries of ``latested_msg`` and
    ``latested_content``. The whole text is assembled before the file is
    opened, so a failure while building it leaves the previous log intact
    instead of a truncated one.

    Args:
        user: List of monitored page addresses, in the same order as the
            state lists.
        log_path: Optional path of the log file. When omitted, the
            built-in Windows location is used and its folder is created
            if missing.

    Prints ``change failed`` instead of raising when anything goes wrong,
    so a logging problem never stops the monitoring loop.
    """
    dir_path = r'D:\Wei_Bo_Jian_Kong'
    TXT_path = r'\log.txt'
    try:
        records = "".join(
            page + "\n" + stamp + "\n" + content + "\n"
            for page, stamp, content in zip(user, latested_msg, latested_content)
        )
        if log_path is None:
            if not os.path.exists(dir_path):  # create the log folder
                os.mkdir(dir_path)
            log_path = dir_path + TXT_path
        # Default encoding on purpose: init reads the file the same way.
        with open(log_path, "w") as f:
            f.write(records)
    except Exception:
        print("change failed")

if __name__ == '__main__':
    # Page addresses of the monitored users; replace the placeholders.
    user = ["用户1的页面", "用户2的页面"]
    print("<>开机启动，等待网络连接...")
    init()  # restore the last remembered state before polling
    # time.sleep(30)  # optional 30 s wait for the network after boot (disabled)
    print("--开始监控--")
    while True:
        for index, page in enumerate(user):
            getHtml(page, index, user)
            # Wait 10 s after a failed fetch, otherwise 3 s before the next page.
            # NOTE(review): the original comments mentioned 3 s and 5 min
            # (60 * 5) here; the values below are what the code actually used.
            time.sleep(10 if isBreakDown else 3)

猪猪传奇

发布了83 篇原创文章 · 获赞 15 · 访问量 4万+

私信关注

【Python 微博爬虫】Python 实现微博爬虫

猜你喜欢