Python: Crawling Sina Weibo Repost Relationships (No Cookies, No Login Required)

My research advisor recently wanted to analyze Weibo data, starting with repost relationships, which of course requires repost-relationship data. I looked through a lot of material, and there is mature code on GitHub, but I still decided to write my own crawler from scratch around my own requirements.

Crawling via m.weibo.cn requires neither cookies nor an account login, but you do have to parse JSON responses. The whole crawling process is fairly mechanical and tedious, but at least there is little worry about getting blocked.
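
As a quick illustration of the login-free JSON interface, the snippet below hits the same getIndex endpoint used later in the post (a minimal sketch; the user ID 1288739185 is the example from the main script, and the userInfo/screen_name fields reflect the response format at the time of writing, so they may change):

import requests

# One login-free request to the mobile JSON API (no cookies needed).
user_id = 1288739185
url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value={}'.format(user_id)
resp = requests.get(url)
data = resp.json().get('data', {})
print(data.get('userInfo', {}).get('screen_name'))   # the account's display name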

The detailed approach is in the code; the main work is obtaining the various IDs.

A brief outline of the approach:

  • Manually feed some big-V (influencer) user IDs into get_wbid.py to collect all post (weibo) IDs under each user ID
  • relationship.py reads the crawled post IDs, visits the page each post lives on, and crawls the data we need from there
  • Optimization: use relationship.py to crawl the user IDs of likers/commenters/reposters and feed them back in to enlarge the input of get_wbid.py (a feedback sketch is given after relationship.py below)
  • Open issues: this is only a minimal version, meant to be called as a tool/interface; to keep it crawling in the background you would still need to add timestamp handling, de-duplication, and so on
  • Note: Weibo pages come in two main types: user homepages (approach: get_wbid.py) and post detail pages (approach: relationship.py). You can reuse the same ideas to crawl other Weibo pages, since the principle is much the same. A proxy pool is wired into the requests and can be removed (a minimal helper is sketched right after this list).
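
For reference, the proxy handling repeated in both scripts boils down to the helper below (a minimal sketch, assuming a proxy pool is already running locally and serving random proxies at http://127.0.0.1:5555/random, as in the scripts; drop the proxies argument entirely if you do not need it):

import requests

PROXYPOOL_URL = 'http://127.0.0.1:5555/random'   # assumed local proxy pool endpoint

def get_proxies():
    # Ask the pool for one random proxy; fall back to a direct connection if the pool is down.
    try:
        proxy = requests.get(PROXYPOOL_URL, timeout=5).text.strip()
        return {'http': 'http://' + proxy}
    except requests.RequestException:
        return None

# Usage: requests.get(url, proxies=get_proxies())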

get_wbid.py

# Try to obtain all post (weibo) IDs belonging to a user from the user ID
# Input: user ID
# Output: post (weibo) IDs

import csv
import requests
import re
import time
import json

def get_user_containerid(user_id):   # containerid and user id are not the same: the containerid is needed to browse the user's lists/feed, while the user id only fetches the user's profile info
    url = 'http://m.weibo.cn/api/container/getIndex?type=uid&value={user_id}'.format(user_id=user_id)
    resp = requests.get(url)
    jsondata = resp.json()
    jsondata = jsondata['data']
    fans_id = jsondata.get('follow_scheme')
    items = re.findall(r"&lfid=(\w+)", fans_id, re.M)   # pull the containerid out of the follow_scheme URL
    for i in items:
        return i

def get_luicode_lfid(sheader):
    # Fetch the profile JSON once to extract luicode/lfid from the 'scheme' field
    # and the containerid of the 'weibo' tab (the user's post list).
    url = sheader
    proxypool_url = 'http://127.0.0.1:5555/random'
    proxies = {'http': 'http://' + requests.get(proxypool_url).text.strip()}
    response = requests.get(url, proxies=proxies)
    html = json.loads(response.content.decode('utf-8'))
    s = html.get('data').get('scheme')
    luicode = s[s.find('luicode=')+8:s.find('&lfid=')]
    lfid = s[s.find('&lfid=')+6:]
    containerid = ''
    for i in html.get('data').get('tabsInfo').get('tabs'):
        if i.get('tabKey') == 'weibo':
            containerid = i.get('containerid')
    return [luicode, lfid, containerid]

# Obtain the post IDs (bw_id) of a user's weibo posts
def get_bw_id(user_id,sheader): # user id and homepage URL prefix; pages through the feed via since_id
    b = True
    n = 0
    sid = '1'
    url = sheader
    error = {}   # URLs that have already been retried once after a JSON decode failure
    while b:
        try:
            n += 1
            print('Processing homepage --->', url)
            proxypool_url = 'http://127.0.0.1:5555/random'
            proxies = {'http': 'http://' + requests.get(proxypool_url).text.strip()}
            response = requests.get(url,proxies=proxies)
            html = json.loads(response.content.decode('utf-8'))

            if 'data' in html.keys():
                if 'since_id' in html.get('data').get('cardlistInfo'):
                    if  html.get('data').get('cardlistInfo').get('since_id') == sid:
                        break
                    elif sid == '':
                        break
                    else:
                        sid = html.get('data').get('cardlistInfo').get('since_id')

                else:
                    break

                if 'cards' in html.get('data'):
                    for i in html.get('data').get('cards'):
                        if i.get('mblog',-1) != -1:
                            screen_name = i['mblog'].get('user').get('screen_name')
                            content = [user_id,screen_name,i['mblog'].get('id')]
                            write_file(content)
                else:
                    break

            time.sleep(1)
        except Exception as e:
            print('Error requesting homepage --->', url)
            if str(e) == 'Expecting value: line 1 column 1 (char 0)' and error.get(url, -1) == -1:
                error[url] = 1
                n -= 1
                print('Retrying homepage --->', url)
                time.sleep(5)
            elif str(e) == 'Expecting value: line 1 column 1 (char 0)' and error.get(url, -1) == 1:
                time.sleep(5)
            else:
                b = False
                print('Error details:\n', e)
        url = sheader + '&since_id=' + str(sid)
    print('Homepages processed:', n)



def write_file(content):
    # Append one row; newline='' avoids blank lines on Windows, utf-8 keeps Chinese screen names intact
    with open('user+screen_name+bw.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(content)

if __name__ == '__main__':

    result_headers = [
        'user_id',
        'screen_name',
        'id',
    ]
    with open('user+screen_name+bw.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(result_headers)
    # Read '各界大v用户id.txt' here and iterate over it (a batch sketch follows this script)
    user_id = 1288739185
    containerid = get_user_containerid(str(user_id))
    sheader = 'https://m.weibo.cn/api/container/getIndex?uid=' \
              ''+str(user_id)+'&type=uid&value='+str(user_id)+\
              '&containerid='+str(containerid)
    l = get_luicode_lfid(sheader)
    sheader = 'https://m.weibo.cn/api/container/getIndex?uid=' \
              ''+str(user_id)+'&luicode='+str(l[0])+'&lfid='+str(l[1])+\
              '&type=uid&value='+str(user_id)+'&containerid='+str(l[2])
    get_bw_id(user_id,sheader)
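
To batch the script over many accounts instead of the single hard-coded user_id, the '各界大v用户id.txt' file mentioned in the comment could be read as below (a minimal sketch, assuming one numeric user ID per line; the file name comes from the comment above, and its actual format is up to you):

# Hypothetical batch driver: one user ID per line in '各界大v用户id.txt'.
with open('各界大v用户id.txt', 'r', encoding='utf-8') as f:
    user_ids = [line.strip() for line in f if line.strip()]

for user_id in user_ids:
    containerid = get_user_containerid(user_id)
    sheader = 'https://m.weibo.cn/api/container/getIndex?uid=' + user_id + \
              '&type=uid&value=' + user_id + '&containerid=' + str(containerid)
    l = get_luicode_lfid(sheader)
    sheader = 'https://m.weibo.cn/api/container/getIndex?uid=' + user_id + \
              '&luicode=' + str(l[0]) + '&lfid=' + str(l[1]) + \
              '&type=uid&value=' + user_id + '&containerid=' + str(l[2])
    get_bw_id(user_id, sheader)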

relationship.py

# Build repost relationships from post (weibo) IDs
# Input: user ID and post ID

import csv
import json
import requests
import time

# Crawl the repost timeline of a single post, page by page
def get_fs_info(u_id,u_screen_name,bw_id):
    b = True
    n = 0
    error = {}   # URLs that have already been retried once after a JSON decode failure
    while b:
        try:
            n += 1
            url = 'https://m.weibo.cn/api/statuses/repostTimeline?id=' + str(bw_id) + '&page=' + str(n)
            print('Processing -->', url)
            proxypool_url = 'http://127.0.0.1:5555/random'
            proxies = {'http': 'http://' + requests.get(proxypool_url).text.strip()}
            response = requests.get(url, proxies=proxies)
            html = json.loads(response.content.decode('utf-8'))
            if 'data' in html.keys():
                if 'data' in html.get('data').keys():
                    for i in html.get('data').get('data'):
                        fs_id = i.get('user').get('id')
                        fsbw_id = i.get('id')
                        screen_name = i.get('user').get('screen_name')
                        write_csv([u_id,u_screen_name,bw_id,fs_id,screen_name,fsbw_id])
            else:
                b = False
        except Exception as e :
            if str(e) == 'Expecting value: line 1 column 1 (char 0)' and error.get(url, -1) == -1:
                error[url] = 1
                n -= 1
                time.sleep(5)
                print('Retrying -->', url)
            elif str(e) == 'Expecting value: line 1 column 1 (char 0)' and error.get(url, -1) == 1:
                time.sleep(5)
            else:
                b = False
                print('Error:\n',e)
        time.sleep(1)

def write_csv(result_data): # result_data is a six-element list: [u_id, u_screen_name, bw_id, fs_id, fs_screen_name, fsbw_id]
    """Write one crawled record to the csv file"""
    try:
        with open('relationship.csv', 'a', encoding='utf-8', newline='') as f:   # plain utf-8 on append so the header's BOM is not repeated mid-file
            writer = csv.writer(f)
            writer.writerow(result_data)
    except Exception as e:
        print('Error: ', e)

if __name__ == '__main__':
    result_headers = [
        'user_id',
        'screen_name',
        'id',
        'fs_user_id',
        'fs_screen_name',
        'fs_id',
    ]
    with open('relationship.csv', 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(result_headers)
    # Read 'user+screen_name+bw.csv' here and iterate over it
    with open('user+screen_name+bw.csv', 'r', encoding='utf-8-sig') as f:
        for line in f:
            l = line.replace('\n', '').replace('\r', '').split(',')
            if l != [''] and l[0] != 'user_id':   # skip blank lines and the header row
                u_id = l[0]
                screen_name = l[1]
                bw_id = l[2]
                get_fs_info(u_id,screen_name,bw_id)
    # u_id = 1288739185
    # screen_name = '关晓彤'
    # bw_id = 4503001732052045
    # get_fs_info(u_id, screen_name, bw_id)
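
For the optimization mentioned at the top (feeding reposter IDs back into get_wbid.py), the relationship.csv output can be distilled into a fresh user-ID list roughly like this (a minimal sketch; the output file name new_user_ids.txt is my own choice, not part of the original scripts):

import csv

# Collect reposter user IDs from relationship.csv, de-duplicate them, and write
# them out so they can serve as new input for get_wbid.py.
seen = set()
with open('relationship.csv', 'r', encoding='utf-8-sig') as f:
    reader = csv.DictReader(f)
    for row in reader:
        fs_id = (row.get('fs_user_id') or '').strip()
        if fs_id:
            seen.add(fs_id)

with open('new_user_ids.txt', 'w', encoding='utf-8') as f:   # hypothetical output file name
    for uid in sorted(seen):
        f.write(uid + '\n')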


Reposted from blog.csdn.net/MaoziYa/article/details/106049338