使用 Python 爬取网页版优酷视频《我不是药神》的弹幕数据并制作词云图

 首先,播放影片并打开Chrome开发者工具,选择Network。逐步拖动进度条,观察浏览器向服务器发出弹幕请求的规律,如图(danmu_request_url.png):

  然后,确定弹幕数据是通过JS(JSONP)实时加载的,而不是XHR。需要注意的是,返回的数据被包裹在jQuery回调函数中,并不是规范的JSON格式,因此不能直接用json解析,下文改用正则表达式提取弹幕内容。如图:
danmu_json_content

from fake_useragent import UserAgent
from requests.exceptions import RequestException
from tqdm import tqdm
import requests
import time
import os
import re
'''
想要学习Python?Python学习交流群:984632579满足你的需求,资料都已经上传群文件,可以自行下载!
'''
def get_data(mat):
    """
    Fetch one batch of Youku danmaku (bullet comments) and append it to a file.

    The comments are requested from the danmaku list endpoint, extracted from
    the response text with a regular expression, appended to
    ``./utils/danmu.txt`` and returned.

    :param mat: offset substituted into the ``mat`` query parameter of the
        request URL
    :return: list of the comment strings found in the response, or None when
        the HTTP status is not 200 or the request raises RequestException
    """
    # Request URL. The ``jsoncallback`` parameter suggests the response is
    # JSONP-wrapped rather than plain JSON, hence the regex parsing below.
    url = 'https://service.danmu.youku.com/list?jsoncallback=jQuery111207035726936412456_1552483671572&mat={}&mcount=1&ct=1001&iid=959955945&aid=333822&cid=96&lid=0&ouid=0'.format(mat)
    headers = {
        'Referer': 'https://v.youku.com/v_show/id_XMzgzOTgyMzc4MA==.html?spm=a2h0k.11417342.soresults.dplaybutton&s=c6c62a475a5d4a14ab48',
        'User-Agent': UserAgent().random
    }
    # NOTE: the browser also sends a timestamp-like ``_`` query parameter;
    # per the original author's note, omitting it does not affect the data.

    # Keep the try body to the network call only. The timeout prevents a
    # stalled connection from hanging the crawl; requests' Timeout is a
    # RequestException, so it is reported by the same handler.
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException as e:
        print('Error: ', e.args)
        return None

    print(response)
    if response.status_code != 200:
        return None

    # Regex extraction (re.findall returns a list of the captured groups).
    results = re.findall(r',"content":"(.*?)",', response.text, re.S)

    # Append the batch to the text file. Only the parent directory is
    # created: calling mkdir on the file path itself would either fail or
    # create a directory named "danmu.txt" and break the open() below.
    save_path = './utils/danmu.txt'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'a', encoding='utf-8') as f:
        f.write(str(results))
    return results

if __name__ == '__main__':
    # Crawl offsets 0..9, pausing one second before each request while
    # tqdm renders a progress bar labelled 'Progress'.
    progress = tqdm(range(10), desc='Progress')
    for offset in progress:
        time.sleep(1)
        get_data(str(offset))

猜你喜欢

转载自blog.csdn.net/fei347795790/article/details/88692084
今日推荐