哔哩哔哩小爬虫(爬取某一up主全部投稿视频的相关信息)

笔者最近需要一个 bilibili 的小爬虫,用来爬取某一个 up 主全部投稿视频的相关信息,包括点赞数量、投币数量、分享数量、视频时长等。

直接贴代码(依赖第三方库 requests、lxml、xlwt;运行前请把 headers 中的 cookie 换成自己的):

import requests
import json
from lxml import etree
import time
import xlwt
import re

# HTTP headers passed via ``headers=`` to the requests made further down.
# The cookie value is a placeholder and has to be replaced before running.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.121 Safari/537.36"
    ),
    "cookie": "your cookie",
}

# Just fill in the user name, user id and number of video pages below.
target_users = [{'user_name': '新华社', 'target_user_id': '473837611', 'pages_num': 33}]  # id and page count of each user to crawl

# Header-row titles in column order; a video's tags are written from the
# last titled column (index 10) onwards, one tag per cell.
_COLUMN_TITLES = ['视频名称', '发布时间', '视频时长', '投币数量', '点赞数量', '分享数量',
                  '评论数量', '弹幕数量', '播放量', '收藏数量', '标签']
_SEARCH_URL = "https://api.bilibili.com/x/space/arc/search"
_TAG_URL = 'https://api.bilibili.com/x/web-interface/view/detail/tag'
_PUBLISH_TIME_RE = re.compile(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})")
_REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection blocks forever


def _get_json_data(url, params):
    """Request *url* with *params* and return the ``'data'`` member of its JSON body.

    Raises ``requests.HTTPError`` on an HTTP error status, ``ValueError`` when
    the body is not JSON and ``KeyError`` when ``'data'`` is missing.
    """
    response = requests.get(url, params=params, headers=headers, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return json.loads(response.text)['data']


def _stat_text(selector, css_class):
    """Return the whitespace-stripped text of the first ``<span class=css_class>``.

    Raises ``IndexError`` when the page contains no such span.
    """
    span = selector.xpath("//span[@class='" + css_class + "']")[0]
    # A plain strip() removes spaces and newlines on both sides in one go, so
    # every counter is cleaned the same way.
    return span.xpath("text()")[0].strip()


def _scrape_video(video):
    """Collect one spreadsheet row for a ``vlist`` entry.

    Args:
        video: one item of the search API's ``vlist``; the keys ``title``,
            ``length``, ``bvid``, ``comment``, ``play`` and ``aid`` are read.

    Returns:
        A list with the ten fixed column values followed by the tag names.
    """
    title = video['title']
    length = video['length']
    bvid = video['bvid']
    comment = video['comment']
    view_num = video['play']
    aid = video['aid']
    video_url = 'https://www.bilibili.com/video/' + bvid

    time.sleep(1)  # throttle between requests
    tags = [tag_data['tag_name'] for tag_data in _get_json_data(_TAG_URL, {'aid': aid})]

    time.sleep(1)
    # Only the URL and headers are passed: a second positional argument to
    # requests.get() is ``params`` and would be appended to the query string.
    video_page = requests.get(video_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    selector = etree.HTML(video_page.content)
    coin_num = _stat_text(selector, 'coin')
    dm_num = _stat_text(selector, 'dm')
    like_num = _stat_text(selector, 'like')
    share_num = _stat_text(selector, 'share')
    collect_num = _stat_text(selector, 'collect')

    # NOTE(review): the publish time is taken from a second fetch of the same
    # page made without the custom headers, using the third timestamp-looking
    # match. Kept as is because the match position may depend on which page
    # variant is served -- confirm before merging the two requests.
    plain_page = requests.get(video_url, timeout=_REQUEST_TIMEOUT)
    publish_time = _PUBLISH_TIME_RE.findall(plain_page.text)[2]

    return [title, publish_time, length, coin_num, like_num, share_num,
            comment, dm_num, view_num, collect_num] + tags


for user in target_users:
    user_id = user['target_user_id']
    user_name = user['user_name']
    pages_num = user['pages_num']
    file_name = user_name + ".xls"
    excel = xlwt.Workbook(encoding='utf-8')
    sheet = excel.add_sheet('sheet1')
    for col, column_title in enumerate(_COLUMN_TITLES):
        sheet.write(0, col, column_title)
    count = 1  # next free row (row 0 is the header) == ordinal of the next video
    try:
        for page in range(1, pages_num + 1):
            print("正在爬取第" + str(page) + "页数据\n")
            try:
                search_params = {'mid': user_id, 'ps': 30, 'tid': 0, 'pn': page,
                                 'keyword': '', 'order': 'pubdate', 'jsonp': 'jsonp'}
                vlist = _get_json_data(_SEARCH_URL, search_params)['list']['vlist']
            except Exception as exc:
                # Best effort: report the failed page, keep what has been
                # collected so far and move on to the next page.
                print("第" + str(page) + "页爬取失败: " + repr(exc) + "\n")
                excel.save(file_name)
                continue
            for t in vlist:
                try:
                    row = _scrape_video(t)
                except Exception as exc:
                    # One broken video must not discard the rest of the page.
                    print("视频 " + str(t.get('bvid')) + " 爬取失败: " + repr(exc) + "\n")
                    continue
                for col, value in enumerate(row):
                    sheet.write(count, col, value)
                # Report before incrementing so the first video is number 1.
                print("已经爬取第" + str(count) + "条\n")
                count += 1
            time.sleep(3)
    finally:
        # Also reached on Ctrl-C, so an interrupted run still saves its rows.
        excel.save(file_name)

原创不易,尊重原作,转载请注明,转载自:https://blog.csdn.net/kr2563

猜你喜欢

转载自blog.csdn.net/kr2563/article/details/113383145