百度飞桨小白逆袭大神之鲤鱼跃龙门

小白逆袭大神之综合大作业


这里是三岁,转眼间已经到了大作业了,成功与否最后一搏,最后给小白一点建议吧!加油!等到全部结束后回过头再看一遍效果特别好,会发现忽然间恍然大悟!

综合大作业

在这里插入图片描述
一:先爬取评论
二:数据处理,分词,清洗
三:词频统计
四:词云生成

其他具体的在文章里面已经很齐全了
不需要多说什么了

小白专属嘛,怕大家没有爬取过评论、没有做过词云,在这里把前段时间做的B站up主评论爬取和中文词云制作模板分享出来,供大家参考

'''
B站爬取
https://api.bilibili.com/x/v1/dm/list.so?oid=837806779 弹幕api
https://api.bilibili.com/x/v2/reply?type=1&oid=837806779&&pn=1 评论api
弹幕接口只能使用oid,目前抓包还没有找到集中列出oid的地址
bug:部分网站没有那么严格按照['data']['replies']['content']['message']的顺序来
'''
#导入库
import requests
from bs4 import BeautifulSoup
import re
import json


def Gethtml(url, timeout=10):
    """Fetch a page and return its body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises a timeout error. Defaults to 10.

    Returns:
        The response body, decoded with the encoding detected from its content.
    """
    # Send a desktop-browser User-Agent header with the request.
    kv = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36"}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    r = requests.get(url, headers=kv, timeout=timeout)
    # Decode with the encoding guessed from the body itself.
    r.encoding = r.apparent_encoding
    return r.text


#获取up的uid
def u_id(uid_html):
    """Extract the uploader's numeric id from a search-result page.

    Args:
        uid_html: HTML text of the search page.

    Returns:
        The id as a string of digits, taken from the ``space.bilibili.com``
        link of the first ``<a class="title">`` element.

    Raises:
        ValueError: If the page has no such element, or its link does not
            contain a numeric id.
    """
    html = BeautifulSoup(uid_html, 'lxml')
    anchor = html.find(name='a', attrs={"class": "title"})
    # Example of the expected element:
    # <a class="title" href="//space.bilibili.com/390461123?from=search" ...>
    href = anchor.get('href', '') if anchor is not None else ''
    # Match only the digits that follow the host, so the result does not
    # depend on which other characters appear in the query string or title.
    match = re.search(r'space\.bilibili\.com/(\d+)', href or '')
    if match is None:
        raise ValueError('no uploader link found in the search page')
    uid = match.group(1)
    print('以获取up主的id为',uid)
    return uid


def a_id(aid_html):
    """Return the ``aid`` of every video listed in a JSON listing response.

    Args:
        aid_html: JSON text whose ``data -> list -> vlist`` entry is a list of
            objects that each carry an ``aid`` key.

    Returns:
        The ``aid`` values in the order they appear in the response.
    """
    payload = json.loads(aid_html)
    aid_list = [entry['aid'] for entry in payload['data']['list']['vlist']]
    print('已获得视频id长度为:',len(aid_list))
    return aid_list
 
def comment_save(name, comment_html):
    """Append the comments found in one API response to ``{name}.txt``.

    Args:
        name: Stem of the output file; text is appended to ``f'{name}.txt'``.
        comment_html: Response-like object whose ``text`` attribute holds the
            JSON body of a comment request.

    Each comment is written on its own line. A response that lacks usable
    ``data`` / ``replies`` / ``content`` / ``message`` entries contributes
    nothing and does not raise.
    """
    payload = json.loads(comment_html.text)
    if not isinstance(payload, dict):
        payload = {}
    # Not every response follows the data -> replies -> content -> message
    # layout; treat a null or missing level as "no comments".
    data = payload.get('data') or {}
    replies = data.get('replies') or []

    messages = []
    for reply in replies:
        content = reply.get('content') or {}
        message = content.get('message')
        if message:
            messages.append(message)

    if messages:
        print(messages)
        # Open the file once per response and end every comment with a
        # newline so that neighbouring comments do not run together.
        with open(f'{name}.txt', 'a+', encoding='utf-8') as f:
            f.writelines(f'{message}\n' for message in messages)

    print('提取完毕!')


def main(name, video_pages=9, comment_pages=14):
    """Crawl the comments under an uploader's videos into ``{name}.txt``.

    Args:
        name: Search keyword used to look up the uploader; also passed to
            ``comment_save`` as the output file stem.
        video_pages: Maximum number of video-listing pages (30 videos each)
            to request. Defaults to 9.
        comment_pages: Number of comment pages requested per video.
            Defaults to 14.
    """
    # Resolve the uploader id from the search page.
    url_uid = f"https://search.bilibili.com/all?keyword={name}"
    uid = u_id(Gethtml(url_uid))

    # Walk the listing pages; there may be fewer than video_pages of them.
    for page in range(1, video_pages + 1):
        aid_url = f"https://api.bilibili.com/x/space/arc/search?mid={uid}&ps=30&tid=0&pn={page}&keyword=&order=pubdate&jsonp=jsonp"
        aid_list = a_id(Gethtml(aid_url))
        if not aid_list:
            # NOTE(review): assumes an empty page marks the end of the
            # listing, so later pages are not requested.
            break

        # Fetch and store the comments of every video on this page.
        for aid in aid_list:
            for comment_page in range(1, comment_pages + 1):
                comment_uil = f"https://api.bilibili.com/x/v2/reply?type=1&oid={aid}&&pn={comment_page}"
                # A timeout keeps one stalled request from blocking the crawl.
                comment_html = requests.get(comment_uil, timeout=10)
                comment_save(name, comment_html)


main('敬汉卿')  # Run the crawl for this name.
# Other names listed by the author (presumably alternative arguments for main):
#贤宝宝baby
#老师好我叫何同学
#大祥哥来了
#女胖胖
#记录生活的蛋黄派

词云生成器(自己修改)

from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

d = path.dirname(__file__)

# Read the whole text; pick 'gbk' or 'utf-8' to match how the file was saved.
# The raw string keeps the Windows backslashes literal, and the with-block
# closes the file as soon as it has been read.
with open(r"D:\python3.7.4\爬虫\女胖胖.txt", 'r', encoding='utf-8') as f:
    t = f.read()
# Segment the text with jieba and join the tokens with spaces.
ls = jieba.lcut(t)
text = ' '.join(ls)
# The mask image should be a PNG with a white background, otherwise the
# result does not look good.
alice_coloring = np.array(Image.open(path.join(d, "手绘美女.png")))

# Stop words: start from the English defaults shipped with wordcloud and add
# your own (Chinese stop-word lists are long; find one online).
stopwords = {*STOPWORDS, " "}

# The mask argument sets the shape of the word cloud.
wc = WordCloud(
    mask=alice_coloring,
    stopwords=stopwords,
    font_path="msyh.ttc",
    background_color="white",
    max_words=2000,
    max_font_size=40,
    random_state=42,
)

# Colour generator that samples its colours from the mask image.
image_colors = ImageColorGenerator(alice_coloring)

# Lay out the word cloud from the space-separated text.
wc.generate(text)

# show
# With only the mask set, the word cloud takes the shape of the image.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
# Open a new figure so the next image is drawn separately.
plt.figure()
# recolor wordcloud and show
# we could also give color_func=image_colors directly in the constructor
# Recolouring this way takes the word colours from the given image.
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
# Third figure: the mask image itself, drawn with the gray colormap setting.
plt.figure()
plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
plt.axis("off")
# Display all figures created above.
plt.show()

在这里插入图片描述
李子柒最后的词云图,因为数据处理太麻烦,没有好好处理,大家多多包涵!!!

大家加油!时间不多,还得多多努力!!!奥利给!

发布了41 篇原创文章 · 获赞 124 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/weixin_45623093/article/details/105774630