Table of contents
1. Environment and description
2.1. The complete source code is as follows
3.2. Saved file content results
1. Environment and description
1.1. Environment
Operating system: win10 home edition
Editor:pycharmedu
Version: python 3.10
Libraries used: requests, re, numpy, Image, WordCloud,
Idea: Use the requests module to send requests, the re module to parse and extract the page source code, the numpy module to draw data visualization, Image to convert data, and WordCloud to draw word clouds.
1.2. Description
What we want to get this time is the content of the top 250 db movies: The Matrix 2 reviews (at least 100)
Data visualization is: word cloud chart
The location where the data is saved is: in the same directory as the current .py file
The location of the image to generate the word cloud is: in the same directory as the current .py file
Tip: The generated word cloud picture can be changed at will
The URL used will not be real, and the porting test must identify and change it.
2. Complete code
2.1. The complete source code is as follows
import requests
import re
import jieba
import numpy
import PIL.Image as Image
from wordcloud import WordCloud
from bs4 import BeautifulSoup
class GetDiscuss:
def __init__(self, text_one, text_two, headers):
print(text_one)
self.t_two = text_two
self.headers = headers
# 第一个请求,要拿到跳转到具体电影的页面url
def one_requests(self):
global one_tru
one_tru = [] # 保存全部分页url
one_url = "这里的代码应该是排行top250的url" # 拿到第一个界面的url
one_rep = requests.get(one_url, headers=self.headers) # 发起请求
one_text = one_rep.text # 第一个页面源代码
one_rep.encoding = 'utf-8' # utf-8方式编码第一个页面的源代码
one_result = BeautifulSoup(one_text, "html.parser")
one_div = one_result.find("div", class_="paginator").find_all("a")
global tru
tru = [] # 保存全部分页url
for i in one_div:
once_href = i.get("href")
urls = "排行top250的url" + once_href
tru.append(urls) # 每一个分页都存入
# print(tru)
# 第二次请求,进入具体电影页面,并且进入影评界面
def two_requests(self):
global three_tru
global f_hrefs
global f_href
three_tru = []
two_tru = tru
# print(two_tru)
two_rep = requests.get(two_tru[8], headers=self.headers) # 发起请求
two_text = two_rep.text # 第一个页面源代码
two_rep.encoding = 'utf-8' # utf-8方式编码页面的源代码
# 正则提取url,re.S规则是使.可以匹配空格
two_obj = re.compile(f'<em class="">228</em>.*?<div class="hd">.*?<a href="(?P<two_href>.*?)" class="">', re.S)
two_result = two_obj.finditer(two_text)
for i in two_result:
t_href = i.group("two_href")
# print(t_href) # 输出测试获取的链接是否是我们要的
# 第三次发起请求,进入完整的影评界面
three_rep = requests.get(t_href, headers=self.headers) # 发起请求
three_text = three_rep.text # 页面源代码
three_rep.encoding = 'utf-8' # utf-8方式编码页面的源代码
three_obj = re.compile(f'> <a href="(?P<three_href>.*?)" >', re.S)
three_result = three_obj.finditer(three_text)
for j in three_result:
f_href = j.group("three_href")
# print(f_href) # 输出测试获取的链接是否是我们要的 comments?sort=new_score&status=P
f_hrefs = "https://movie.douban.com/subject/1304141/" + f_href
# print(f_hrefs)
# 实际上是发起的第四次请求,分别拿到5页的评论,要拿更多需要更改上面的x元组
def three_requests(self):
global f_all_hrefs
global f_hrefs
f_all_href = f_hrefs.strip("?")[:-23] # 通过观察得出需要拼接部分的url,进行截取
# 拼接评论完整url
f_all_hrefs = f_all_href + "start={}&limit=20&status=P&sort=new_score"
x = [20, 40, 60, 80, 100, 120]
for k in range(0, 1):
f_all_url = f_all_hrefs.format(x[k])
# print(f_all_url)
four_rep = requests.get(f_all_url, headers=self.headers) # 发起请求
four_text = four_rep.text # 第一个页面源代码
print('4', four_rep.status_code)
# print(four_text)
four_rep.encoding = 'utf-8' # utf-8方式编码第一个页面的源代码
four_obj = re.compile(f'<span class="short">(?P<pape_dis>.*?)</span>', re.S)
four_result = four_obj.finditer(four_text)
for p in four_result:
ones_dis = p.group("pape_dis")
with open("all_dis.txt", mode="a", encoding='utf-8') as file:
file.write(ones_dis)
print(self.t_two)
def word_cloud():
with open("all_dis.txt", encoding='utf-8', mode='r') as f:
text1 = f.read()
text2 = jieba.cut(text1)
wordsDict = {}
for word in text2:
if len(word) == 1:
continue
elif word.isdigit() == True:
continue
elif word in wordsDict:
wordsDict[word] += 1
else:
wordsDict[word] = 1
wordsDict_seq = sorted(wordsDict.items(), key=lambda x:x[1], reverse=True) # 按字典的值降序排序
print(wordsDict_seq[:20])
mask_pic = numpy.array(Image.open("one.jpg"))
text3 = " ".join(jieba.cut(text1))
image = WordCloud(font_path="msyh.ttc", mask=mask_pic).generate(text3)
image = image.to_image()
image.show()
if __name__ == '__main__':
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0",
}
one_pr = "开始爬取"
two_pr = "爬取结束"
global f_hrefs
global f_href
global tru
global one_tru
global three_tru
global f_all_hrefs
example = GetDiscuss(one_pr, two_pr, header)
example.one_requests()
example.two_requests()
example.three_requests()
word_cloud()
3. Results
3.1. Result pictures
3.2. Saved file content results