获取豆瓣最新电影的ID号和电影名称
import requests
from bs4 import BeautifulSoup
url = "https://movie.douban.com/cinema/nowplaying/xian/"

# 1) Fetch the page. The timeout keeps the script from hanging forever on a
#    stalled connection, and raise_for_status() reports an HTTP error here
#    rather than as a confusing parse failure further down.
response = requests.get(url, timeout=10)
response.raise_for_status()
content = response.text

# 2) Parse the page and extract each movie's id and title.
soup = BeautifulSoup(content, 'lxml')
# First locate every <li class="list-item"> tag; each one carries the
# information for a single movie.
nowplaying_movie_list = soup.find_all('li', class_='list-item')

# Debug output: show the first matched tag and its type. Guarded so that an
# empty result does not raise IndexError.
if nowplaying_movie_list:
    print(nowplaying_movie_list[0])
    print(type(nowplaying_movie_list[0]))

# Collected records, one dict per movie:
# [{'title': ..., 'id': ..., 'actors': ..., 'director': ...}, ...]
movies_info = []
# Walk every <li> tag and pull out the attributes we need.
for item in nowplaying_movie_list:
    # item['data-title'] reads the value of the data-title attribute of the
    # <li> tag; the other fields are read the same way.
    nowplaying_movie_dict = {
        'title': item['data-title'],
        'id': item['id'],
        'actors': item['data-actors'],
        'director': item['data-director'],
    }
    movies_info.append(nowplaying_movie_dict)
print(movies_info)
获取指定电影的影评信息
# 目标:
# 1). 爬取某一页的评论信息;
# 2).爬取某个电影的前10页评论信息;
# 3). 获取所有电影的评论信息;
import threading
import requests
from bs4 import BeautifulSoup
# # 1). 爬取某一页的评论信息;
# Serializes the read-modify-write on the shared ``comments`` string, which
# is updated by several worker threads.
_comments_lock = threading.Lock()


def getOnePageComment(id, pageNum):
    """Fetch one page of short comments for a movie and append it to ``comments``.

    Args:
        id: Douban subject id of the movie; interpolated into the URL.
        pageNum: 1-based page number. Each page holds 20 comments.

    Side effects:
        Prints a progress line and appends the concatenated comment text of
        the page to the module-level ``comments`` string. When run from
        several threads, pages are appended in completion order, which is
        not necessarily page order.
    """
    # 1) Derive the ``start`` offset from the page number:
    #    page 1: https://movie.douban.com/subject/26425063/comments?start=0&limit=20&sort=new_score&status=P
    #    page 2: https://movie.douban.com/subject/26425063/comments?start=20&limit=20&sort=new_score&status=P
    #    page 3: https://movie.douban.com/subject/26425063/comments?start=40&limit=20&sort=new_score&status=P
    start = (pageNum - 1) * 20
    url = "https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P" % (id, start)

    # 2) Download the comment page. The timeout stops a stalled request from
    #    blocking this thread (and the join() waiting on it) forever.
    content = requests.get(url, timeout=10).text

    # 3) Parse the page with bs4. Every short comment lives in a
    #    <span class="short"> tag.
    soup = BeautifulSoup(content, 'lxml')
    commentsList = soup.find_all('span', class_='short')

    # Concatenate the text of every comment tag on this page.
    pageComments = "".join(commentTag.text for commentTag in commentsList)

    print("%s page" % (pageNum))

    # ``comments += ...`` is a load/concatenate/store sequence, so it must be
    # protected when several threads append concurrently.
    global comments
    with _comments_lock:
        comments += pageComments
# 2) Crawl the first 10 comment pages of one movie.
# NOTE: ``id`` shadows the builtin of the same name; the name is kept because
# it is a module-level variable other code may rely on.
id = '26425063'
comments = ''
threads = []
# One worker thread per page (pages are numbered 1..10); widen the range to
# fetch more pages.
for pageNum in range(1, 11):
    t = threading.Thread(target=getOnePageComment, args=(id, pageNum))
    threads.append(t)
    t.start()
# Wait for every worker thread to finish before the main thread continues.
for t in threads:
    t.join()
print("执行结束")
# Persist everything the workers collected into "<id>.txt".
with open("%s.txt" % (id), 'w', encoding='utf-8') as f:
    f.write(comments)
数据清洗
完整的分析过程:
- 数据的获取: 通过爬虫获取(urllib|requests<获取页面内容> + re|bs4<分析页面内容>)
- 数据清洗: 按照一定的格式对文本进行处理;
"""
import re
# 1. Clean the crawled comments: drop commas, full stops, emoji and any other
#    symbols, keeping only Chinese or English text.
with open('./doc/26425063.txt', encoding='utf-8') as f:
    comments = f.read()
# Done with a regular expression: each match is one run of CJK ideographs
# (U+4E00-U+9FA5) or one run of ASCII letters.
pattern = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')
deal_comments = re.findall(pattern, comments)
# Join the matches in one pass instead of growing the string with += in a loop.
newComments = ''.join(deal_comments)
print(newComments)
词云分析
import jieba
import wordcloud
import numpy as np
from PIL import Image
text = "马云曾公开表态称对钱没兴趣称其从来没碰过钱上了微博热搜"

# The default segmentation splits this sentence badly ('微博热' + '搜').
# Individual words could be forced with jieba.suggest_freq, but loading a
# user dictionary registers every word listed in the file at once.
jieba.load_userdict('./doc/newWord')

# Segment the Chinese text: lcut returns a list, whereas cut returns a
# generator.
result = jieba.lcut(text)
print("切分结果:", result)

# Render the word cloud.
cloud_options = {
    'background_color': 'snow',
    'font_path': './font/msyh.ttf',  # font to use when the data is Chinese
    'min_font_size': 5,              # smallest font size in the image
    'max_font_size': 50,             # largest font size in the image
    'width': 200,                    # width of the generated image
}
wc = wordcloud.WordCloud(**cloud_options)
joined_words = ",".join(result)
wc.generate(joined_words)
wc.to_file('./doc/douban.png')