先使用进程池爬取豆瓣电影短评
import requests
import re
import random
import time
import pandas as pd
from pymongo import MongoClient
from multiprocessing import Pool
from fake_useragent import *
#简单反反爬措施,使用多个UA
ua = UserAgent()
#连接mongo数据库
conn = MongoClient('localhost')
db =conn.spyder
myset = db.douban
def get_db(page):
#随机获取UA抓取数据并判断状态码
headers = {"User-Agent":random.choice(ua)}
url ="https://movie.douban.com/subject/26752088/comments?start={}&limit=20&sort=new_score&status=P".format(page*20)
req = requests.get(url,headers=headers)
req.encoding ="utf-8"
if req.status_code ==200:
print("第%s页爬取成功"%str(page+1))
else:
print("第%s页爬取失败"%str(page+1))
result = req.text
# 将目标HTML使用正则表达式筛选信息并使用yield返回
pattern =re.compile('<a href="https://www.douban.com/people/[\s\S]*?" class="">([\s\S]*?)</a>[\s\S]*?<span class="short">([\s\S]*?)</span>')
rf = re.findall(pattern,result)
for i in rf:
yield {'name':i[0].strip(),
'content':i[1].strip()}
def save_to_mongo(file):
"""
插入数据库
"""
if myset.insert(file):
print('插入成功')
else:
print('失败')
def main(page):
"""
主函数
"""
for file in get_db(page):
save_to_mongo(file)
if __name__ =="__main__":
#使用进程池提速抓取,可以考虑sleep
pool = Pool()
pool.map(main,[i for i in range(50)])
pool.close()
pool.join()
将数据从数据库取出并生成词云
import jieba
from wordcloud import WordCloud
import pymongo
import matplotlib.pyplot as mp
from jieba import analyse
text = None
with pymongo.MongoClient('localhost') as client:
#获取集合
db = client.spyder
myset = db.douban
#打印数据库总评论数
print('count:',myset.estimated_document_count())
cursor = myset.find()
#遍历数据,这里只遍历短评数
text = ''.join(map(lambda doc:doc.get('content'),cursor))
#对短语数据文本分词
#添加自定义分词
[jieba.add_word(k) for k in []]
#取Top50的词生成词云
tags = analyse.extract_tags(text,topK=50,withWeight=False)
new_text = ' '.join(tags)
print(new_text)
#对分词文本生成词云
#生成词云,需要指定中文字体,否则无法生成中文词云
image=mp.imread('./wc.jpg')
wc = WordCloud(
#设置背景色,默认黑色
# background_color = 'white',
#设置词云最大单词数
max_words=200,
#设置词云中字号最大值
#max_font_size = 80,
#设置词云图片宽,高,
# mask=image,
width=768,
height=1024,
#设置词云文件字体(美化和解决中文乱码问题)
font_path='STZHONGS.TTF'
).generate(new_text)
#绘图(标准长方形图)
mp.imshow(wc,cmap='rainbow',interpolation='bilinear')
mp.figure('db.wordcloud')
mp.axis('off')
#将图片输出到文件
# wc.to_file(r'./images/wc.png')
mp.show()
生成图片可以明显知道哪些高频词语