# -*- coding: utf-8 -*-
"""
Created on Sun Dec 27 18:25:34 2020
@author: David
"""
#导入相关的包
from lxml import etree
import requests
import time
# Scrape the ten pages of the Douban book Top 250 list, print each book's
# details, and append the leading part of each book's author line to douban.txt.
# NOTE(review): the original indentation was lost; the nesting below (one inner
# pass per book table, with the author write inside it) is the reconstruction
# under which every name is defined where it is used -- confirm.
header = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
# The context manager closes (and flushes) the file even if a request or an
# XPath lookup raises part-way through. 'a+' appends, so re-running the script
# adds to whatever the file already holds.
with open(r'C:\Users\David\Desktop\douban.txt','a+',encoding='utf-8') as douban:
    for a in range(10):
        # The list is paginated through the `start` offset, 25 per request.
        url = 'https://book.douban.com/top250?start={}'.format(a*25)
        # A timeout keeps a stalled connection from hanging the script forever.
        data = requests.get(url,headers=header,timeout=10).text
        s=etree.HTML(data)
        # Each matched <table> is iterated as one book entry.
        file=s.xpath('//*[@id="content"]/div/div[1]/div/table')
        for div in file:
            title =div.xpath("./tr/td[2]/div[1]/a/@title")[0]
            href=div.xpath("./tr/td[2]/div[1]/a/@href")[0]
            score=div.xpath("./tr/td[2]/div[2]/span[2]/text()")[0]
            # Peel the surrounding parentheses and whitespace off the count text.
            num=div.xpath("./tr/td[2]/div[2]/span[3]/text()")[0].strip("(").strip(")").strip()
            # `author` stays a list of text nodes; it is printed as a list.
            author=div.xpath("./tr/td[2]/p[1]/text()")
            # The one-line quote is optional, so this list may be empty.
            scrible=div.xpath("./tr/td[2]/p[2]/span/text()")
            if scrible:
                print("{},{},{},{},{},{}\n".format(title,href,score,num,author,scrible[0]))
            else:
                print("{},{},{},{},{}\n".format(title,href,score,num,author))
            for i in author:
                # Keep only the text before the first '/' of the author line.
                b=str(i)
                c=b.split('/')[0]
                douban.write(c)
# Scrape the one-line quote of every book on the ten Top 250 pages and append
# it to douban_dunanpin.txt.
# NOTE(review): the original indentation was lost; the write is placed inside
# the per-book loop so that every book's quote is recorded -- confirm.
header = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
# The context manager closes (and flushes) the file even if a request fails
# part-way through. 'a+' appends, so re-running adds to the existing content.
with open(r'C:\Users\David\Desktop\douban_dunanpin.txt','a+',encoding='utf-8') as douban_dunanpin:
    for a in range(10):
        # The list is paginated through the `start` offset, 25 per request.
        url = 'https://book.douban.com/top250?start={}'.format(a*25)
        # A timeout keeps a stalled connection from hanging the script forever.
        data = requests.get(url,headers=header,timeout=10).text
        s=etree.HTML(data)
        # Each matched <table> is iterated as one book entry.
        file=s.xpath('//*[@id="content"]/div/div[1]/div/table')
        for div in file:
            scrible=div.xpath("./tr/td[2]/p[2]/span/text()")
            # The XPath result is a list; its string form (brackets and quotes
            # included, '[]' when there is no quote) is what gets written.
            douban_dunanpin.write(str(scrible))
#导入词云想关的包
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
# Read the author text file back in.
with open(r'C:\Users\David\Desktop\douban.txt','r',encoding='utf-8') as f:
    authors=f.readlines()
print(authors)
# Join the list of lines into a single string.
a=' '.join(authors)
# Segment the text into words with jieba.
douban_cut=jieba.cut(a)
# Space-separated tokens, the form handed to WordCloud below.
douban_text=" ".join(douban_cut)
# Draw the author word cloud.
# Build a 300x300 mask that is 255 outside the circle of radius 130 centred at
# (150, 150) and 0 inside it.
# NOTE(review): WordCloud is expected to draw only where the mask is not white,
# giving a round cloud -- confirm against the wordcloud documentation.
x,y = np.ogrid[:300,:300]
mask = (x-150) ** 2 + (y-150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)
# font_path points at a font file under C:/Windows/Fonts; presumably chosen so
# Chinese glyphs render -- verify the font exists on the target machine.
wc=WordCloud(font_path="C:/Windows/Fonts/simfang.ttf",background_color="white",repeat=True,mask=mask)
wordcloud=wc.generate(douban_text)
wordcloud.to_file(r'C:\Users\David\Desktop\豆瓣.png')
#评分和评价信息
# 导入相关的包
import pandas as pd
# Collect one [title, score, rating count] row per book across the ten
# Top 250 pages, then load the rows into a DataFrame.
# NOTE(review): the original indentation was lost; the append is placed inside
# the per-book loop and the DataFrame is built once after all pages -- confirm.
sample=[]
header = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
for a in range(10):
    # The list is paginated through the `start` offset, 25 per request.
    url = 'https://book.douban.com/top250?start={}'.format(a*25)
    # A timeout keeps a stalled connection from hanging the script forever.
    data = requests.get(url,headers=header,timeout=10).text
    s=etree.HTML(data)
    # Each matched <table> is iterated as one book entry.
    file=s.xpath('//*[@id="content"]/div/div[1]/div/table')
    for div in file:
        title =div.xpath("./tr/td[2]/div[1]/a/@title")[0]
        score=div.xpath("./tr/td[2]/div[2]/span[2]/text()")[0]
        # Peel the surrounding parentheses and whitespace off the count text.
        num=div.xpath("./tr/td[2]/div[2]/span[3]/text()")[0].strip("(").strip(")").strip()
        # num[:-3] drops the last three characters -- presumably a fixed
        # three-character label after the digits; verify against the live page.
        # All three values are text taken from the page, not numbers.
        sample.append([title,score,num[:-3]])
df = pd.DataFrame(sample,columns=['title','score','num'])
import matplotlib.pyplot as plt
import numpy as np
# Build the data: titles and scores of the first ten rows of df.
x_data = list(df['title'][:10])
# The scores were scraped as text; convert them to numbers so the bars are
# drawn on a numeric axis instead of being treated as string categories.
y_data = [float(v) for v in df['score'][:10]]
# Plot.
# SimHei is set as the sans-serif font, presumably so the Chinese labels render.
plt.rcParams['font.sans-serif']=['SimHei']
# Start a fresh figure so this chart does not share axes with any other plot.
plt.figure()
# The label gives plt.legend() below an entry to show.
plt.bar(x=x_data, height=y_data, color='steelblue', alpha=0.8, label="评分")
plt.xticks(rotation=45)
# Set the title.
plt.title("排名前十的图书得分")
# Name the two axes.
plt.xlabel("书名")
plt.ylabel("评分")
# Show the legend.
plt.legend()
# Render the figure; without this a plain script run displays nothing.
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Build the data: titles and rating counts of the first ten rows of df.
x_data = list(df['title'][:10])
# The counts were scraped as text; convert them to integers so the bars are
# drawn on a numeric axis instead of being treated as string categories.
y_data = [int(v) for v in df['num'][:10]]
# Plot.
# SimHei is set as the sans-serif font, presumably so the Chinese labels render.
plt.rcParams['font.sans-serif']=['SimHei']
# Start a fresh figure so this chart is not drawn over the score chart.
plt.figure()
# The label gives plt.legend() below an entry to show.
plt.bar(x=x_data, height=y_data, color='steelblue', alpha=0.8, label="评论数目")
plt.xticks(rotation=45)
# Set the title.
plt.title("排名前十的图书评论数")
# Name the two axes.
plt.xlabel("书名")
plt.ylabel("评论数目")
# Show the legend.
plt.legend()
# Render the figure; without this a plain script run displays nothing.
plt.show()
# Scrape the one-line quote of every book on the ten Top 250 pages and append
# it to douban_dunanpin.txt.
# NOTE(review): this step repeats the quote scrape performed earlier in the
# file; because the file is opened in append mode, each quote ends up in it
# twice per run. Consider keeping only one of the two copies.
# NOTE(review): the original indentation was lost; the write is placed inside
# the per-book loop so that every book's quote is recorded -- confirm.
header = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
# The context manager closes (and flushes) the file before the word-cloud step
# reads it back, even if a request fails part-way through.
with open(r'C:\Users\David\Desktop\douban_dunanpin.txt','a+',encoding='utf-8') as douban_dunanpin:
    for a in range(10):
        # The list is paginated through the `start` offset, 25 per request.
        url = 'https://book.douban.com/top250?start={}'.format(a*25)
        # A timeout keeps a stalled connection from hanging the script forever.
        data = requests.get(url,headers=header,timeout=10).text
        s=etree.HTML(data)
        # Each matched <table> is iterated as one book entry.
        file=s.xpath('//*[@id="content"]/div/div[1]/div/table')
        for div in file:
            scrible=div.xpath("./tr/td[2]/p[2]/span/text()")
            # The XPath result is a list; its string form (brackets and quotes
            # included, '[]' when there is no quote) is what gets written.
            douban_dunanpin.write(str(scrible))
#导入词云想关的包
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
# Read the quote text file back in.
with open(r'C:\Users\David\Desktop\douban_dunanpin.txt','r',encoding='utf-8') as f:
    duan=f.readlines()
print(duan)
# Join the list of lines into a single string.
a=' '.join(duan)
# Segment the text into words with jieba.
duan_cut=jieba.cut(a)
# Space-separated tokens, the form handed to WordCloud below.
duan_text=" ".join(duan_cut)
# Draw the quote word cloud.
# Build a 300x300 mask that is 255 outside the circle of radius 130 centred at
# (150, 150) and 0 inside it.
# NOTE(review): WordCloud is expected to draw only where the mask is not white,
# giving a round cloud -- confirm against the wordcloud documentation.
x,y = np.ogrid[:300,:300]
mask = (x-150) ** 2 + (y-150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)
# font_path points at a font file under C:/Windows/Fonts; presumably chosen so
# Chinese glyphs render -- verify the font exists on the target machine.
wc=WordCloud(font_path="C:/Windows/Fonts/simfang.ttf",background_color="white",repeat=True,mask=mask)
wordcloud=wc.generate(duan_text)
wordcloud.to_file(r'C:\Users\David\Desktop\豆瓣duan.png')
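# Source article: "python豆瓣图书top250+词云" (Python: Douban Books Top 250 + word cloud)
# Reposted from blog.csdn.net/weixin_44322234/article/details/111828329
# (The blog page's navigation headings "You may also like", "Today's picks"
# and "Weekly ranking" followed here.)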