一、爬取新闻
爬取的网站见代码部分url_list列表,共有三个网站。
import requests
from lxml import etree
# Request headers: a mobile Chrome user-agent string so sohu.com serves the
# normal article page instead of rejecting the scripted request.
headers = {
    "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Mobile Safari/537.36",
}
# The three Sohu news article URLs to crawl.
url_list = ["https://www.sohu.com/a/480408395_260616?scm=0.0.0.0&spm=smpc.subject.column-1.1.1627626619935JQrji3x",
            "https://www.sohu.com/a/479875718_162758?scm=0.0.0.0&spm=smpc.subject.column-3.7.1627628959142K9LwxVn",
            "https://www.sohu.com/a/480084797_260616?spm=smpc.subject.Slider-1.1.1627626596286niwehku"]
def parse_url(url, headers):
    """Download one news article page and append all paragraph text to new.txt.

    Parameters
    ----------
    url : str
        Article URL to fetch.
    headers : dict
        HTTP headers (user-agent) sent with the request.

    Side effects: prints each extracted text fragment and appends it,
    one fragment per line, to ``new.txt`` in the working directory.
    """
    # Fetch the page and decode the raw bytes explicitly as UTF-8.
    resp = requests.get(url, headers=headers).content.decode("utf-8")
    html_element = etree.HTML(resp)  # build the lxml element tree
    ps = html_element.xpath('//p')   # article body lives in <p> tags
    # `with` guarantees the file is closed even if a write fails
    # (the original opened/closed manually and leaked the handle on error).
    with open("new.txt", "a+", encoding="utf-8") as f:
        for p in ps:
            # .//text() returns [] for an empty paragraph, so the inner loop
            # simply does nothing — the original bare `except: continue`
            # could never fire usefully and would only hide real I/O errors.
            for fragment in p.xpath(".//text()"):
                print(fragment)
                f.write(fragment + "\n")
if __name__ == '__main__':
    # Crawl each of the three article pages in turn, appending to new.txt.
    for article_url in url_list:
        parse_url(article_url, headers)
二、词频统计
import jieba
import nltk
import pandas as pd

# Load the crawled article text: one text fragment per row, column "txt".
raw = pd.read_table("new.txt", names=["txt"])

# Boilerplate lines to discard (source-attribution footers copied verbatim
# from the crawled pages — byte-exact match required).
junk_lines = ['(本文来自澎湃新闻,更多原创资讯请下载“澎湃新闻”APP) ',
              '源自:魅力湘西官网']

# Flag column used for grouping: 1 = keep, 0 = boilerplate to drop.
# BUG FIX: the original re-ran `raw["temp"] = 1` AFTER the filtering loop,
# resetting every flag to 1 and silently undoing the filter. The flags are
# now computed once, vectorized, and never overwritten.
raw["temp"] = (~raw["txt"].isin(junk_lines)).astype(int)

# Group by the flag and concatenate all strings within each group;
# group 1 is the cleaned article text as one long string.
rawgrp = raw.groupby("temp").agg("sum")

# Segment the Chinese text into words.
word_list = jieba.lcut(rawgrp.txt[1])

# Read the common stop-word list (one word per line; sep='aaa' is a token
# that never occurs, forcing one whole line per row).
sw = list((pd.read_table("停用词.txt", names=["w"], sep='aaa', encoding="utf-8")).w)

# Punctuation / filler tokens plus the stop words; a set makes the
# membership test in the comprehension O(1) instead of O(n) per word.
drop_words = set([" ", ",", "。", "、", "”", "!", ":", ";", "?", "…", "“",
                  '《', '》', '日', "月", "APP", "例", "(", ")", "病例"] + sw)
newlist = [w for w in word_list if w not in drop_words]

# NLTK frequency distribution over the cleaned word list.
fdist = nltk.FreqDist(newlist)
三、词云图
from matplotlib import pyplot as plt
import wordcloud

# SimSun font file so Chinese glyphs render instead of empty boxes.
font_file = "simsun.ttc"

# Build the cloud directly from the NLTK frequency distribution (dict-like).
# NOTE(review): background_color is normally a color name or None; passing
# the int 100 relies on the underlying image library accepting it — confirm.
cloud_img = wordcloud.WordCloud(
    font_path=font_file,
    width=700,
    height=350,
    mode="RGBA",
    background_color=100,
).fit_words(fdist)

plt.imshow(cloud_img)
plt.axis("off")
# Save before show(): show() consumes the current figure.
plt.savefig("1.png")
plt.show()