爬虫与词云结合的例子

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/lshdp/article/details/83864368
import jieba.analyse
from wordcloud import WordCloud, ImageColorGenerator,STOPWORDS
from imageio import imread
import matplotlib.pyplot as plt
from urllib import request
import time
from lxml import etree


class wc():
    """Build a word cloud from a text file, shaped by a mask image.

    The constructor reads the text, extracts keywords with jieba and renders
    the cloud immediately. ``show_wc`` displays and saves the rendered cloud,
    and ``get_url_content`` downloads a novel chapter into ``sanwen.txt``.
    """

    def __init__(self, txt_file, img_file, front):
        """Read the text, extract its keywords and generate the word cloud.

        Args:
            txt_file: Path of the UTF-8 text file to analyse.
            img_file: Path of the image used as the mask of the cloud.
            front: Accepted for backward compatibility but not used; the
                font is fixed to ``./fonts/simhei.ttf`` below.
        """
        # The context manager closes the file even if reading fails.
        with open(txt_file, 'r', encoding='utf-8') as handle:
            # The (closed) handle stays reachable as ``self.f`` so that any
            # code relying on the attribute keeps working.
            self.f = handle
            self.txt = handle.read()
        # Keep the 100 highest-ranked keywords of the text.
        self.tags = jieba.analyse.extract_tags(self.txt, topK=100)
        self.img = imread(img_file)

        # font_path: path of the font file. The font bundled with wordcloud
        #   does not support Chinese, so one must be given explicitly,
        #   otherwise the image only shows empty boxes.
        # background_color: black by default, set to white here.
        # max_words: maximum number of words displayed.
        # mask: image that shapes the cloud.
        # max_font_size: largest font size used.
        self.text = ' '.join(self.tags)
        self.wc = WordCloud(
            font_path='./fonts/simhei.ttf',
            background_color='white',
            max_words=100,
            mask=self.img,
            max_font_size=80,
        ).generate(self.text)

    def get_url_content(self):
        """Download one chapter from qidian.com and save it to ``sanwen.txt``.

        The text nodes selected by the XPath are stripped and written back to
        back without any separator. Instance state is not touched, so a cloud
        that was already generated in ``__init__`` is not refreshed.

        Raises:
            urllib.error.URLError: If the request fails or times out.
        """
        url = 'https://read.qidian.com/chapter/_AaqI-dPJJ4uTkiRw_sFYA2/eSlFKP1Chzg1'
        headers = {
            'Referer': 'https://read.qidian.com/chapter/_AaqI-dPJJ4uTkiRw_sFYA2/eSlFKP1Chzg1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        # Build the request.
        req = request.Request(url=url, headers=headers)
        # Send it; the context manager closes the connection and the timeout
        # keeps a stalled server from blocking the script forever.
        with request.urlopen(req, timeout=30) as response:
            content = response.read()
        xpath_content = etree.HTML(content)
        # Text of every paragraph inside the chapter container.
        new_content = xpath_content.xpath('//*[@id="chapter-339991957"]/div/div/p/text()')

        with open('sanwen.txt', 'w', encoding='utf-8') as f:
            # One write of the joined text; produces the same file as writing
            # each stripped paragraph in turn.
            f.write(''.join(paragraph.strip() for paragraph in new_content))
        # Pause after the request (kept from the original implementation).
        time.sleep(2)

    def show_wc(self):
        """Display the word cloud with matplotlib, then save it to ``result.png``."""
        plt.imshow(self.wc)
        # plt.imshow(self.wc.recolor(color_func=img_color)) could be used
        # instead to make the word colours follow the image colours
        # (img_color is not defined in this file; presumably an
        # ImageColorGenerator built from the mask image).
        plt.axis("off")
        plt.show()
        self.wc.to_file('result.png')


if __name__ == '__main__':
    # Crawl first: the constructor reads 'sanwen.txt' and renders the cloud
    # right away, so the file has to exist (and be fresh) before the object
    # is created. get_url_content() does not use instance state, which is
    # why it can be called through the class with None as ``self``.
    wc.get_url_content(None)  # crawl the page and write its text to the TXT file
    mywc = wc('sanwen.txt', 'timg.png', 'AGENCYR.TTF')
    mywc.show_wc()  # word cloud filling the outline of our image

猜你喜欢

转载自blog.csdn.net/lshdp/article/details/83864368