Generating a word cloud with jieba segmentation

import sqlite3
import pandas as pd
import jieba

from pyecharts import WordCloud  # pyecharts 0.x API (see the note after the code)

class MyWordCloud:
    '''
    Custom jieba-based word-cloud builder
    '''
    def __init__(self, db_path, table_name, stop_word_file_path, field_name):
        self.sqlite3_db = sqlite3.connect(db_path)  # connect to sqlite3
        sql = 'select * from {}'.format(table_name)
        self.content_from_db = pd.read_sql(sql, self.sqlite3_db)  # load the data from sqlite3
        self.stop_words = []  # stop words, i.e. words to drop, e.g. 的, 得, 你, 我, 他
        self.stop_word_path = stop_word_file_path  # path to the stop-word file
        self.field_filter = field_name  # column to segment
        jieba.load_userdict('user_dic.txt')  # load the custom dictionary (returns None, so nothing to assign)

    def stop_word_file(self):
        '''
        Load the stop-word file, mainly for Chinese
        :return:
        '''
        if self.stop_word_path:
            with open(self.stop_word_path, encoding='utf-8') as f:
                # extend, not append: append would nest the whole list as a
                # single element and break the membership test in mk_word_list
                self.stop_words.extend(f.read().split('\n'))
        else:
            print('No stop-word file loaded; add one or set filter_more_bol=True')

    def mk_word_list(self, record_count=None, filter_more_bol=True):
        '''
        Build the list of words for the cloud
        :param record_count: number of sqlite3 records to segment; None means all of them
        :param filter_more_bol: also apply filter_more() for stricter filtering
        :return: list of words
        '''
        word_list = []
        if record_count is not None:
            content_need = self.content_from_db[self.field_filter].head(record_count)
        else:
            content_need = self.content_from_db[self.field_filter]
        for record_one in content_need:
            words = jieba.cut(record_one)  # generator of tokens
            if filter_more_bol:
                words = self.filter_more(words)
            for word in words:
                if word not in self.stop_words:
                    word_list.append(word)
        return word_list

    def filter_more(self, words):
        '''
        Keep only Chinese and English words and drop single characters
        :param words: token sequence produced by jieba.cut
        :return: filtered list of words
        '''
        word_list = []
        for word in words:
            # str.isalpha() is also True for CJK characters, so this keeps
            # Chinese and English words while dropping digits and punctuation
            if word.isalpha() and len(word) > 1:
                word_list.append(word)
        return word_list


    def words_cloud(self, words_list: list):
        '''
        Render the word-cloud file
        :param words_list: the words to plot
        :return:
        '''
        content = pd.Series(words_list).value_counts()
        words_show = content.index
        words_count = content.values
        wd = WordCloud(width=1300, height=620)
        wd.add('', words_show, words_count, word_size_range=(20, 100))
        wd.render('wordcloud.html')

    def start(self, record_count=10):
        '''
        Run the whole pipeline in one call; the steps can of course also be
        called one by one
        :param record_count: number of sqlite3 records to use, 10 by default
        :return:
        '''
        self.stop_word_file()
        word_list = self.mk_word_list(record_count=record_count)
        self.words_cloud(word_list)

if __name__ == '__main__':
    wordcloud = MyWordCloud('recruit.db', 'recruit', 'stopword.txt', 'job_detail')
    wordcloud.start()
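
Note: the code above uses the pyecharts 0.x API. In pyecharts >= 1.0, WordCloud moved to pyecharts.charts and add() takes a list of (word, count) pairs instead of two parallel sequences; a minimal sketch of the equivalent rendering step, assuming pyecharts >= 1.0:

import pandas as pd
from pyecharts.charts import WordCloud  # pyecharts >= 1.0 import path

def words_cloud_v1(words_list):
    # count word frequencies and turn them into (word, count) pairs
    counts = pd.Series(words_list).value_counts()
    data_pair = [(word, int(n)) for word, n in counts.items()]
    wd = WordCloud()
    wd.add('', data_pair, word_size_range=[20, 100])
    wd.render('wordcloud.html')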

The three files required:

recruit.db: the sqlite3 database file that provides the data
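
A minimal sketch for creating a test recruit.db, reusing the table and column names from the example above (the two inserted rows are made-up sample data):

import sqlite3

conn = sqlite3.connect('recruit.db')
conn.execute('CREATE TABLE IF NOT EXISTS recruit (job_detail TEXT)')
conn.executemany(
    'INSERT INTO recruit (job_detail) VALUES (?)',
    [('熟悉Python,负责招聘数据的抓取与分析',),   # made-up sample row
     ('了解pandas和sqlite3,能够生成词云报告',)],  # made-up sample row
)
conn.commit()
conn.close()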

stopword.txt: the stop-word file; write each character or word to be filtered out in this file, one per line
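
For example, using the sample stop words mentioned in the code comments, stopword.txt could contain:

的
得
你
我
他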

user_dic.txt: the custom dictionary, listing the words and phrases that the segmenter should not split apart
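
jieba's user-dictionary format is one entry per line: the word, optionally followed by a frequency and a part-of-speech tag, separated by spaces. The entries below are made-up examples:

数据分析
机器学习 5 n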

Reposted from blog.csdn.net/ryuhfxz/article/details/86380500