Real-world case: Weibo (micro-blog) sentiment analysis
Data: four text files, one per sentiment class, with one weibo post per line.
Classes: 0: joy; 1: anger; 2: disgust; 3: low/depressed
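Expected layout under ./dataset (file names as used in main.py below; that each file holds one post per line is an assumption matching how read_and_save_to_csv reads them):

dataset/
    0_simplifyweibo.txt    # 0: joy
    1_simplifyweibo.txt    # 1: anger
    2_simplifyweibo.txt    # 2: disgust
    3_simplifyweibo.txt    # 3: low/depressed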
Steps:
1. Read the text (the cleaning step is sketched just below)
2. Split into training and test sets
3. Extract features
4. Train the model and make predictions
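A minimal sketch of the cleaning step on a single hypothetical post (the sample string and its output are illustrative only; the exact result depends on jieba's dictionary and the stopword list):

from tools import proc_text
raw = '今天阳光真好,心情大好!!'    # hypothetical weibo post
print(proc_text(raw))              # e.g. '今天 阳光 真好 心情 大好'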
Code:
tools.py
import re
import math

import jieba.posseg as pseg
import numpy as np
import pandas as pd

# Load the Chinese stopword list (one word per line)
with open('./中文停用词库.txt', 'r', encoding='utf-8') as f:
    stopwords = [line.rstrip() for line in f]
def proc_text(raw_line):
    """
    Clean one line of raw text and return its segmentation result,
    with the remaining words joined by single spaces.
    """
    # Keep Chinese characters only
    filter_pattern = re.compile('[^\u4E00-\u9FD5]+')
    chinese_only = filter_pattern.sub('', raw_line)

    # Segment with jieba and drop stopwords
    words_lst = pseg.cut(chinese_only)
    meaningful_words = [word for word, flag in words_lst
                        if word not in stopwords]
    return ' '.join(meaningful_words)
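# Note: pseg.cut yields (word, flag) pairs, where flag is the
# part-of-speech tag; only the word itself is used here.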
def split_train_test(text_df, size=0.8):
    """
    Split the data into a training set and a test set, label by label.
    """
    train_dfs, test_dfs = [], []
    labels = [0, 1, 2, 3]
    for label in labels:
        # Take the rows of this label and split them size/(1-size)
        text_df_w_label = text_df[text_df['label'] == label]
        text_df_w_label = text_df_w_label.reset_index(drop=True)
        n_lines = text_df_w_label.shape[0]
        split_line_no = math.floor(n_lines * size)
        train_dfs.append(text_df_w_label.iloc[:split_line_no, :])
        test_dfs.append(text_df_w_label.iloc[split_line_no:, :])
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    train_text_df = pd.concat(train_dfs).reset_index(drop=True)
    test_text_df = pd.concat(test_dfs).reset_index(drop=True)
    return train_text_df, test_text_df
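# Note: because the split is done label by label, both sets keep the original
# class proportions; sklearn's train_test_split(..., stratify=text_df['label'])
# would be an equivalent off-the-shelf alternative.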
def get_word_list_from_data(text_df):
    """
    Collect every word in the data set into a single list.
    """
    word_list = []
    for _, r_data in text_df.iterrows():
        word_list += r_data['text'].split(' ')
    return word_list
def extract_feat_from_data(text_df, text_collection, common_words_freqs):
    """
    Feature extraction: one TF-IDF value per common word and per sample.
    """
    n_sample = text_df.shape[0]
    n_feat = len(common_words_freqs)
    common_words = [word for word, _ in common_words_freqs]

    X = np.zeros([n_sample, n_feat])
    y = np.zeros(n_sample)

    print('Extracting features...')
    for i, r_data in text_df.iterrows():
        if (i + 1) % 5000 == 0:
            print('Features extracted for {} samples'.format(i + 1))
        text = r_data['text']
        feat_vec = []
        for word in common_words:
            if word in text:
                tf_idf_val = text_collection.tf_idf(word, text)
            else:
                tf_idf_val = 0
            feat_vec.append(tf_idf_val)
        X[i, :] = np.array(feat_vec)
        y[i] = int(r_data['label'])
    return X, y
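# Note: `word in text` is a substring test on the space-joined string, so a
# short common word can also match inside a longer word; testing against
# text.split(' ') would be a stricter token-level check.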
def cal_acc(true_labels, pred_labels):
    """
    Compute classification accuracy.
    """
    n_total = len(true_labels)
    correct_list = [true_labels[i] == pred_labels[i] for i in range(n_total)]
    return sum(correct_list) / n_total
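A quick, hypothetical sanity check for cal_acc (not part of the original files):

from tools import cal_acc
print(cal_acc([0, 1, 2, 3], [0, 1, 2, 0]))  # 0.75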
main.py
import os

import nltk
import pandas as pd
from nltk.text import TextCollection
from sklearn.naive_bayes import GaussianNB

from tools import proc_text, split_train_test, get_word_list_from_data, \
    extract_feat_from_data, cal_acc

dataset_path = './dataset'
text_filenames = ['0_simplifyweibo.txt', '1_simplifyweibo.txt',
                  '2_simplifyweibo.txt', '3_simplifyweibo.txt']

# Raw labeled text, saved as csv
output_text_filename = 'raw_weibo_text.csv'
# Cleaned text, saved as csv
output_cln_text_filename = 'clean_weibo_text.csv'

# Set to False after the first run to skip re-cleaning the raw data
is_first_run = True
def read_and_save_to_csv():
    """
    Read the raw text files and save the labels together with
    the text as a csv.
    """
    text_w_label_df_lst = []
    for text_filename in text_filenames:
        text_file = os.path.join(dataset_path, text_filename)
        # The leading digit of the file name is the class label
        label = int(text_filename[0])
        with open(text_file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()
        labels = [label] * len(lines)
        text_series = pd.Series(lines)
        label_series = pd.Series(labels)
        text_w_label_df = pd.concat([label_series, text_series], axis=1)
        text_w_label_df_lst.append(text_w_label_df)
    result_df = pd.concat(text_w_label_df_lst, axis=0)
    result_df.columns = ['label', 'text']
    result_df.to_csv(os.path.join(dataset_path, output_text_filename),
                     index=False, encoding='utf-8')
def run_main():
    """
    Main entry point.
    """
    # 1. (First run only) clean the raw data and cache it as csv
    if is_first_run:
        print('Cleaning the raw text data...', end=' ')
        read_and_save_to_csv()
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')
        text_df['text'] = text_df['text'].apply(proc_text)
        # Drop rows whose text became empty after cleaning
        text_df = text_df[text_df['text'] != '']
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=False, encoding='utf-8')
        print('done; results saved.')

    # 2. Load the cleaned data and split it into training and test sets
    print('Loading the cleaned text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    train_text_df, test_text_df = split_train_test(clean_text_df)
    print('Samples per class in the training set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Build TF-IDF features over the most frequent words
    n_common_words = 200
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdist = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdist.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} times'.format(word, count))
    print()

    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features from the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
    print('done')
    print()
    print('Extracting features from the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
    print('done')

    # 4. Train a Gaussian Naive Bayes classifier and evaluate it
    print('Training the model...', end=' ')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('done')
    print()
    print('Testing the model...', end=' ')
    test_pred = gnb.predict(test_X)
    print('done')
    print('Accuracy:', cal_acc(test_y, test_pred))
if __name__ == '__main__':
    run_main()
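To label a brand-new post, the same cleaning and TF-IDF steps have to be applied to it before calling predict. A minimal, hypothetical sketch (predict_one is not part of the original script, and run_main keeps gnb, text_collection, and common_words_freqs as locals, so they would first need to be returned or made accessible):

import numpy as np
from tools import proc_text

def predict_one(raw_line, gnb, text_collection, common_words_freqs):
    # Clean and segment the post, then build the same TF-IDF vector
    # over the common words as extract_feat_from_data does
    common_words = [word for word, _ in common_words_freqs]
    text = proc_text(raw_line)
    feat_vec = [text_collection.tf_idf(word, text) if word in text else 0
                for word in common_words]
    return gnb.predict(np.array(feat_vec).reshape(1, -1))[0]

# e.g. predict_one('真是气死我了', gnb, text_collection, common_words_freqs)
# might return 1.0 (anger), depending on the trained model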