Import packages:
import multiprocessing
import threading
import time

import matplotlib.pyplot as plt
import pandas as pd
import pynlpir
from matplotlib.font_manager import FontProperties
Read in the initial text and the stop-word file, and create a DataFrame to hold the initial word-segmentation data:
# Load the stop-word list and open the corpus; create the DataFrame that
# will accumulate the (word, POS) rows produced by segmentation.
with open(r"C:\Users\lenovo\Desktop\Mr. Xiao's reptile project\stopwords.txt", "r") as f_1:
    stopwords = f_1.read().splitlines()
# Kept open deliberately: the threading loop below iterates it line by line
# and closes it afterwards.
f = open(r"C:\Users\lenovo\Desktop\Mr. Xiao's reptile project\data_3.txt", "r")
# The rest of the script accesses these columns as '词汇' (word) and '词性'
# (part of speech) — e.g. df.词汇 in stopword_delete — so the frame must be
# created with the Chinese names; the English names were a translation bug.
pd_root = pd.DataFrame(columns=['词汇', '词性'])
Some parameters:
# Run-wide setup: timing origin, segmenter session, and a Chinese-capable
# font (SimHei) for the matplotlib labels drawn later.
time_start = time.time()  # wall-clock start of the whole pipeline
font = FontProperties(fname=r'c:\windows\fonts\simhei.ttf', size=13)
pynlpir.open()  # initialise the NLPIR segmenter before any segment() call
Stop-word filtering function:
def stopword_delete(df, stop_list=None):
    """Remove rows whose '词汇' (word) value is a stop word.

    Args:
        df: DataFrame with a '词汇' column.
        stop_list: optional iterable of stop words; defaults to the
            module-level ``stopwords`` loaded at startup (backward
            compatible with the original one-argument call).

    Returns:
        The filtered DataFrame. Like the original drop-in-place version,
        the surviving rows keep their original index labels.
    """
    words = stopwords if stop_list is None else stop_list
    # Vectorised filter: the original dropped matching rows one at a time
    # inside a Python loop (each drop is O(n), O(n^2) overall).
    return df[~df['词汇'].isin(words)]
Because the file contains a large amount of text, reading, segmenting, and filtering it in one pass would be slow, so multithreading is used to read and process it line by line.
One-line handler function:
# Guards concurrent mutation of the shared pd_root frame: line_deal runs on
# many threads at once, and unsynchronised read-modify-write of a global
# loses updates.
pd_root_lock = threading.Lock()

def line_deal(line):
    """Segment one line of text, drop stop words, and append to pd_root.

    Args:
        line: one raw line of the corpus.

    Side effects:
        Rebinds the global ``pd_root`` with the new rows appended.
    """
    global pd_root
    line = line.replace(" ", "")
    # Chinese parent-level POS names (pos_english=False).
    segment = pynlpir.segment(line, pos_names='parent', pos_english=False)
    # Column names must match pd_root and stopword_delete ('词汇'/'词性');
    # the mixed English names here were a translation bug that made
    # df.词汇 raise AttributeError.
    pd_line = pd.DataFrame(segment, columns=['词汇', '词性'])
    pd_line = stopword_delete(pd_line)
    with pd_root_lock:
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported replacement.
        pd_root = pd.concat([pd_root, pd_line], ignore_index=True)
Read using multithreading:
# Dispatch one worker thread per corpus line, capping concurrency.
threads_list = []  # currently-live worker threads
thread_max = 30    # soft cap on concurrent threads
n = 0              # number of lines dispatched so far
for line in f:
    worker = threading.Thread(target=line_deal, args=(line,))
    threads_list.append(worker)
    worker.start()
    n = n + 1
    print(len(threads_list), n)  # current thread count / lines read
    # Reap finished threads. The original removed items from threads_list
    # while iterating over it, which silently skips elements; rebuilding
    # the list avoids that pitfall.
    threads_list = [t for t in threads_list if t.is_alive()]
    if len(threads_list) >= thread_max:
        time.sleep(0.1)  # back off until some workers finish
# Wait for the stragglers: without these joins, pd_root may still be
# incomplete when the statistics below are computed.
for t in threads_list:
    t.join()
f.close()  # close the corpus file once all lines are processed
Print the data after initial tokenization:
# Preview the first rows of the tokenised corpus.
print(pd_root.iloc[:10])
Create a vocabulary-frequency library:
# Word-frequency table: one row per distinct word, indexed by word.
pd_word_num = pd.DataFrame(pd_root['词汇'].value_counts())
# value_counts() names its output after the source column ('词汇' on old
# pandas, 'count' on pandas >= 2.0); the original rename({'word':
# 'frequency'}) matched neither, so the 'frequency' lookups below raised
# KeyError (and the first, non-inplace rename was a no-op anyway).
# Assigning the column name directly works on every pandas version.
pd_word_num.columns = ['frequency']
pd_word_num['percent'] = pd_word_num['frequency'] / pd_word_num['frequency'].sum()
print(pd_word_num.head(10))
Create part-of-speech-frequency library:
# Part-of-speech frequency table: one row per POS tag, indexed by tag.
pd_qua_num = pd.DataFrame(pd_root['词性'].value_counts())
# Same rename bug as the word table: the value_counts() column is never
# called 'part of speech', so the original rename was a no-op and the
# 'frequency' lookup raised KeyError. Set the name directly instead.
pd_qua_num.columns = ['frequency']
# Add percentage column: POS - frequency - percentage.
pd_qua_num['percentage'] = pd_qua_num['frequency'] / pd_qua_num['frequency'].sum()
print(pd_qua_num.head(10))
Count the lexical distribution of several important parts of speech:
# Per-POS vocabulary distributions for six important parts of speech.
# Columns come in (word, count) pairs, one pair per POS category.
# NOTE(review): segmentation uses pos_english=False, so pd_root['词性']
# holds *Chinese* POS names — the English values in the original
# ('verb', 'noun', ...) could never match anything. Restored to the
# Chinese NLPIR parent-class names; confirm against actual segment output.
columns_selected = ['动词', '动词计数', '名词', '名词计数', '代词', '代词计数',
                    '时间词', '时间词计数', '副词', '副词计数', '形容词', '形容词计数']
pd_Top6 = pd.DataFrame(columns=columns_selected)
for i in range(0, 12, 2):
    pos = columns_selected[i]
    counts = pd_root.loc[pd_root['词性'] == pos, '词汇'].value_counts()
    # Building plain Series from the index/values keeps this independent of
    # the pandas-version-specific column names that
    # value_counts().reset_index() produces ('index' vs 'count').
    pd_Top6[pos] = pd.Series(counts.index)
    pd_Top6[columns_selected[i + 1]] = pd.Series(counts.values)
print(pd_Top6.head(10))
Extract keywords from text:
# Keyword extraction needs the full document text. The original passed the
# *builtin* ``str`` type instead of a string (the corpus file had already
# been consumed line by line), so re-read the corpus here.
with open(r"C:\Users\lenovo\Desktop\Mr. Xiao's reptile project\data_3.txt", "r") as f_kw:
    full_text = f_kw.read()
key_words = pynlpir.get_key_words(full_text, weighted=True)  # [(word, weight), ...]
print(key_words)
Drawing:
def paint(df, x, y, title):
    """Draw a horizontal bar chart of the top-10 rows of df['frequency'].

    Args:
        df: frequency table indexed by label with a 'frequency' column.
        x: x-axis label text.
        y: y-axis label text.
        title: chart title.
    """
    plt.subplots(figsize=(7, 5))
    # `font` renders the Chinese tick labels / titles correctly.
    plt.yticks(fontproperties=font, size=10)
    plt.xlabel(x, fontproperties=font, size=10)
    plt.ylabel(y, fontproperties=font, size=10)
    plt.title(title, fontproperties=font)
    # Column is lowercase 'frequency' (as built in the frequency tables
    # above); the original's capitalised 'Frequency' raised KeyError.
    df.iloc[:10]['frequency'].plot(kind='barh')
    plt.show()

paint(pd_word_num, "frequency", "word", "word distribution")
paint(pd_qua_num, "frequency", "part of speech", "part of speech distribution")
# One 2x3 grid of bar charts: the top-10 words for each of the six POS
# categories stored as (word, count) column pairs in pd_Top6.
fig = plt.figure(figsize=(10, 5))
fig.subplots_adjust(hspace=0.3, wspace=0.2)
for idx in range(1, 7):
    word_col = 2 * idx - 2
    count_col = 2 * idx - 1
    pd_qua = pd_Top6.iloc[:, [word_col, count_col]]
    # Normalise the count column name, then index by the word column so the
    # bar labels are the words themselves.
    pd_qua.columns = [pd_qua.columns[0], '频数']
    pd_qua = pd_qua.set_index(pd_qua.columns[0])
    print(pd_qua)
    ax = fig.add_subplot(2, 3, idx)
    top10 = pd_qua.head(10)
    top10['频数'].plot(kind='bar')
    ax.set_xticklabels(top10.index, fontproperties=font, size=10, rotation=30)
    # The index name is the POS category, used as the subplot title.
    ax.set_title(pd_qua.index.name, fontproperties=font)
fig.tight_layout()
fig.show()