pynlpir + pandas text analysis

import package:

import pynlpir
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import multiprocessing,threading,time

Read in the source text and stop-word files, and create a DataFrame to hold the initial word-segmentation data

# Load the stop-word list (one word per line); the with-statement closes the
# handle even if reading fails.
with open(r"C:\Users\lenovo\Desktop\Mr. Xiao's reptile project\stopwords.txt", "r") as f_1:
    stopwords = f_1.read().splitlines()
# The data file is deliberately left open: the threaded loop below consumes it
# line by line and closes it afterwards.
# NOTE(review): no encoding is specified, so the platform default applies —
# confirm both files match it (likely GBK on a Chinese-locale Windows).
f = open(r"C:\Users\lenovo\Desktop\Mr. Xiao's reptile project\data_3.txt", "r")
# Accumulator for every segmented (word, part-of-speech) row across all lines.
pd_root = pd.DataFrame(columns=['vocabulary', 'part of speech'])

Some parameters:

time_start = time.time() # wall-clock start, for timing the whole run
pynlpir.open() # initialize the NLPIR/ICTCLAS segmenter; must precede any segment() call
font = FontProperties(fname=r'c:\windows\fonts\simhei.ttf', size=13) # SimHei font so Chinese labels render in plots
Filter stop word function:
def stopword_delete(df, stopword_list=None):
    """Return *df* with stop-word rows removed.

    df: DataFrame with a 'vocabulary' column (and any other columns).
    stopword_list: words to drop; defaults to the module-level ``stopwords``.

    The original accessed ``df.词汇``, which does not exist — the frames are
    built with the English column name 'vocabulary' — and dropped rows one at
    a time (O(n^2)). A boolean mask does the same filtering in one pass and,
    like ``drop``, preserves the surviving rows' original index labels.
    """
    if stopword_list is None:
        stopword_list = stopwords  # fall back to the global stop-word list
    return df[~df['vocabulary'].isin(stopword_list)]

Because the file contains a large amount of text, reading, segmenting, and filtering it in a single pass would be slow, so multiple threads read and process it line by line.

One-line handler function:

# Guards the read-modify-write of the shared pd_root below: concat-and-rebind
# is not atomic, so concurrent workers could silently drop each other's rows.
_pd_root_lock = threading.Lock()

def line_deal(line):
    """Segment one line of text, filter stop words, and append the rows to
    the global ``pd_root`` accumulator. Intended as a thread target.
    """
    global pd_root
    line = line.replace(" ", "")  # NLPIR treats spaces as token breaks; strip them first
    segment = pynlpir.segment(line, pos_names='parent', pos_english=False)  # single-line segmentation
    pd_line = pd.DataFrame(segment, columns=['vocabulary', 'part of speech'])
    pd_line = stopword_delete(pd_line)  # filter stop words
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    with _pd_root_lock:
        pd_root = pd.concat([pd_root, pd_line], ignore_index=True)

Read using multithreading:

threads_list = []  # currently-tracked worker threads
thread_max = 30    # soft cap on concurrent workers
n = 0              # number of lines dispatched so far
for line in f:
    worker = threading.Thread(target=line_deal, args=(line,))
    threads_list.append(worker)
    worker.start()
    n = n + 1
    print(len(threads_list), n)  # current thread count and lines read
    # The original removed items from threads_list while iterating it, which
    # skips elements after each removal; rebuilding the list is correct.
    threads_list = [t for t in threads_list if t.is_alive()]
    if len(threads_list) >= thread_max:
        time.sleep(0.1)  # back off until some workers finish
# Wait for the stragglers so pd_root is complete before it is used below;
# the original never joined, so later stages could see partial data.
for t in threads_list:
    t.join()
f.close()  # close the data file once every line has been dispatched

Print the data after initial tokenization:

# Preview the first ten segmented (word, POS) rows.
print(pd_root[:10])

Create a vocabulary-frequency library:

# Word -> frequency table, in descending frequency order.
# The original indexed pd_root['词汇'] (the columns here are English, so that
# raises KeyError), and its rename({'word': ...}) matched no column — one call
# even discarded its result by omitting inplace. Naming the value_counts
# column explicitly removes all of that.
pd_word_num = pd_root['vocabulary'].value_counts().to_frame(name='frequency')
# Share of each word among all tokens.
pd_word_num['percent'] = pd_word_num['frequency'] / pd_word_num['frequency'].sum()
print(pd_word_num.head(10))

 

Create part-of-speech-frequency library:

# Part-of-speech -> frequency table, in descending frequency order.
# The original indexed pd_root['词性'] (columns here are English → KeyError)
# and renamed a column key that value_counts never produces; naming the
# column explicitly fixes both.
pd_qua_num = pd_root['part of speech'].value_counts().to_frame(name='frequency')
# Share of each part of speech among all tokens.
pd_qua_num['percentage'] = pd_qua_num['frequency'] / pd_qua_num['frequency'].sum()
print(pd_qua_num.head(10))

Count the lexical distribution of several important parts of speech:

# Per-POS top-word tables: for each of six parts of speech, one column of
# words and one column of their counts, both in descending count order.
columns_selected = ['verb', 'verb count', 'noun', 'noun count', 'pronoun', 'pronoun count',
                    'timeword', 'timeword count', 'adverb', 'adverb count', 'adjective', 'adjective count']
pd_Top6 = pd.DataFrame(columns=columns_selected)
for i in range(0, 12, 2):
    pos = columns_selected[i]
    # NOTE(review): segmentation ran with pos_english=False, so the labels in
    # 'part of speech' are presumably Chinese ('动词', ...), not these English
    # names — verify which labels actually occur before trusting this filter.
    # The original mixed Chinese keys ('词性'/'词汇') on one line and English
    # on the next; only the English names exist in this file's frames.
    counts = pd_root.loc[pd_root['part of speech'] == pos, 'vocabulary'].value_counts()
    pd_Top6[pos] = pd.Series(counts.index)            # the words themselves
    pd_Top6[columns_selected[i + 1]] = pd.Series(counts.values)  # their counts
print(pd_Top6.head(10))

Extract keywords from text:

# Extract weighted keywords from the whole document. The original passed the
# builtin `str` (the type object) instead of the document text, so NLPIR never
# saw the corpus; re-read the data file and pass its contents.
with open(r"C:\Users\lenovo\Desktop\Mr. Xiao's reptile project\data_3.txt", "r") as f_text:
    text = f_text.read()
key_words = pynlpir.get_key_words(text, weighted=True)
print(key_words)

Drawing:

 

def paint(df, x, y, title):
    """Draw a horizontal bar chart of the 10 most frequent rows of *df*.

    df: frequency table with a 'frequency' column, already sorted descending.
    x, y: axis labels. title: chart title.
    Uses the module-level ``font`` so Chinese labels render correctly.
    """
    plt.subplots(figsize=(7, 5))
    plt.yticks(fontproperties=font, size=10)
    plt.xlabel(x, fontproperties=font, size=10)
    plt.ylabel(y, fontproperties=font, size=10)
    plt.title(title, fontproperties=font)
    # The original indexed df['Frequency'] (capital F), which raises KeyError:
    # both frequency tables name the column lowercase 'frequency'.
    df.iloc[:10]['frequency'].plot(kind='barh')
    plt.show()

# Draw the two distribution charts.
for frame, xlabel, ylabel, heading in (
        (pd_word_num, "frequency", "word", "word distribution"),
        (pd_qua_num, "frequency", "part of speech", "part of speech distribution")):
    paint(frame, xlabel, ylabel, heading)

 

# 2x3 grid: a top-10 word bar chart for each of the six parts of speech.
fig = plt.figure(figsize=(10, 5))
fig.subplots_adjust(hspace=0.3, wspace=0.2)
for i in range(1, 7):
    # Columns (2i-2, 2i-1) of pd_Top6 are a word column and its count column.
    pd_qua = pd_Top6.iloc[:, [2 * i - 2, 2 * i - 1]]
    # The original labeled the count column '频数' (Chinese) in an otherwise
    # English-named file; use a consistent English label.
    pd_qua.columns = [pd_qua.columns[0], 'count']
    pd_qua = pd_qua.set_index(pd_qua.columns[0])
    print(pd_qua)
    ax = fig.add_subplot(2, 3, i)
    pd_qua.head(10)['count'].plot(kind='bar')
    ax.set_xticklabels(pd_qua.head(10).index, fontproperties=font, size=10, rotation=30)
    ax.set_title(pd_qua.index.name, fontproperties=font)  # subplot titled by the POS name
fig.tight_layout()
# fig.show() only works with an interactive backend and warns otherwise;
# plt.show() is the portable way to display the figure.
plt.show()

  

 

Related articles

Origin http://43.154.161.224:23101/article/api/json?id=325073752&siteId=291194637