A simple example of Chinese character frequency statistics in Python

A simple count of which characters appear most frequently in a novel:

import codecs
from collections import Counter

import matplotlib.pyplot as plt
from pylab import mpl
# Use a font that contains CJK glyphs as the default sans-serif font, so the
# Chinese characters used as tick labels are drawn instead of empty boxes.
# rcParams keys and font names must match exactly (no padding spaces).
mpl.rcParams['font.sans-serif'] = ['FangSong']
# With a CJK font selected, the Unicode minus sign may be missing from the
# font and render as a box; fall back to the ASCII hyphen-minus instead.
mpl.rcParams['axes.unicode_minus'] = False
 
def count_characters(path, encoding='utf-8'):
    """Count how often each character occurs in a text file.

    Leading and trailing whitespace of every line (including the line
    terminator) is ignored; whitespace inside a line is counted like any
    other character. Blank lines contribute nothing.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8;
            pass e.g. 'gbk' if the novel is stored in a legacy encoding.

    Returns:
        A ``collections.Counter`` mapping each character to its number of
        occurrences, with keys in first-seen order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    counts = Counter()
    # Decode explicitly: without an encoding the file would be read as raw
    # bytes and multi-byte Chinese characters would be split apart.
    with open(path, encoding=encoding) as fr:
        for line in fr:
            # Counter.update on a string tallies each character, starting
            # from 1 on first sight (not 0).
            counts.update(line.strip())
    return counts


def main(path='data.txt', top_n=50):
    """Print and plot the ``top_n`` most frequent characters in ``path``.

    Args:
        path: Text file to analyse.
        top_n: How many of the most frequent characters to report.
    """
    counter = count_characters(path)

    # sorted() is stable, so characters with equal counts keep the order in
    # which they first appeared in the file.
    counter_list = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    top = counter_list[:top_n]

    print(top)

    label = [char for char, _ in top]
    value = [count for _, count in top]

    plt.bar(range(len(value)), value, tick_label=label)
    plt.show()


if __name__ == '__main__':
    main()

The results are as follows:

[( ',', 288,508), ( '.', 261,584), ( 'the', 188,693), ( 'old', 92565), ( 'joy', 92505), ( 'no', 91234), ( 'Yes', 90562), (' a ', 86931), (' a ', 79059), (' the ', 77997), (' he '
, 71695), (' this', 63580), ( 'people ', 61210), (' '', 59719), ( '' ', 59115), (' there ', 56054), (' on ', 52862), (' a ', 49097), (' all ', 46850), ( 'you', 45400), ( 'come', 42659),
 ( 'I', 40057), ( 'the', 37676), ( 'they', 36966), ( 'the', 36351) , ( 'say', 35828), ( 'also', 35260), ( 'it', 32601), ( 'down', 31742), ( 'the', 30692), ( 'get', 29904), ( 'on', 2
9627), ( 'see', 28408), ( 'no', 28333), ( 'a', 27937), ( 'Road', 27732), ( 'large', 27012),( '?', 26729), ( 'that', 26589), ( 'should', 26076), ( 'child', 25035), ( 'self', 24012), ( '
point', 23942), ( ' good ', 21345), (' think ', 21242), (' there ', 20915), (' face ', 20661), (' she ', 20313), (' over ', 20304), (' words' , 20110)]

Guess you like

Origin www.cnblogs.com/ngxt/p/11789130.html