A simple script that counts which characters appear most frequently in a novel:
import codecs
from collections import Counter


def count_characters(lines):
    """Count how often each character occurs in an iterable of text lines.

    Leading and trailing whitespace is stripped from every line before
    counting, so lines that are empty (or whitespace-only) contribute
    nothing. Whitespace inside a line is counted like any other character.

    Args:
        lines: Iterable of strings, e.g. an open text file.

    Returns:
        A ``collections.Counter`` mapping each character to its number of
        occurrences. Keys are kept in first-seen order, so ``list(result)``
        yields the distinct characters in the order they first appeared.
    """
    counts = Counter()
    for line in lines:
        # Counter.update() adds 1 per character, so the first occurrence of
        # a character is counted as 1 rather than 0.
        counts.update(line.strip())
    return counts


def plot_frequencies(pairs):
    """Show a bar chart for a sequence of ``(character, count)`` pairs.

    Args:
        pairs: Sequence of ``(character, count)`` tuples, drawn left to
            right in the given order.
    """
    # Imported here rather than at module level so that count_characters()
    # can be imported and used without matplotlib being installed.
    import matplotlib.pyplot as plt
    from pylab import mpl

    # Default sans-serif font used for the tick labels.
    mpl.rcParams["font.sans-serif"] = ["FangSong"]
    # Draw the minus sign as an ASCII hyphen; otherwise it can be rendered
    # as an empty box when the selected font lacks the Unicode minus glyph.
    mpl.rcParams["axes.unicode_minus"] = False

    labels = [char for char, _ in pairs]
    values = [count for _, count in pairs]
    plt.bar(range(len(values)), values, tick_label=labels)
    plt.show()


def main(path="data.txt", encoding=None, top_n=50):
    """Print and plot the most frequent characters of a text file.

    Args:
        path: File to analyse.
        encoding: Text encoding passed to ``codecs.open``. ``None`` keeps
            the platform default, matching a plain ``codecs.open(path)``.
            NOTE(review): pass the file's real encoding (e.g. "utf-8")
            explicitly if the platform default does not match it.
        top_n: Number of most frequent characters to report.
    """
    with codecs.open(path, encoding=encoding) as fr:
        counts = count_characters(fr)

    # most_common() sorts by descending count; characters with equal counts
    # keep the order in which they were first encountered.
    top = counts.most_common(top_n)
    print(top)
    plot_frequencies(top)


if __name__ == "__main__":
    main()
The results are as follows:
[( ',', 288508), ( '.', 261584), ( 'the', 188693), ( 'old', 92565), ( 'joy', 92505), ( 'no', 91234), ( 'Yes', 90562), (' a ', 86931), (' a ', 79059), (' the ', 77997), (' he '
, 71695), (' this', 63580), ( 'people ', 61210), (' '', 59719), ( '' ', 59115), (' there ', 56054), (' on ', 52862), (' a ', 49097), (' all ', 46850), ( 'you', 45400), ( 'come', 42659),
( 'I', 40057), ( 'the', 37676), ( 'they', 36966), ( 'the', 36351) , ( 'say', 35828), ( 'also', 35260), ( 'it', 32601), ( 'down', 31742), ( 'the', 30692), ( 'get', 29904), ( 'on', 2
9627), ( 'see', 28408), ( 'no', 28333), ( 'a', 27937), ( 'Road', 27732), ( 'large', 27012),( '?', 26729), ( 'that', 26589), ( 'should', 26076), ( 'child', 25035), ( 'self', 24012), ( '
point', 23942), ( ' good ', 21345), (' think ', 21242), (' there ', 20915), (' face ', 20661), (' she ', 20313), (' over ', 20304), (' words' , 20110)]