Exercise 1: Counting word frequencies in a file, and optimizing the code.
Required material: an ordinary English text document.
Code for the straightforward implementation:
def makekey(s: str) -> list:
    """Split *s* into lower-case words (first scheme, inefficient).

    Every separator character is replaced by a space, the characters are
    joined back into one string, and that string is split on whitespace.
    This builds a full per-character list and an intermediate string, which
    is why this scheme is the slow one.

    :param s: text to split.
    :return: list of lower-case words.
    """
    chars = set(r"""!'"#./\()[],*-""")
    ret = []
    for c in s.lower():
        if c in chars:
            ret.append(' ')
        else:
            ret.append(c)
    return ''.join(ret).split()


# The first scheme above (makekey) is inefficient.
#
# The second scheme below (makekey1) slices words straight out of the
# string; it can still be optimized further.
def makekey1(s: str) -> list:
    """Split *s* into lower-case words by slicing between separators.

    :param s: text to split; whitespace is NOT treated as a separator here,
        so callers are expected to pass whitespace-free tokens.
    :return: list of the non-empty lower-case pieces between separators.
    """
    chars = set(r"""!'"#./\()[],*-""")
    key = s.lower()
    ret = []
    start = 0  # index where the current word begins
    for i, c in enumerate(key):
        if c in chars:
            if start == i:
                # The separator directly follows another separator (or opens
                # the string), so there is no word to emit: step past it.
                start += 1
                continue
            ret.append(key[start:i])
            start = i + 1  # +1 skips the separator character itself
    if start < len(key):
        # Characters remain after the last separator: they form the final
        # word, which runs to the end of the string.
        ret.append(key[start:])
    return ret


# ------------------------------------------ #

if __name__ == '__main__':
    # Count every word of the sample document.
    d = {}
    with open('sample.txt', encoding='utf-8') as f:
        for line in f:
            words = line.split()
            for wordlist in map(makekey1, words):
                for word in wordlist:
                    d[word] = d.get(word, 0) + 1

    # Print the ten most frequent words, ranked from 1.
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    for i, (k, v) in enumerate(ranked[:10], 1):
        print(i, k, v)
Results:
1 path 138 2 the 136 3 is 60 4 a 59 5 out of 49 6 if 43 7 and 40 8 to 34 9 on 33 10 of 33 Process finished with exit code 0
The optimized version of the code follows:
# ======== Optimized version of makekey1 from the code above ========
# The first scheme (makekey) is dropped; the second scheme (makekey1) is
# turned into a generator.
#
# The separator characters can be supplied in two ways: as a module-level
# constant (used here; handy when the set is needed in several places) or
# as a literal written directly in the default argument.  Either way the
# set is built once, not on every call.  Unlike makekey1, whitespace is
# part of the set, so whole lines can be passed in.
CHARS = frozenset("!'\"#./\\()[],*- \r\n\t")


def _makekey2(key: str, chars=CHARS):
    """Yield the words of *key*, splitting on every character in *chars*.

    Words are yielded exactly as they appear in *key* (no case folding) and
    empty pieces between adjacent separators are skipped.

    :param key: text to split, e.g. one line of a file.
    :param chars: container of separator characters (membership-tested).
    :return: generator of non-empty substrings of *key*.
    """
    start = 0  # index where the current word begins
    for i, c in enumerate(key):
        if c in chars:
            if start == i:
                # The separator directly follows another separator (or opens
                # the string), so there is no word to emit: step past it.
                start += 1
                continue
            yield key[start:i]
            start = i + 1  # +1 skips the separator character itself
    if start < len(key):
        # Characters remain after the last separator: they form the final
        # word, which runs to the end of the string.
        yield key[start:]


def wordcount(filename: str, encoding='utf-8', ignorewords=frozenset(), *,
              ignore_case: bool = True) -> dict:
    """Count how often each word occurs in a text file.

    :param filename: path of the file to read.
    :param encoding: text encoding used to open the file.
    :param ignorewords: container of words that must not be counted.  Words
        are compared after case folding when *ignore_case* is true.
    :param ignore_case: when true (the default) words are lower-cased before
        counting; pass ``ignore_case=False`` for case-sensitive counting.
    :return: dict mapping each counted word to its number of occurrences.
    :raises OSError: if the file cannot be opened.
    """
    d = {}
    with open(filename, encoding=encoding) as f:
        for line in f:
            words = _makekey2(line)
            if ignore_case:
                words = map(str.lower, words)
            for word in words:
                if word not in ignorewords:
                    d[word] = d.get(word, 0) + 1
    return d


def top(d: dict, n: int = 10):
    """Yield the *n* ``(word, count)`` pairs with the highest counts.

    Pairs come out in descending order of count; words with equal counts
    keep the order in which they appear in *d*.  Nothing is printed here,
    the caller decides what to do with the pairs.

    :param d: mapping of word to count, as returned by ``wordcount``.
    :param n: maximum number of pairs to yield; ``n <= 0`` yields nothing.
    """
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    yield from ranked[:max(n, 0)]


if __name__ == '__main__':
    for k, v in top(wordcount('sample.txt', ignorewords={'the', 'is'})):
        print(k, v)
Results:
path 138 a 59 the 49 if 43 and 40 to 34 on 33 of 33 return 30 windows 25 Process finished with exit code 0