"""Simple spell checker.

Candidate corrections are generated by edit distance and ranked by how
often each candidate occurs in a reference corpus (``big.txt``).
"""
import collections
import re


def words(text):
    """Return the lowercase alphabetic words found in ``text``.

    The text is lowercased, then every maximal run of the letters a-z is
    extracted; anything else (digits, punctuation, whitespace) separates
    words.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase words in order of appearance.
    """
    # The character class must be the range ``a-z``; ``[az]`` would match
    # only the two letters "a" and "z".
    return re.findall(r'[a-z]+', text.lower())


def train(features):
    """Count how often each word occurs, starting every count at 1.

    Args:
        features: Iterable of words.

    Returns:
        A ``collections.defaultdict`` mapping each word to its number of
        occurrences plus one. Indexing a word that was never seen yields
        the default of 1, i.e. unseen words get a small non-zero weight
        rather than zero.
    """
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


# Word -> frequency table built from the corpus file. The file is read once
# at import time and closed as soon as its contents have been consumed.
with open('big.txt', encoding='utf-8') as _corpus:
    NWORDS = train(words(_corpus.read()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    """Return every string reachable from ``word`` by one simple edit.

    Four kinds of edit are generated, each applied at every position:
    deleting a letter, swapping two adjacent letters, replacing a letter,
    and inserting a letter. Replacing a letter with itself is included,
    so for a non-empty ``word`` the result also contains ``word``.

    Args:
        word: The string to edit.

    Returns:
        A set of candidate strings.
    """
    # Every way of cutting the word into a head and a tail. The range runs
    # to len(word) + 1 so the final split has an empty tail, which is what
    # allows an insertion at the very end of the word.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # The typist added an extra letter: drop the first letter of the tail.
    deletes = [a + b[1:] for a, b in splits if b]
    # The typist swapped two neighbours: exchange the tail's first two letters.
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    # The typist hit a wrong letter: replace the first letter of the tail.
    replaces = [a + c + b[1:] for a, b in splits if b for c in alphabet]
    # The typist left a letter out: insert one between head and tail.
    inserts = [a + c + b for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)


def known_edits2(word):
    """Return the corpus words reachable from ``word`` by two edits.

    ``edits1`` is applied to every result of ``edits1(word)``. Only
    strings present in ``NWORDS`` are kept, which stops the (very large)
    set of second-order edits from being materialized in full.

    Args:
        word: The string to edit.

    Returns:
        A set of known words.
    """
    return {e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS}


def known(words):
    """Return the members of ``words`` that occur in ``NWORDS``.

    Args:
        words: Iterable of candidate strings.

    Returns:
        A list of the known candidates, in iteration order. The list is
        empty (falsy) when none is known, which lets callers chain
        candidate sources with ``or``.
    """
    return [w for w in words if w in NWORDS]


def correct(word):
    """Return the most likely correction of ``word``.

    A full error model (for example, vowel confusions being more likely
    than consonant ones, or first letters rarely being wrong) is not
    attempted. Instead candidates are taken from the first non-empty
    source in a fixed priority order: the word itself if it is known
    (distance 0), then known words one edit away, then known words two
    edits away. If none of these yields a known word, ``word`` is
    returned unchanged.

    Args:
        word: The word to check.

    Returns:
        The candidate with the highest count in ``NWORDS``.
    """
    # ``or`` short-circuits, so the more expensive sources are only
    # computed when every higher-priority source came back empty.
    candidates = (
        known([word])
        or known(edits1(word))
        or known_edits2(word)
        or [word]
    )
    return max(candidates, key=NWORDS.get)


if __name__ == '__main__':
    # Pass in the word to check.
    data = correct('work')
    print(data)
# Spell checker based on Bayesian formula
# Guess you like
# Origin http://43.154.161.224:23101/article/api/json?id=326279811&siteId=291194637
# Recommended
# Ranking