# Spell checker based on Bayes' formula

 
 
import re, collections

# Spell checker
 def words(text):
     # Only filter out letters, turn all letters into lowercase, and turn them into words
 return re.findall( '[az]+' , text.lower())
    


def train(features):
    """Count how often each item of *features* occurs.

    Counts start from 1 rather than 0, so an item seen k times ends up
    with the value k + 1, and an item that was never seen reads as 1
    when indexed. This keeps unseen words at a small non-zero weight.

    Args:
        features: An iterable of hashable items (here: words).

    Returns:
        A ``collections.defaultdict`` mapping each item to its count.
        Indexing a missing key with ``model[key]`` inserts it with the
        value 1; ``key in model`` and ``model.get(key)`` do not insert.
    """
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

# Load the corpus file and build the word-frequency model once, at import
# time. NWORDS maps each word to the count produced by train(). The file is
# opened in a `with` block so the handle is closed as soon as it is read.
with open('big.txt', encoding='utf-8') as corpus_file:
    NWORDS = train(words(corpus_file.read()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    """Return the set of all strings at edit distance 1 from *word*.

    The word is cut at every position into a prefix ``a`` and a suffix
    ``b``; each kind of single edit is then applied at the cut:

    * delete:    drop the first character of ``b``
    * transpose: swap the first two characters of ``b``
    * replace:   substitute the first character of ``b`` with each letter
    * insert:    put each letter between ``a`` and ``b``

    Args:
        word: The string to edit.

    Returns:
        A set of candidate strings. Because a replacement may use the
        same letter that is already there, *word* itself is part of the
        result whenever it is non-empty.
    """
    # range goes to len(word) + 1 so the final split is (word, ''), which
    # is what allows inserting a letter at the very end.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits if b for c in alphabet]
    inserts = [a + c + b for a, b in splits for c in alphabet]
    # The leftover debug print of `inserts` was removed: known_edits2 calls
    # this function once per first-level edit, so it flooded stdout.
    return set(deletes + transposes + replaces + inserts)


def known_edits2(word):
    """Return the known words at edit distance 2 from *word*.

    Distance-2 candidates are produced by applying edits1() to every
    result of edits1(word). Only candidates that are keys of NWORDS are
    kept, so the full set of distance-2 strings is never stored.

    Args:
        word: The string to edit.

    Returns:
        A set of strings found in NWORDS (possibly empty).
    """
    # The leftover debug `print(6)` was removed; it had no functional role.
    return {e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS}

# Strictly speaking, mistyping one vowel as another is more likely than
# confusing consonants, and getting the first letter of a word wrong is
# unlikely. Modelling all of that is too complicated, so a simplified rule is
# used here: a known word at edit distance 1 takes priority over one at edit
# distance 2, and edit distance 0 takes priority over edit distance 1.
# Priority: 0 > 1 > 2
# The kinds of single edit considered for a word split into (a, b):
# 1: one extra letter was written in b
# 2: one letter too few was written between a and b
# 3: the first letter of b was written wrong
# 4: two adjacent letters of b were written in swapped order
 def known(words):

    list1 = []
    for w in words:
        if w in NWORDS:
            list1.append(w)
    # data = set(w for w in words if w in NWORDS)
return list1    

# If known(set) is not empty, candidates will select this set and will not continue to calculate
 def correct(word):
     # If the previous priority is satisfied, return the word with higher priority
     candidates = known([word]) or known (edits1(word)) or known_edits2(word) or [word]
     # Take out the value with the largest number of times
 return max (candidates, key =NWORDS.get)    

# pass in the word to check
 data = correct( 'work' )
 print (data)
 
 



# Guess you like
#
# Origin http://43.154.161.224:23101/article/api/json?id=326279811&siteId=291194637