A Spell checker (or spell corrector) is an application, program or a feature of a program that checks for misspellings in a text and offers possible solutions (candidate words). [1]
A basic spell checker carries out the following processes:
# Load Python libraries
import io
import re
from collections import Counter
# Util function to read a plain text file
def read_text_file(file_path):
text = ""
with io.open(file_path, 'r', encoding='ISO-8859-1') as f:
text = f.read()
return text
# Return an array with the clean words of the document
def get_doc_words(plain_text):
doc_words = []
# Cleaing the text
clean_text = re.sub('[^a-zA-Z]', ' ', plain_text.lower())
# Tokenize sentences in words
doc_words = clean_text.split()
return doc_words
# Get text sample
file_path = "../data/en/Moby Dick or the Whale - Herman Melville.txt"
plain_text = read_text_file(file_path)
len(plain_text)
1235134
# Create document word list
doc_words = get_doc_words(plain_text)
len(doc_words)
219153
# Show 100 first words of the vocabulary
print(doc_words[0:100])
['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale', 'by', 'herman', 'melville', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 're', 'use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'www', 'gutenberg', 'org', 'title', 'moby', 'dick', 'or', 'the', 'whale', 'author', 'herman', 'melville', 'release', 'date', 'december', 'ebook', 'last', 'updated', 'december', 'language', 'english', 'character', 'set', 'encoding', 'utf', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'moby', 'dick', 'or', 'the', 'whale', 'produced', 'by', 'daniel', 'lazarus', 'jonesey', 'and', 'david']
# Create term frequency list of the document words
term_freq = Counter(doc_words)
term_freq['the']
14542
Based on Peter Norvig’s 21-line spelling corrector using probability theory. [2]
# Define Spell Checker class
class SpellChecker:
def __init__(self, term_freq):
"Constructor."
self.w_rank = {}
self.letters = 'abcdefghijklmnopqrstuvwxyz'
N = sum(term_freq.values())
for term in term_freq:
self.w_rank[term] = term_freq[term] / N
def P(self, word):
"Probability of 'word'."
return self.w_rank.get(word, 0)
def known(self, words):
"The subset of 'words' that appear in the dictionary of w_rank."
return set(w for w in words if w in self.w_rank)
def edits1(self, word):
"All edits that are one edit away from 'word'."
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
replaces = [L + c + R[1:] for L, R in splits if R for c in self.letters]
inserts = [L + c + R for L, R in splits for c in self.letters]
return set(deletes + transposes + replaces + inserts)
def edits2(self, word):
"All edits that are two edits away from 'word'."
return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))
def correction(self, word):
"Most probable spelling correction for word."
return max(self.candidates(word), key = self.P)
def candidates(self, word):
"Generate possible spelling corrections for word."
return (self.known([word]) or self.known(self.edits1(word)) or self.known(self.edits2(word)) or [word])
Note: If you want to see Spell Checker algorithm in the VB.net language, click here.
# Create model
sp_model = SpellChecker(term_freq)
# Get probability of the word 'the'
sp_model.P('the')
0.06635546855393264
# Get probability of the word 'unmentioned'
sp_model.P('unmentioned')
0
# Get words that exist in the dictionary
sp_model.known(['the', 'unmentioned'])
{'the'}
# Get the correction for word 'castli'
sp_model.correction('castli')
'castle'
# Get candidates for word 'wlak'
sp_model.candidates('wlak')
{'walk', 'weak'}
Pure Python Spell Checking based on Peter Norvig’s blog post on setting up a simple spell checking algorithm. [3]
It uses a Levenshtein Distance algorithm to find permutations within an edit distance of 2 from the original word. It then compares all permutations (insertions, deletions, replacements, and transpositions) to known words in a word frequency list. Those words that are found more often in the frequency list are more likely the correct results. [4]
# Load Python libraries
from spellchecker import SpellChecker
# Create English model
spell = SpellChecker(language='en')
# Default Levenshtein distance
spell.distance
2
# Find those words that may be misspelled
misspelled = spell.unknown(['She', 'livis', 'in', 'a', 'beaotipul', 'castli'])
misspelled
{'beaotipul', 'castli', 'livis'}
# Get the one 'most likely' answer and options
for word in misspelled:
solution = {'word': word, 'answer': spell.correction(word), 'options': spell.candidates(word)}
print(solution)
{'word': 'beaotipul', 'answer': 'beautiful', 'options': {'beautiful', 'beautitul', 'beatiful'}} {'word': 'castli', 'answer': 'castle', 'options': {'castle'}} {'word': 'livis', 'answer': 'lives', 'options': {'livia', "liv's", 'livin', 'livid', 'lives', 'livir'}}
# Create Spanish model
spell = SpellChecker(language='es', distance=1)
# Find those words that may be misspelled
misspelled = spell.unknown(['Esta', 'es', 'una', 'frace', 'con', 'errodes'])
misspelled
{'errodes', 'frace'}
# Get the one 'most likely' answer and options
for word in misspelled:
solution = {'word': word, 'answer': spell.correction(word), 'options': spell.candidates(word)}
print(solution)
{'word': 'frace', 'answer': 'frase', 'options': {'grace', 'brace', 'race', 'frack', 'frase', 'frame', 'fracs', 'frac', 'face', 'trace'}} {'word': 'errodes', 'answer': 'errores', 'options': {'errores'}}