# Load Python libraries
import io
import re
from collections import Counter


# Util function to read a plain text file
def read_text_file(file_path):
    text = ""
    with io.open(file_path, 'r', encoding='ISO-8859-1') as f:
        text = f.read()
    return text


# Return an array with the clean words of the document
def get_doc_words(plain_text):
    doc_words = []
    
    # Cleaing the text
    clean_text = re.sub('[^a-zA-Z]', ' ', plain_text.lower())
    
    # Tokenize sentences in words
    doc_words = clean_text.split()
    
    return doc_words


# Get text sample
file_path = "../data/en/Moby Dick or the Whale - Herman Melville.txt"
plain_text = read_text_file(file_path)
len(plain_text)

1235134


# Create document word list
doc_words = get_doc_words(plain_text)
len(doc_words)

219153


# Show 100 first words of the vocabulary
print(doc_words[0:100])

['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale', 'by', 'herman', 'melville', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 're', 'use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'www', 'gutenberg', 'org', 'title', 'moby', 'dick', 'or', 'the', 'whale', 'author', 'herman', 'melville', 'release', 'date', 'december', 'ebook', 'last', 'updated', 'december', 'language', 'english', 'character', 'set', 'encoding', 'utf', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'moby', 'dick', 'or', 'the', 'whale', 'produced', 'by', 'daniel', 'lazarus', 'jonesey', 'and', 'david']


# Create term frequency list of the document words
term_freq = Counter(doc_words)
term_freq['the']

14542


# Define Spell Checker class
class SpellChecker:
    
    def __init__(self, term_freq):
        "Constructor."
        self.w_rank = {}
        self.letters = 'abcdefghijklmnopqrstuvwxyz'
        
        N = sum(term_freq.values())
        for term in term_freq:
            self.w_rank[term] = term_freq[term] / N
    
    def P(self, word): 
        "Probability of 'word'."
        return self.w_rank.get(word, 0)

    def known(self, words): 
        "The subset of 'words' that appear in the dictionary of w_rank."
        return set(w for w in words if w in self.w_rank)

    def edits1(self, word):
        "All edits that are one edit away from 'word'."
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        deletes    = [L + R[1:]               for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces   = [L + c + R[1:]           for L, R in splits if R for c in self.letters]
        inserts    = [L + c + R               for L, R in splits for c in self.letters]
        
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word): 
        "All edits that are two edits away from 'word'."
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))
    
    def correction(self, word):
        "Most probable spelling correction for word."
        return max(self.candidates(word), key = self.P)
    
    def candidates(self, word): 
        "Generate possible spelling corrections for word."
        return (self.known([word]) or self.known(self.edits1(word)) or self.known(self.edits2(word)) or [word])


# Create model
sp_model = SpellChecker(term_freq)


# Get probability of the word 'the'
sp_model.P('the')

0.06635546855393264


# Get probability of the word 'unmentioned'
sp_model.P('unmentioned')

0


# Get words that exist in the dictionary
sp_model.known(['the', 'unmentioned'])

{'the'}


# Get the correction for word 'castli'
sp_model.correction('castli')

'castle'


# Get candidates for word 'wlak'
sp_model.candidates('wlak')

{'walk', 'weak'}


# Load Python libraries
from spellchecker import SpellChecker


# Create English model
spell = SpellChecker(language='en')


# Default Levenshtein distance
spell.distance

2


# Find those words that may be misspelled
misspelled = spell.unknown(['She', 'livis', 'in', 'a', 'beaotipul', 'castli'])
misspelled

{'beaotipul', 'castli', 'livis'}


# Get the one 'most likely' answer and options
for word in misspelled:
    solution = {'word': word, 'answer': spell.correction(word), 'options': spell.candidates(word)}
    print(solution)

{'word': 'beaotipul', 'answer': 'beautiful', 'options': {'beautiful', 'beautitul', 'beatiful'}}
{'word': 'castli', 'answer': 'castle', 'options': {'castle'}}
{'word': 'livis', 'answer': 'lives', 'options': {'livia', "liv's", 'livin', 'livid', 'lives', 'livir'}}


# Create Spanish model
spell = SpellChecker(language='es', distance=1)


# Find those words that may be misspelled
misspelled = spell.unknown(['Esta', 'es', 'una', 'frace', 'con', 'errodes'])
misspelled

{'errodes', 'frace'}


# Get the one 'most likely' answer and options
for word in misspelled:
    solution = {'word': word, 'answer': spell.correction(word), 'options': spell.candidates(word)}
    print(solution)

{'word': 'frace', 'answer': 'frase', 'options': {'grace', 'brace', 'race', 'frack', 'frase', 'frame', 'fracs', 'frac', 'face', 'trace'}}
{'word': 'errodes', 'answer': 'errores', 'options': {'errores'}}

NLP - Spell Checker¶

1. Spell Checker from Scratch¶

Create dictionary/vocabulary from a book¶

Create Spell Checker model¶

Test Spell Checker model¶

2. Spell Checker using PySpellChecker¶

English model¶

Spanish model¶

Reference¶