Source code for spellchecker

import re
from collections import Counter
import pandas as pd
import os.path

# Locate the KBBI base-word list (kata dasar) relative to this module.
my_path = os.path.abspath(os.path.dirname(__file__))
path = os.path.join(my_path, "./data/kata_dasar_kbbi.csv")

# WORDS maps each base word (first CSV column) to its number of occurrences.
# dtype=str together with keep_default_na=False stops pandas from parsing
# entries such as "nan", "null" or "NA" as float NaN, which would otherwise
# end up as a non-string key that no candidate word could ever match.
# Empty cells are skipped so that "" never becomes a dictionary word.
WORDS = Counter(
    word
    for word in pd.read_csv(path, header=None, dtype=str, keep_default_na=False)[0]
    if word
)

def P(word, N=sum(WORDS.values())):
    """Return the relative frequency of `word` in WORDS.

    WORDS is a Counter, so a word that is not in the dictionary has a count
    of 0 and therefore a probability of 0.

    :param word: word to look up
    :type word: string
    :param N: total number of counted words, defaults to sum(WORDS.values());
        the default is computed once, when the function is defined
    :type N: integer
    :return: count of `word` divided by `N`
    :rtype: float
    """
    return WORDS[word] / N

def correction(word):
    """Return the most probable spelling correction for `word`.

    The candidates produced by `candidates(word)` are ranked with `P` and the
    highest-ranked one is returned.

    :param word: word to correct
    :type word: string
    :return: the candidate with the highest probability
    :rtype: string
    """
    return max(candidates(word), key=P)

def candidates(word):
    """Generate possible spelling corrections for `word`.

    The first non-empty result in this order is returned:

    1. `word` itself, when it is a known word;
    2. the known words one edit away from `word`;
    3. the known words two edits away from `word`;
    4. `[word]` unchanged, when nothing known was found.

    :param word: word to correct
    :type word: string
    :return: set of known candidate words, or the one-element list `[word]`
        when no known candidate exists
    :rtype: set or list
    """
    # `or` short-circuits, so the costlier edit sets are only generated when
    # the cheaper lookups found nothing.
    return (
        known([word])
        or known(edits1(word))
        or known(edits2(word))
        or [word]
    )

def known(words):
    """Return the subset of `words` that appear in the WORDS dictionary.

    :param words: iterable of words to check
    :type words: iterable of string
    :return: the words from `words` that are keys of WORDS
    :rtype: set
    """
    return {w for w in words if w in WORDS}

def edits1(word):
    """Return every string that is one edit away from `word`.

    An edit is a single-character deletion, a transposition of two adjacent
    characters, a replacement of one character by a lowercase ASCII letter,
    or an insertion of a lowercase ASCII letter at any position.

    :param word: word to edit
    :type word: string
    :return: all strings reachable from `word` with exactly one edit
        operation (this can include `word` itself, e.g. when a character is
        replaced by the same letter)
    :rtype: set
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    # Every way to cut the word in two:
    # [('', 'kemarin'), ('k', 'emarin'), ('ke', 'marin'), ...]
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # Drop the first character of the right part:
    # ['emarin', 'kmarin', 'kearin', ...]
    deletes = [L + R[1:] for L, R in splits if R]
    # Swap the first two characters of the right part:
    # ['ekmarin', 'kmearin', 'keamrin', ...]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    # Replace the first character of the right part with each letter:
    # ['aemarin', 'bemarin', 'cemarin', ...]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    # Insert each letter at the cut point:
    # ['akemarin', 'bkemarin', 'ckemarin', ...]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    """Yield every string that is two edits away from `word`.

    `edits1` is applied to each result of `edits1(word)`. The results are
    produced lazily and are not de-duplicated, so the same string may be
    yielded more than once.

    :param word: word to edit
    :type word: string
    :return: generator over all strings reachable with two edit operations
    :rtype: generator of string
    """
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))