# Source code for spellchecker

import re
from collections import Counter
import pandas as pd
import os.path

# Locate the KBBI base-word ("kata dasar") list shipped next to this module.
my_path = os.path.abspath(os.path.dirname(__file__))
path = os.path.join(my_path, "./data/kata_dasar_kbbi.csv")

# WORDS maps each value found in the first column of the header-less CSV to
# the number of times it occurs there (presumably one base word per row --
# confirm against the data file).
WORDS = Counter(pd.read_csv(path, header=None)[0].values)

def P(word, N=sum(WORDS.values())):
    """Return the relative frequency of `word` in WORDS.

    :param word: word to look up
    :type word: string
    :param N: total word count used as the denominator, defaults to
        sum(WORDS.values()); the default is computed once, when this
        function is defined
    :type N: integer
    :return: count of `word` in WORDS divided by `N`
    :rtype: float
    """
    # A Counter yields 0 for words it has never seen, so unknown words
    # simply get probability 0.
    occurrences = WORDS[word]
    return occurrences / N
def correction(word):
    """Return the most probable spelling correction for `word`.

    `flow:`
    `word --> edits1(word) --> edits2(word) --> known(words) -->
    candidates(word) --> correction(word) with P as key`

    :param word: word to correct
    :type word: string
    :return: the candidate with the highest probability according to P
    :rtype: string
    """
    options = candidates(word)
    return max(options, key=P)
def candidates(word):
    """Generate the possible spelling corrections for `word`.

    The first non-empty group wins, in this order: the word itself if it is
    known, known words one edit away, known words two edits away, and
    finally the unchanged word.

    :param word: word to find candidates for
    :type word: string
    :return: set of known candidate words, or the one-element list
        ``[word]`` when no known candidate exists
    :rtype: set or list
    """
    exact = known([word])
    if exact:
        return exact

    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away

    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away

    # Nothing in the dictionary is close enough: hand the input back.
    return [word]
def known(words):
    """Return the subset of `words` that appear in the WORDS dictionary.

    :param words: iterable of words to filter
    :type words: list
    :return: the words that are present in WORDS
    :rtype: set
    """
    return {candidate for candidate in words if candidate in WORDS}
def edits1(word, letters='abcdefghijklmnopqrstuvwxyz'):
    """Return all strings that are one edit away from `word`.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a single-character replacement, or a single-character
    insertion.

    :param word: word to edit
    :type word: string
    :param letters: characters used for replacements and insertions,
        defaults to the lowercase ASCII alphabet; pass a different string
        to allow extra characters (for example ``'-'``)
    :type letters: string
    :return: every string that is one edit away from `word`
    :rtype: set
    """
    # Every way to cut the word in two:
    # [('', 'kemarin'), ('k', 'emarin'), ('ke', 'marin'), ...]
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

    # Drop the first character of the right part:
    # ['emarin', 'kmarin', 'kearin', ...]
    deletes = {L + R[1:] for L, R in splits if R}

    # Swap the first two characters of the right part:
    # ['ekmarin', 'kmearin', 'keamrin', ...]
    transposes = {L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1}

    # Overwrite the first character of the right part:
    # ['aemarin', 'bemarin', 'cemarin', ...]
    replaces = {L + c + R[1:] for L, R in splits if R for c in letters}

    # Insert a character at the cut:
    # ['akemarin', 'bkemarin', 'ckemarin', ...]
    inserts = {L + c + R for L, R in splits for c in letters}

    return deletes | transposes | replaces | inserts
def edits2(word):
    """Yield all strings that are two edits away from `word`.

    Every result of edits1(word) is edited once more with edits1. The
    results are produced lazily and may contain duplicates.

    :param word: word to edit
    :type word: string
    :return: every string reachable from `word` by two successive edits
    :rtype: generator
    """
    first_round = edits1(word)
    return (
        twice_edited
        for once_edited in first_round
        for twice_edited in edits1(once_edited)
    )