Source code for preProc_misc

import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize
import os.path

# initiate dictionary for normalize text
my_path = os.path.abspath(os.path.dirname(__file__))
path = os.path.join(my_path, "./data/colloquial-indonesian-lexicon.csv")

# initiate stopwords from NLTK
stop_words = set(stopwords.words('indonesian'))

# initiate dataframe for mapping n normalize text
df_crc = pd.read_csv(path)[['slang','formal','category1']]
df_crc = df_crc[df_crc['category1']=='elongasi']

[docs]def koreksi_elongasi(word, df_crc=df_crc): """koreksi elongasi sebagai tahapan normalisasi text :param word: kata yang akan di normalize :type word: string :param df_crc: dataframe correction for normalize, defaults to df_crc :type df_crc: dataframe pandas, optional :return: normalized text after mapping :rtype: string """ if list(df_crc['formal'][df_crc['slang']=='{}'.format(word)].values) == []: return word return df_crc['formal'][df_crc['slang']=='{}'.format(word)].values[0]
[docs]def removeStopword(text, stop_words=stop_words): """membuang kata2 yang terdapat pada stopwords id :param text: list kata yang akan dibuang dari daftar stopwords yang ada :type text: list of string :param stop_words: set of stopwords from NLTK indonesian, defaults to stop_words initiate in beginning :type stop_words: dataframe pandas, optional :return: list of string after removing stop words :rtype: list """ word_tokens = word_tokenize(text) filtered_sentence = [w for w in word_tokens if not w in stop_words] return ' '.join(filtered_sentence)