Tutorial Singkat
Referensi dapat dilihat di sini
Di sini saya mencoba menjabarkan beberapa materi, yaitu spellchecker, cleansing elongasi kata, dan penggunaan N-gramisasi untuk menganalisis teks.
Penjelasan singkat dari notebook pada repositori
"""This example demonstrates the use of the preprocessUII module and several of its companion modules,
`here <https://github.com/Yuriowindiatmoko2401/tugas-text-analytics-2/blob/master/19917020_Yurio_txt2.ipynb>`_
"""
import pandas as pd
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from preprocessUII.spellchecker import correction
from preprocessUII.preProc_misc import stop_words, removeStopword, koreksi_elongasi
# Add a few extra stop words used when cleansing the 1000-article
# Indonesian Wikipedia data.
for extra_stop_word in ('of', 'the', 'jpg', 'jmpl', 'px', 'cd'):
    stop_words.add(extra_stop_word)
def get_top_n_words(corpus, ngr=1, n=None):
    """Return the most frequent n-grams of a corpus.

    :param corpus: texts to analyse, one document (for example a sentence or
        an article) per element
    :type corpus: list
    :param ngr: size of the n-grams to count (1 = unigram, 2 = bigram, ...),
        defaults to 1
    :type ngr: int, optional
    :param n: how many of the most frequent n-grams to return (for example
        10 or 20); ``None`` returns all of them, defaults to None
    :type n: int, optional
    :return: ``(ngram, frequency)`` pairs sorted by descending frequency
    :rtype: list of tuple
    """
    vec = CountVectorizer(ngram_range=(ngr, ngr))
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in a single pass, instead of tokenizing the whole corpus twice
    # (once in fit, once more in transform).
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: total occurrences of every n-gram across all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    # Stable sort, most frequent first.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    # Slicing with n=None keeps the full list.
    return words_freq[:n]
# Task 2 -- spelling correction to handle typos
#           (e.g. "yagn" is expected to be corrected to "yang").
# Task 3 -- word normalization to handle lengthened words
#           (e.g. "horeeee" is expected to be normalized to "hore").

# First attempt: run both sample words through the `correction` function.
for sample in ('yagn', 'horeeee'):
    print('koreksi : ', correction(sample))

# Second attempt: use the `koreksi_elongasi` function on the lengthened word.
print('koreksi : ', koreksi_elongasi('horeeee'))
# Exercise: write Python code that handles the following problem.
# Task 1 -- generate n-grams from a corpus.
sentence = "akankah diri ini terus bersamamu disaat orang lain sudah menjadi tuanmu"
n = 6
tokens = sentence.split()
sixgrams = ngrams(tokens, n)

# Simple n-gram example: print each n-gram produced from the tokens.
for grams in sixgrams:
    print(grams)
# Indonesian Wikipedia corpus (1000 articles); the file is split on newlines
# and every resulting piece is treated as one article.
# The context manager closes the file handle as soon as the text is read,
# instead of leaving it open for the rest of the script.
# NOTE(review): the file is assumed to be UTF-8 encoded -- confirm for this dataset.
with open('idwiki_1k.txt', encoding='utf-8') as corpus_file:
    corpus_wiki = corpus_file.read().split("\n")

# The same corpus after removing stop words from every article.
corpus_wiki_sw = [removeStopword(article) for article in corpus_wiki]
############### top-20 n-gram reports (unigram .. 4-gram) #################################
# For every n-gram size the same report is produced twice: first on the raw
# corpus (stop words still present), then on the corpus with stop words
# removed. One loop drives all eight reports so each goes through exactly the
# same steps, in the order unigram, bigram, trigram, 4-gram.

# Figure size shared by every chart below.
matplotlib.rcParams['figure.figsize'] = (14, 10)

for ngr in (1, 2, 3, 4):
    for corpus in (corpus_wiki, corpus_wiki_sw):
        # get_top_n_words gives the dominant n-gram phrases of the corpus.
        common_words = get_top_n_words(corpus, ngr=ngr, n=20)

        # Print the top 20 n-grams with their frequencies.
        for word, freq in common_words:
            print(word, freq)

        df2 = pd.DataFrame(common_words, columns=['ArticleText', 'count'])

        # Top-20 bar chart drawn with pandas / matplotlib.
        ax = df2.plot.bar(x='ArticleText', y='count', rot='vertical')
        plt.show()

        # The same top-20 chart drawn with seaborn, with slanted tick labels.
        ax = sns.barplot(x='ArticleText', y='count', data=df2)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
        plt.show()
Referensi penulis
- https://towardsdatascience.com/a-complete-exploratory-data-analysis-and-visualization-for-text-data-29fb1b96fb6a
- https://medium.com/@arie.pratama.s/bahasa-indonesia-open-sourced-nlp-resources-8cb394193238
- https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv