import re, unicodedata
[docs]def rmNon_Ascii(text):
"""membuang character ascii atau emoticon
:return: clean string from ascii character
:rtype: string
"""
return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
[docs]def rmURLs(text):
"""membuang url
:return: clean string from url
:rtype: string
"""
return re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', text)
[docs]def rmPunc(text):
"""membuang tanda baca
:return: clean string from punctuation
:rtype: string
"""
return re.sub(r'[^\w]|_',' ',text)
[docs]def rmDigit_string(text):
"""membuang digit didalam string
:return: clean string from digit
:rtype: string
"""
return re.sub("\S*\d\S*", "", text).strip()
[docs]def rmDigitnumbers(text):
"""membuang digit dan angka
:return: clean string from digit n numbers
:rtype: string
"""
return re.sub(r"\b\d+\b", " ", text)
[docs]def rmHashtag(text):
"""membuang hashtag
:return: clean string from hashtag
:rtype: string
"""
return re.sub(r"#(\w+)", ' ', text, flags=re.MULTILINE)
[docs]def rmMention(text):
"""membuang mention
:return: clean string from mention @
:rtype: string
"""
return re.sub(r"@(\w+)", ' ', text, flags=re.MULTILINE)
[docs]def rmXML(text):
"""membuang xml character
:return: clean string from xml character
:rtype: string
"""
return re.sub("&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([0-9a-zA-Z]+));"," ",text)
[docs]def rmRT(text):
"""membuang retweet atau RT
:return: clean string from RT
:rtype: string
"""
return re.sub("\s*RT\s*@[^:]*"," ",text)
[docs]def lowercase(text):
"""casefolding menjadi huruf kecil
:return: lower case string
:rtype: string
"""
return text.lower()
[docs]def rmAdditionalWs(text):
"""membuang spasi2 tambahan
:return: clean string whitespace unnecessary
:rtype: string
"""
return re.sub('[\s]+', ' ', text)