<aside> ✏️ 실습 : https://colab.research.google.com/drive/1FfhWsP9izQcuVl06P30r5cCxELA1ciVE?usp=sharing#scrollTo=8JXT1xXdOaMh
</aside>
# Characters to be surrounded by spaces (see the per-character loop in
# clean_punc). A few characters (e.g. "/", "-", "'" and the backslash) are
# listed more than once; a per-character loop visits each occurrence
# separately, so the string is intentionally left exactly as it was.
punct = "/-'?!.,#$%\\'()*+-/:;<=>@[\\\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

# Replacement table: typographic or special symbol -> plain-text substitute.
# Insertion order is preserved because callers may apply the replacements
# sequentially. The literal previously listed the '“' key twice with the same
# value; the redundant entry is removed (resulting dict is unchanged).
punct_mapping = {
    "‘": "'",
    "₹": "e",
    "´": "'",
    "°": "",
    "€": "e",
    "™": "tm",
    "√": " sqrt ",
    "×": "x",
    "²": "2",
    "—": "-",
    "–": "-",
    "’": "'",
    "_": "-",
    "`": "'",
    '“': '"',
    '”': '"',
    "£": "e",
    '∞': 'infinity',
    'θ': 'theta',
    '÷': '/',
    'α': 'alpha',
    '•': '.',
    'à': 'a',
    '−': '-',
    'β': 'beta',
    '∅': '',
    '³': '3',
    'π': 'pi',
}
def clean_punc(text, punct, mapping):
    """Normalize special symbols and isolate punctuation in *text*.

    The steps run in this order:

    1. Every key of ``mapping`` found in the text is replaced by its value,
       one key after another in the mapping's iteration order.
    2. Every character of ``punct`` is replaced by itself padded with a
       space on each side. Characters that occur several times in ``punct``
       are processed once per occurrence.
    3. A fixed set of leftovers is handled: zero-width space -> space,
       the ellipsis character -> `` ... ``, and the byte-order mark plus two
       Hindi words are deleted.
    4. Leading and trailing whitespace is stripped.

    Args:
        text: The string to clean.
        punct: Iterable of single characters to pad with spaces.
        mapping: Dict of ``{substring: replacement}`` applied first.

    Returns:
        The cleaned string.
    """
    for symbol, replacement in mapping.items():
        text = text.replace(symbol, replacement)

    for ch in punct:
        text = text.replace(ch, f' {ch} ')

    # Real escape sequences are required here: the keys must be the actual
    # zero-width space (U+200B) and BOM (U+FEFF) characters, not the literal
    # six-character texts "\\u200b" / "\\ufeff", which would never match
    # those invisible characters.
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for token, replacement in specials.items():
        text = text.replace(token, replacement)

    return text.strip()
import re
# Characters deleted outright by clean_text. "\xc3\xa1" are the two
# characters that appear when the UTF-8 bytes of "á" are decoded as Latin-1.
_STRIP_CHARS_RE = re.compile(r"[@%\\*=()/~#&+á?\xc3\xa1\-|:;!,_$'\"]")
# Any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RE = re.compile(r"\s+")
# Anything that looks like an HTML tag: "<", one or more non-">" chars, ">".
_HTML_TAG_RE = re.compile(r"<[^>]+>")


def clean_text(texts):
    """Return a cleaned, lower-cased copy of every item in *texts*.

    Each item is converted with ``str()`` and then processed in this order:

    1. Selected punctuation / symbol characters are deleted.
    2. The text is lower-cased.
    3. Every run of whitespace is collapsed to a single space.
    4. HTML-like tags (``<...>``) are deleted.
    5. Leading and trailing whitespace is removed.

    Args:
        texts: Iterable of items to clean; non-string items are stringified.

    Returns:
        A list of cleaned strings, one per input item, in input order.
    """
    corpus = []
    for raw in texts:
        # Remove punctuation and symbol characters.
        review = _STRIP_CHARS_RE.sub('', str(raw))
        # Lower-case English letters.
        review = review.lower()
        # Collapse whitespace runs into one space.
        review = _WHITESPACE_RE.sub(' ', review)
        # Remove HTML tags.
        review = _HTML_TAG_RE.sub('', review)
        # Only plain spaces can remain at the edges after the collapse above,
        # so strip() covers both the leading and the trailing trim.
        corpus.append(review.strip())
    return corpus