import pandas as pd import regex as re from csv import QUOTE_NONE ENCODING = "utf-8" REP = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+") REM = re.compile(r"'s|[\-]\\n|\-\\n|\p{P}") def read_csv(fname): return pd.read_csv( fname, sep="\t", on_bad_lines='skip', header=None, quoting=QUOTE_NONE, encoding=ENCODING ) def clean_text(text): res = str(text).lower().strip() res = res.replace("’", "'") res = REM.sub("", res) res = REP.sub(" ", res) res = res.replace("'s", " is") res = res.replace("'ll", " will") res = res.replace("won't", "will not") res = res.replace("isn't", "is not") res = res.replace("aren't", "are not") res = res.replace("'ve'", "have") return res.replace("'m", " am")