from csv import QUOTE_NONE

import nltk
import pandas as pd
import regex as re

# Encoding passed to pandas when reading input files.
ENCODING = "utf-8"

# Runs of brackets, symbols, tabs, newlines and ASCII digits; clean_text()
# replaces each run with a single space.
REP = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+")

# Text that clean_text() deletes outright: "'s", a hyphen followed by a
# literal backslash-n character pair (the 2nd and 3rd alternatives match the
# same text), and any Unicode punctuation character. \p{P} is why the
# third-party `regex` module is imported in place of the stdlib `re`.
REM = re.compile(r"'s|[\-]\\n|\-\\n|\p{P}")


def clean_text(text) -> str:
    """Normalise *text* into lower-case text without punctuation or digits.

    Steps, in order:

    1. Convert to ``str``, lower-case, and strip surrounding whitespace.
    2. Map the typographic apostrophe (U+2019) to the ASCII apostrophe.
    3. Expand the contractions "won't", "'ll" and "'m".
    4. Delete everything matched by ``REM``.
    5. Replace every run matched by ``REP`` with a single space.

    Args:
        text: Value to clean. It is passed through ``str()``, so non-string
            values are cleaned via their string representation.

    Returns:
        The cleaned string.
    """
    res = str(text).lower().strip()
    res = res.replace("’", "'")

    # Contractions must be expanded while the apostrophes still exist: REM
    # removes all punctuation, so doing this afterwards can never match.
    res = res.replace("won't", "will not")
    res = res.replace("'ll", " will")
    res = res.replace("'m", " am")

    # "'s" is deleted by REM itself, so it is deliberately not expanded to
    # " is" here; that keeps the existing output for possessives unchanged.
    res = REM.sub("", res)
    return REP.sub(" ", res)


def get_csv(fname) -> pd.DataFrame:
    """Read a tab-separated file without a header row into a DataFrame.

    Quote characters are treated as ordinary text (``QUOTE_NONE``), malformed
    lines are skipped, and the file is decoded using ``ENCODING``.

    Args:
        fname: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The parsed DataFrame; columns are numbered because ``header=None``.
    """
    return pd.read_csv(
        fname,
        sep="\t",
        on_bad_lines='skip',
        header=None,
        quoting=QUOTE_NONE,
        encoding=ENCODING,
    )


def check_prerequisites() -> None:
    """Ensure the NLTK ``punkt`` tokenizer data is available.

    Looks up ``tokenizers/punkt`` in the NLTK data path and downloads the
    ``punkt`` package when the lookup raises ``LookupError``.
    """
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')