"""Helpers for loading tab-separated text data and normalizing it for NLP work."""

import nltk
import pandas as pd
import regex as re
from csv import QUOTE_NONE

ENCODING = "utf-8"


def clean_text(text):
    """Lowercase *text*, collapse escaped line breaks, and strip punctuation.

    The value is coerced to ``str`` first, so non-string cells (e.g. NaN
    coming out of pandas) do not raise. The replacements act on the
    literal two-character sequence backslash-``n`` — presumably the rows
    store escaped newlines rather than real ones (TODO confirm against
    the data): hyphen + ``\\n`` is removed (de-hyphenation), any other
    ``\\n`` becomes a space. Punctuation is dropped via the Unicode
    property class ``\\p{P}``, which needs the third-party ``regex``
    module (stdlib ``re`` does not support it).
    """
    normalized = str(text).lower()
    normalized = normalized.replace("-\\n", "").replace("\\n", " ")
    return re.sub(r"\p{P}", "", normalized)


def get_csv(fname):
    """Read a headerless tab-separated file into a DataFrame.

    Quoting is disabled entirely (``QUOTE_NONE``) and malformed rows are
    skipped instead of raising, so ragged lines do not abort the load.
    """
    frame = pd.read_csv(
        fname,
        sep="\t",
        on_bad_lines='skip',
        header=None,
        quoting=QUOTE_NONE,
        encoding=ENCODING,
    )
    return frame


def check_prerequisites():
    """Ensure the NLTK 'punkt' tokenizer data is present, downloading it if not."""
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        # Data pack is missing locally — fetch it once so tokenization works.
        nltk.download('punkt')