challenging-america-word-ga.../utils.py

39 lines
826 B
Python
Raw Normal View History

2022-04-04 15:07:07 +02:00
import nltk
import pandas as pd
import regex as re
from csv import QUOTE_NONE
# Text encoding handed to pandas when the data files are read (see get_csv).
ENCODING = "utf-8"
2022-04-05 19:08:22 +02:00
# Runs of one or more of: braces, square brackets, parentheses, & % ^ $ * # @,
# tab, newline and ASCII digits. clean_text replaces each run with one space.
REP = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+")
# Text that clean_text deletes outright: "'s", a hyphen followed by the literal
# two-character sequence backslash + "n" (this alternative is listed twice; the
# second copy is redundant), and any Unicode punctuation character. The \p{P}
# class is why `re` is the third-party `regex` module rather than stdlib `re`.
REM = re.compile(r"'s|[\-]\\n|\-\\n|\p{P}")
2022-04-04 15:07:07 +02:00
def clean_text(text):
    """Normalise a raw text fragment into lower-case, punctuation-free text.

    Steps, in order:
      1. Coerce to ``str``, lower-case, and strip surrounding whitespace.
      2. Map the typographic apostrophe (U+2019) onto the ASCII apostrophe.
      3. Expand the contractions "won't", "'ll" and "'m".
      4. Delete everything matched by ``REM`` ("'s", hyphen + literal "\\n",
         and all Unicode punctuation).
      5. Replace every run matched by ``REP`` (brackets, symbols, tabs,
         newlines, digits) with a single space.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    res = str(text).lower().strip()

    # The previous code called replace("", "'"), which inserts an apostrophe
    # between every character (and, combined with REM's "'s" alternative,
    # deleted every letter "s").
    # NOTE(review): presumably the intended search string was a typographic
    # apostrophe lost in an encoding round-trip -- confirm against the data.
    res = res.replace("\u2019", "'")

    # Contractions must be expanded *before* REM runs: REM strips every
    # apostrophe, so doing this afterwards (as before) could never match.
    res = res.replace("won't", "will not")
    res = res.replace("'ll", " will")
    res = res.replace("'m", " am")

    # REM already removes "'s", so the former "'s" -> " is" replacement was
    # unreachable and has been dropped; "'s" is still stripped as before.
    res = REM.sub("", res)
    return REP.sub(" ", res)
2022-04-04 15:07:07 +02:00
def get_csv(fname):
    """Load a headerless, tab-separated file into a pandas DataFrame.

    Quoting is disabled (``QUOTE_NONE``), so quote characters are treated as
    ordinary data, and malformed lines are skipped rather than raising.

    Args:
        fname: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    read_options = {
        "sep": "\t",
        "on_bad_lines": 'skip',
        "header": None,
        "quoting": QUOTE_NONE,
        "encoding": ENCODING,
    }
    return pd.read_csv(fname, **read_options)
def check_prerequisites():
    """Make sure the NLTK ``punkt`` tokenizer data is available.

    Looks the resource up with ``nltk.data.find`` and, if that raises
    ``LookupError``, fetches it with ``nltk.download``.
    """
    punkt_resource = 'tokenizers/punkt'
    try:
        nltk.data.find(punkt_resource)
    except LookupError:
        # Resource is not installed locally; download it.
        nltk.download('punkt')