challenging-america-word-ga.../utils.py
Norbert Litkowski b78257156a arpa
2022-04-25 00:28:09 +02:00

39 lines
826 B
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import nltk
import pandas as pd
import regex as re
from csv import QUOTE_NONE
# Text encoding handed to pandas when the data files are read (see get_csv).
ENCODING = "utf-8"
# Runs of the listed symbols, tabs, newlines and ASCII digits; clean_text
# replaces each run with a single space.
REP = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+")
# Fragments clean_text deletes outright: "'s", a hyphen followed by a literal
# backslash-n (two characters), and any Unicode punctuation character.
# The two hyphen alternatives are equivalent.  \p{P} relies on the third-party
# `regex` module (imported in this file as `re`); the stdlib `re` module does
# not support Unicode property classes.
REM = re.compile(r"'s|[\-]\\n|\-\\n|\p{P}")
def clean_text(text):
    """Normalize raw text: lowercase, expand contractions, strip punctuation.

    Steps, in order:
      1. Convert ``text`` to ``str``, lowercase it and trim surrounding
         whitespace.
      2. Map the typographic apostrophe (U+2019) to the ASCII apostrophe.
      3. Expand "won't", "'ll" and "'m" while the apostrophe is still there.
      4. Delete every ``REM`` match ("'s", a hyphen followed by a literal
         backslash-n, and any punctuation character).
      5. Replace every ``REP`` match (listed symbols, tabs, newlines,
         digits) with a single space.

    Args:
        text: Any value; it is passed through ``str()`` before cleaning.

    Returns:
        The cleaned string.
    """
    res = str(text).lower().strip()
    # Written as an escape on purpose: the raw character is flagged as an
    # "ambiguous Unicode character" and is easily lost, and an empty search
    # string would make str.replace insert "'" between every character.
    # NOTE(review): U+2019 is assumed to be the intended character - confirm.
    res = res.replace("\u2019", "'")
    # Contractions must be expanded BEFORE the REM pass: REM deletes every
    # punctuation character (apostrophes included), so running these
    # afterwards could never match anything.
    res = res.replace("won't", "will not")
    res = res.replace("'ll", " will")
    res = res.replace("'m", " am")
    # REM itself deletes "'s", so a separate "'s" -> " is" expansion after it
    # was unreachable and is intentionally not kept.
    res = REM.sub("", res)
    return REP.sub(" ", res)
def get_csv(fname):
    """Load a tab-separated file without a header row via ``pd.read_csv``.

    Quote characters are treated as ordinary text (``QUOTE_NONE``), lines
    pandas cannot parse are skipped, and the file is decoded using
    ``ENCODING``.

    Args:
        fname: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        Whatever ``pd.read_csv`` returns for these options (a DataFrame).
    """
    read_options = {
        "sep": "\t",
        "on_bad_lines": 'skip',
        "header": None,
        "quoting": QUOTE_NONE,
        "encoding": ENCODING,
    }
    return pd.read_csv(fname, **read_options)
def check_prerequisites():
    """Make sure the NLTK 'punkt' tokenizer data is present.

    Looks the resource up with ``nltk.data.find``; when that raises
    ``LookupError`` the 'punkt' package is fetched with ``nltk.download``.
    """
    resource_path = 'tokenizers/punkt'
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Resource not found locally: download it.
        nltk.download('punkt')