29 lines
538 B
Python
29 lines
538 B
Python
import nltk
|
|
import pandas as pd
|
|
import regex as re
|
|
from csv import QUOTE_NONE
|
|
|
|
# Text encoding passed to pandas when reading the tab-separated input files.
ENCODING = "utf-8"
|
|
|
|
|
|
def clean_text(text):
    """Return a lowercased, punctuation-free version of *text*.

    The argument is passed through ``str()`` first, so non-string values
    are accepted. Escaped line breaks (the two-character sequence
    backslash + "n", not a real newline character) are resolved before
    punctuation is stripped.
    """
    lowered = str(text).lower()
    # A hyphen directly followed by an escaped line break is dropped entirely,
    # presumably to rejoin a word that was hyphenated across two lines.
    rejoined = lowered.replace("-\\n", "")
    # Any remaining escaped line break becomes a single space.
    single_line = rejoined.replace("\\n", " ")
    # \p{P} is the Unicode "punctuation" property class; it relies on `re`
    # being the third-party `regex` module imported at the top of the file.
    return re.sub(r"\p{P}", "", single_line)
|
|
|
|
|
|
def get_csv(fname):
    """Load the tab-separated file *fname* into a pandas DataFrame.

    The file is read without a header row, quote characters get no special
    treatment, and lines pandas considers malformed are skipped.

    Args:
        fname: Path (or other source accepted by ``pd.read_csv``) to read.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    read_options = {
        "sep": "\t",
        # Drop malformed lines instead of raising.
        "on_bad_lines": 'skip',
        # No header row: the first line is data.
        "header": None,
        # Treat quote characters as ordinary text.
        "quoting": QUOTE_NONE,
        "encoding": ENCODING,
    }
    return pd.read_csv(fname, **read_options)
|
|
|
|
|
|
def check_prerequisites():
    """Ensure the NLTK ``punkt`` tokenizer resource is available.

    Looks the resource up via ``nltk.data.find`` and, if the lookup fails
    with ``LookupError``, fetches it with ``nltk.download``.
    """
    punkt_resource = 'tokenizers/punkt'
    punkt_package = 'punkt'
    try:
        nltk.data.find(punkt_resource)
    except LookupError:
        # Resource was not found by NLTK's lookup: download it.
        nltk.download(punkt_package)
|