This commit is contained in:
Bartosz Karwacki 2022-04-03 19:50:05 +02:00
parent a2064b7ed9
commit 2be2a96fe1
3 changed files with 16539 additions and 16546 deletions

File diff suppressed because it is too large Load Diff

23
run.py
View File

@ -12,7 +12,6 @@ data = pd.read_csv(
error_bad_lines=False, error_bad_lines=False,
header=None, header=None,
quoting=csv.QUOTE_NONE, quoting=csv.QUOTE_NONE,
nrows=250000
) )
train_labels = pd.read_csv( train_labels = pd.read_csv(
"train/expected.tsv", "train/expected.tsv",
@ -20,7 +19,6 @@ train_labels = pd.read_csv(
error_bad_lines=False, error_bad_lines=False,
header=None, header=None,
quoting=csv.QUOTE_NONE, quoting=csv.QUOTE_NONE,
nrows=250000
) )
train_data = data[[6, 7]] train_data = data[[6, 7]]
@ -33,22 +31,17 @@ model = defaultdict(lambda: defaultdict(lambda: 0))
def clean(text): def clean(text):
text = str(text) text = str(text)
# normalize text
text = ( text = (
unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode( unicodedata.normalize("NFKD", text)
'utf-8', 'ignore')) .encode("ascii", "ignore")
# replace html chars with ' ' .decode("utf-8", "ignore")
text = re.sub('<.*?>', ' ', text) )
# remove punctuation text = re.sub("<.*?>", " ", text)
text = text.translate(str.maketrans(' ', ' ', string.punctuation)) text = text.translate(str.maketrans(" ", " ", string.punctuation))
# only alphabets and numerics text = re.sub("[^a-zA-Z]", " ", text)
text = re.sub('[^a-zA-Z]', ' ', text)
# replace newline with space
text = re.sub("\n", " ", text) text = re.sub("\n", " ", text)
# lower case
text = text.lower() text = text.lower()
# split and join the words text = " ".join(text.split())
text = ' '.join(text.split())
return text return text

File diff suppressed because it is too large Load Diff