alesad7 2022-04-03 22:23:38 +02:00
parent 5e504b2d51
commit 020c748f99

run.py

@@ -1,13 +1,30 @@
 from nltk import trigrams, word_tokenize
-from collections import defaultdict, Counter
 import pandas as pd
 import csv
 import regex as re
-def preprocess(text):
-    text = text.lower().replace('-\\n', '').replace('\\n', ' ')
-    return re.sub(r'\p{P}', '', text)
+from collections import Counter, defaultdict
+train_set = pd.read_csv(
+    'train/in.tsv.xz',
+    sep='\t',
+    on_bad_lines='skip',
+    header=None,
+    quoting=csv.QUOTE_NONE,
+    nrows=20000)
+train_labels = pd.read_csv(
+    'train/expected.tsv',
+    sep='\t',
+    on_bad_lines='skip',
+    header=None,
+    quoting=csv.QUOTE_NONE,
+    nrows=20000)
+def data_preprocessing(text):
+    return re.sub(r'\p{P}', '', text.lower().replace('-\\n', '').replace('\\n', ' '))
 def predict(before, after):
@@ -27,7 +44,7 @@ def make_prediction(file):
     data = pd.read_csv(f'{file}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
     with open(f'{file}/out.tsv', 'w', encoding='utf-8') as file_out:
         for _, row in data.iterrows():
-            before, after = word_tokenize(preprocess(str(row[6]))), word_tokenize(preprocess(str(row[7])))
+            before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7])))
            if len(before) < 3 or len(after) < 3:
                prediction = 'to:0.02 be:0.02 the:0.02 or:0.01 not:0.01 and:0.01 a:0.01 :0.9'
            else:
@@ -35,19 +52,17 @@ def make_prediction(file):
             file_out.write(prediction + '\n')
-train_data = pd.read_csv('train/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, nrows=20000)
-train_labels = pd.read_csv('train/expected.tsv', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, nrows=20000)
-train_data = train_data[[6, 7]]
-train_data = pd.concat([train_data, train_labels], axis=1)
-train_data['line'] = train_data[6] + train_data[0] + train_data[7]
+train_set = train_set[[6, 7]]
+train_set = pd.concat([train_set, train_labels], axis=1)
+train_set['line'] = train_set[6] + train_set[0] + train_set[7]
 trigram = defaultdict(lambda: defaultdict(lambda: 0))
-rows = train_data.iterrows()
-rows_len = len(train_data)
+rows = train_set.iterrows()
+rows_len = len(train_set)
 for index, (_, row) in enumerate(rows):
-    text = preprocess(str(row['line']))
+    text = data_preprocessing(str(row['line']))
     words = word_tokenize(text)
     for word_1, word_2, word_3 in trigrams(words, pad_right=True, pad_left=True):
        if word_1 and word_2 and word_3:
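
Note: the last hunk is cut off at the hunk boundary, so the body of the trigram-counting loop is not shown in this commit view. Purely as a hedged sketch (not code from this commit), one common way to finish such a counter for predicting a middle word from its neighbours is to key the counts on the two surrounding words and later normalise them into probabilities; the names count_line and middle_word_probs below are illustrative only.

from collections import defaultdict
from nltk import trigrams

trigram = defaultdict(lambda: defaultdict(int))

def count_line(words):
    # words: a tokenized, preprocessed line, e.g. word_tokenize(data_preprocessing(...))
    for word_1, word_2, word_3 in trigrams(words, pad_right=True, pad_left=True):
        if word_1 and word_2 and word_3:       # skip the None padding tokens
            trigram[(word_1, word_3)][word_2] += 1   # count middle word given its neighbours

def middle_word_probs(word_1, word_3):
    # Turn raw counts into a probability distribution over candidate middle words.
    counts = trigram.get((word_1, word_3), {})
    total = sum(counts.values())
    return {word: n / total for word, n in counts.items()} if total else {}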