s430705 plusalpha

This commit is contained in:
ZarebaMichal 2022-04-10 21:17:55 +02:00
parent 9d77a3a7ee
commit 819ce98f3d
3 changed files with 17971 additions and 17967 deletions

File diff suppressed because it is too large Load Diff

148
run.py
View File

@ -1,62 +1,24 @@
import string
import unicodedata
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from nltk import trigrams from nltk import trigrams
from collections import defaultdict, Counter from collections import defaultdict, Counter
import pandas as pd import pandas as pd
import csv import csv
import regex as re
class GapPredictor:
    """Trigram-context model that proposes words for a gap in a text."""

    def __init__(self, alpha):
        """Create an untrained predictor.

        Args:
            alpha: Additive-smoothing constant that ``train_model`` adds to
                each observed count before normalising.
        """
        # Context word pair -> {candidate word: count, later probability}.
        # Nested defaultdicts let training increment without key checks.
        self.model = defaultdict(lambda: defaultdict(int))
        self.alpha = alpha
        # Tokens collected by ``train_model``; its size feeds the smoothing
        # denominator.
        self.vocab = set()
        # Returned by ``predict_probs`` when it has nothing better to offer.
        self.DEFAULT_PREDICTION = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
@staticmethod
def preprocess_text(text): def preprocess_text(text):
text = text.lower().replace("-\\n", "").replace("\\n", " ") text = text.lower().replace("-\\n", "").replace("\\n", " ")
return text return text
@staticmethod
def predict_probs(word1, word2): def _prepare_train_data():
raw_prediction = dict(model[word1, word2])
prediction = dict(Counter(raw_prediction).most_common(6))
total_prob = 0.0
str_prediction = ''
for word, prob in prediction.items():
total_prob += prob
str_prediction += f'{word}:{prob} '
if total_prob == 0.0:
return DEFAULT_PREDICTION
remaining_prob = 1 - total_prob
if remaining_prob < 0.01:
remaining_prob = 0.01
str_prediction += f':{remaining_prob}'
return str_prediction
def train_model(training_data):
    """Fill the module-level ``model`` with trigram-context probabilities.

    For every trigram (w1, w2, w3) whose three slots are all truthy, w1 is
    counted under the context (w2, w3) and w3 under the context (w1, w2).
    Each context's counts are then divided by that context's total, turning
    them into relative frequencies in place.

    Args:
        training_data: DataFrame with a ``"final"`` column of texts.
    """
    # Iterating the column directly gives the same values as
    # ``iterrows()`` without building a Series per row.
    for raw_text in training_data["final"]:
        words = word_tokenize(preprocess_text(str(raw_text)))
        for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
            # Skips trigrams containing a padding slot (or an empty token).
            if w1 and w2 and w3:
                model[(w2, w3)][w1] += 1
                model[(w1, w2)][w3] += 1

    for candidates in model.values():
        total = float(sum(candidates.values()))
        for word in candidates:
            candidates[word] /= total
data = pd.read_csv( data = pd.read_csv(
"train/in.tsv.xz", "train/in.tsv.xz",
sep="\t", sep="\t",
@ -64,7 +26,7 @@ data = pd.read_csv(
warn_bad_lines=False, warn_bad_lines=False,
header=None, header=None,
quoting=csv.QUOTE_NONE, quoting=csv.QUOTE_NONE,
nrows=100000, nrows=90000,
) )
train_labels = pd.read_csv( train_labels = pd.read_csv(
@ -73,39 +35,81 @@ train_labels = pd.read_csv(
error_bad_lines=False, error_bad_lines=False,
header=None, header=None,
quoting=csv.QUOTE_NONE, quoting=csv.QUOTE_NONE,
nrows=100000, nrows=90000,
) )
train_data = data[[6, 7]] train_data = data[[6, 7]]
train_data = pd.concat([train_data, train_labels], axis=1) train_data = pd.concat([train_data, train_labels], axis=1)
train_data["final"] = train_data[6] + train_data[0] + train_data[7] train_data["final"] = train_data[6] + train_data[0] + train_data[7]
model = defaultdict(lambda: defaultdict(lambda: 0)) return train_data
def train_model(self):
    """Count trigram contexts in the training data and smooth the counts.

    For every trigram (w1, w2, w3) whose three slots are all truthy, w1 is
    counted under the context (w2, w3) and w3 under the context (w1, w2);
    the three tokens are also added to ``self.vocab``. Afterwards each
    stored count is replaced in place by
    ``(count + alpha) / (context_total + alpha * len(vocab))``.
    """
    training_data = self._prepare_train_data()
    # Iterating the column directly gives the same values as
    # ``iterrows()`` without building a Series per row.
    for raw_text in training_data["final"]:
        words = word_tokenize(self.preprocess_text(str(raw_text)))
        for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
            # Skips trigrams containing a padding slot (or an empty token).
            if w1 and w2 and w3:
                self.model[(w2, w3)][w1] += 1
                self.model[(w1, w2)][w3] += 1
                self.vocab.update((w1, w2, w3))

    # Identical for every context, so compute it once instead of per word.
    smoothing_mass = self.alpha * len(self.vocab)
    for candidates in self.model.values():
        # Total must be taken before the counts are overwritten below.
        context_total = float(sum(candidates.values()))
        for word in candidates:
            candidates[word] = (candidates[word] + self.alpha) / (
                context_total + smoothing_mass
            )
def predict_probs(self, words):
    """Format the most probable candidates for a tokenised context.

    Args:
        words: Sequence of tokens; the first two form the lookup key.

    Returns:
        Up to six ``word:probability`` pairs separated by spaces, followed
        by ``:p`` where ``p`` is the leftover probability mass (never
        reported below 0.01). ``self.DEFAULT_PREDICTION`` is returned
        instead when fewer than three tokens are given or the context has
        no probability mass.
    """
    if len(words) < 3:
        return self.DEFAULT_PREDICTION

    # Use .get() rather than indexing: ``self.model`` is a defaultdict, so
    # indexing would insert an empty entry for every unseen context and
    # grow the trained model during prediction.
    candidates = self.model.get((words[0], words[1]), {})
    top_candidates = Counter(candidates).most_common(6)

    total_prob = sum((prob for _, prob in top_candidates), 0.0)
    if total_prob == 0.0:
        return self.DEFAULT_PREDICTION

    # Always leave some mass for "any other word", even if the listed
    # candidates already add up to (almost) one.
    remaining_prob = max(1 - total_prob, 0.01)
    parts = [f"{word}:{prob}" for word, prob in top_candidates]
    parts.append(f":{remaining_prob}")
    return " ".join(parts)
def prepare_output(self, input_file, output_file):
    """Write one prediction line to *output_file* per row of *input_file*.

    Args:
        input_file: Path of a tab-separated file without a header row;
            column 7 of each row is the text that gets tokenised and
            passed to ``predict_probs``.
        output_file: Path of the file to create or overwrite.
    """
    # Read the input before opening the output, so a failed read does not
    # leave behind a freshly truncated, empty output file.
    # NOTE(review): error_bad_lines / warn_bad_lines were removed in
    # pandas 2.0 (replaced by on_bad_lines="skip"); kept to match the
    # other read_csv calls in this file - confirm the pinned version.
    data = pd.read_csv(
        input_file,
        sep="\t",
        error_bad_lines=False,
        warn_bad_lines=False,
        header=None,
        quoting=csv.QUOTE_NONE,
    )
    # Explicit encoding: predicted tokens may be non-ASCII, and the locale
    # default is not guaranteed to be able to encode them.
    with open(output_file, "w", encoding="utf-8") as file:
        for raw_text in data[7]:
            words = word_tokenize(self.preprocess_text(str(raw_text)))
            file.write(self.predict_probs(words) + "\n")
# Write one prediction line per row of ``test_data``: column 7 is tokenised
# and its first two tokens are used as the lookup context. Rows with fewer
# than three tokens get the default prediction.
with open("test-A/out.tsv", "w") as file:
    for _, row in test_data.iterrows():
        words = word_tokenize(preprocess_text(str(row[7])))
        if len(words) < 3:
            prediction = DEFAULT_PREDICTION
        else:
            prediction = predict_probs(words[0], words[1])
        file.write(prediction + "\n")
# Train once, then write predictions for each evaluation split in turn.
predictor = GapPredictor(alpha=0.00002)
predictor.train_model()
for source_path, target_path in (
    ("dev-0/in.tsv.xz", "dev-0/out.tsv"),
    ("test-A/in.tsv.xz", "test-A/out.tsv"),
):
    predictor.prepare_output(source_path, target_path)

File diff suppressed because it is too large Load Diff