s430705 plusalpha

ZarebaMichal 2022-04-10 21:17:55 +02:00
parent 9d77a3a7ee
commit 819ce98f3d
3 changed files with 17971 additions and 17967 deletions

File diff suppressed because it is too large

run.py

@@ -1,111 +1,115 @@
import string
import unicodedata
from nltk.tokenize import word_tokenize
from nltk import trigrams
from collections import defaultdict, Counter
import pandas as pd
import csv
import regex as re


class GapPredictor:
    def __init__(self, alpha):
        # Trigram counts keyed by a context word pair; after training these
        # are normalized into smoothed probabilities.
        self.model = defaultdict(lambda: defaultdict(lambda: 0))
        self.alpha = alpha
        self.vocab = set()
        self.DEFAULT_PREDICTION = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

    @staticmethod
    def preprocess_text(text):
        text = text.lower().replace("-\\n", "").replace("\\n", " ")
        return text

    @staticmethod
    def _prepare_train_data():
        data = pd.read_csv(
            "train/in.tsv.xz",
            sep="\t",
            error_bad_lines=False,
            warn_bad_lines=False,
            header=None,
            quoting=csv.QUOTE_NONE,
            nrows=90000,
        )
        train_labels = pd.read_csv(
            "train/expected.tsv",
            sep="\t",
            error_bad_lines=False,
            header=None,
            quoting=csv.QUOTE_NONE,
            nrows=90000,
        )
        # Column 6 is the left context and column 7 the right context;
        # the expected gap word (column 0 of the labels) is spliced in between.
        train_data = data[[6, 7]]
        train_data = pd.concat([train_data, train_labels], axis=1)
        train_data["final"] = train_data[6] + train_data[0] + train_data[7]
        return train_data

    def train_model(self):
        training_data = self._prepare_train_data()
        for index, row in training_data.iterrows():
            text = self.preprocess_text(str(row["final"]))
            words = word_tokenize(text)
            for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
                if w1 and w2 and w3:
                    # Count both directions: the first word given the
                    # following pair, and the third given the preceding pair.
                    self.model[(w2, w3)][w1] += 1
                    self.model[(w1, w2)][w3] += 1
                    self.vocab.add(w1)
                    self.vocab.add(w2)
                    self.vocab.add(w3)
        # Add-alpha smoothing: shift every count by alpha and normalize by
        # the context total plus alpha * |vocab|.
        for word_pair in self.model:
            num_n_grams = float(sum(self.model[word_pair].values()))
            for word in self.model[word_pair]:
                self.model[word_pair][word] = (
                    self.model[word_pair][word] + self.alpha
                ) / (num_n_grams + self.alpha * len(self.vocab))

    def predict_probs(self, words):
        if len(words) < 3:
            return self.DEFAULT_PREDICTION

        word1, word2 = words[0], words[1]
        raw_prediction = dict(self.model[word1, word2])
        prediction = dict(Counter(raw_prediction).most_common(6))

        total_prob = 0.0
        str_prediction = ""
        for word, prob in prediction.items():
            total_prob += prob
            str_prediction += f"{word}:{prob} "

        if total_prob == 0.0:
            return self.DEFAULT_PREDICTION

        # Reserve the leftover probability mass for unseen words.
        remaining_prob = 1 - total_prob
        if remaining_prob < 0.01:
            remaining_prob = 0.01
        str_prediction += f":{remaining_prob}"

        return str_prediction

    def prepare_output(self, input_file, output_file):
        with open(output_file, "w") as file:
            data = pd.read_csv(
                input_file,
                sep="\t",
                error_bad_lines=False,
                warn_bad_lines=False,
                header=None,
                quoting=csv.QUOTE_NONE,
            )
            for _, row in data.iterrows():
                text = self.preprocess_text(str(row[7]))
                words = word_tokenize(text)
                prediction = self.predict_probs(words)
                file.write(prediction + "\n")


predictor = GapPredictor(alpha=0.00002)
predictor.train_model()
predictor.prepare_output("dev-0/in.tsv.xz", "dev-0/out.tsv")
predictor.prepare_output("test-A/in.tsv.xz", "test-A/out.tsv")

File diff suppressed because it is too large
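Per the script above, the two suppressed diffs are the regenerated prediction files dev-0/out.tsv and test-A/out.tsv. Each output line is a space-separated list of word:probability pairs followed by a bare :remainder entry holding the leftover probability mass; an illustrative (not actual) line:

the:0.31 of:0.12 and:0.08 to:0.05 a:0.04 in:0.03 :0.37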