This commit is contained in:
ZarebaMichal 2022-04-03 18:44:40 +02:00
parent add921bdc7
commit 3d96a41f40
11 changed files with 457501 additions and 0 deletions

1
config.txt Normal file
View File

@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv

10519
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
dev-0/in.tsv.xz Normal file

Binary file not shown.

7414
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

1
in-header.tsv Normal file
View File

@ -0,0 +1 @@
FileId Year LeftContext RightContext
1 FileId Year LeftContext RightContext

1
out-header.tsv Normal file
View File

@ -0,0 +1 @@
Word
1 Word

129
run.py Normal file
View File

@ -0,0 +1,129 @@
import string
import unicodedata
from nltk.tokenize import word_tokenize
from nltk import trigrams
from collections import defaultdict, Counter
import pandas as pd
import csv
import regex as re
# Fallback output line used when no trigram-based prediction is available
# (too little context, or an unseen context). The format is space-separated
# "word:probability" pairs; the final entry with an empty word carries the
# probability mass left for all other words. The values sum to 1.0.
DEFAULT_PREDICTION = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
def preprocess_text(text):
    """Reduce *text* to lowercase ASCII letters separated by single spaces.

    The steps run in this order:

    1. Decompose Unicode (NFKD) and drop everything that is not ASCII, so
       accented letters lose their accents.
    2. Replace ``<...>`` tag-like spans with a space.
    3. Delete punctuation characters outright (no space is inserted, so
       ``"don't"`` becomes ``"dont"`` and ``"a,b"`` becomes ``"ab"``).
    4. Replace every remaining non-letter (digits, newlines, ...) with a
       space.
    5. Lowercase and collapse runs of whitespace.
    """
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    cleaned = ascii_bytes.decode('utf-8', 'ignore')

    # Tag-like markup becomes a word separator.
    cleaned = re.sub('<.*?>', ' ', cleaned)

    # Punctuation is removed, not replaced.
    punctuation_remover = str.maketrans(' ', ' ', string.punctuation)
    cleaned = cleaned.translate(punctuation_remover)

    # Letters only: digits and any other leftover character become spaces.
    cleaned = re.sub('[^a-zA-Z]', ' ', cleaned)
    # Kept from the original pipeline; newlines were already turned into
    # spaces by the substitution above.
    cleaned = re.sub("\n", " ", cleaned)

    return ' '.join(cleaned.lower().split())
def predict_probs(word1, word2):
    """Format the most likely continuations of the context (word1, word2).

    Looks the context up in the global ``model`` and keeps the six most
    probable continuations.

    Returns:
        A string of space-separated ``word:probability`` pairs followed by a
        final ``:probability`` entry (empty word) holding the mass left for
        every other word. ``DEFAULT_PREDICTION`` is returned when the
        context has no recorded continuations.
    """
    # .get() instead of model[...]: indexing a defaultdict would insert an
    # empty entry for every unseen context queried during prediction.
    candidates = Counter(model.get((word1, word2), {})).most_common(6)

    total_prob = 0.0
    for _, prob in candidates:
        total_prob += prob
    if total_prob == 0.0:
        return DEFAULT_PREDICTION

    remaining_prob = 1 - total_prob
    if remaining_prob < 0.01:
        # Always reserve at least 0.01 for unlisted words, and shrink the
        # listed probabilities to make room so the line still sums to 1.
        remaining_prob = 0.01
        scale = (1 - remaining_prob) / total_prob
        candidates = [(word, prob * scale) for word, prob in candidates]

    parts = [f'{word}:{prob}' for word, prob in candidates]
    parts.append(f':{remaining_prob}')
    return ' '.join(parts)
def prepare_output(file_path, data=None):
    """Write one prediction line per input row to *file_path*.

    Args:
        file_path: Destination of the predictions file.
        data: Optional DataFrame whose column ``7`` holds the text to
            predict from. When omitted, the global ``dev_data`` is used if
            *file_path* lies directly in a ``dev-0`` directory and the
            global ``test_data`` otherwise. Previously ``test_data`` was
            used unconditionally, so ``dev-0/out.tsv`` was generated from
            the test-A rows.
    """
    if data is None:
        from pathlib import PurePath

        in_dev_dir = PurePath(file_path).parent.name == 'dev-0'
        data = dev_data if in_dev_dir else test_data

    with open(file_path, 'w', encoding='utf-8') as file:
        for context in data[7]:
            words = word_tokenize(preprocess_text(str(context)))
            # NOTE(review): column 7 appears to be the right context (the
            # training text is built as column 6 + label + column 7), yet
            # its first two words are used as the conditioning context of
            # a forward trigram model. The last two words of column 6 look
            # like the intended context - confirm before changing.
            if len(words) < 4:
                prediction = DEFAULT_PREDICTION
            else:
                prediction = predict_probs(words[0], words[1])
            file.write(prediction + '\n')
def train_model(training_data):
    """Fill the global ``model`` with trigram continuation probabilities.

    For every row, the ``"final"`` column is preprocessed and tokenized, and
    each trigram ``(w1, w2, w3)`` adds one count for ``w3`` under the
    context ``(w1, w2)``. Afterwards the counts of every context are divided
    by that context's total, so ``model[(w1, w2)][w3]`` is the relative
    frequency of ``w3`` after ``w1 w2``.

    Args:
        training_data: DataFrame with a ``"final"`` column of training text.
    """
    for text in training_data["final"]:
        words = word_tokenize(preprocess_text(str(text)))
        for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
            # Padding positions are None; skip them so that neither a
            # context nor a predicted word can be None.
            if w1 and w2 and w3:
                # Count the word that FOLLOWS the context (w3). Counting w2
                # here made every context predict its own second word.
                model[(w1, w2)][w3] += 1

    # Turn raw counts into per-context relative frequencies.
    for continuations in model.values():
        total_count = float(sum(continuations.values()))
        for w3 in continuations:
            continuations[w3] /= total_count
# --- Training data ---------------------------------------------------------
# Only the first 200000 rows are read. `on_bad_lines` replaces the
# `error_bad_lines` / `warn_bad_lines` pair, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
# NOTE(review): if a malformed line is skipped in one of the two files but
# not the other, inputs and labels go out of step from that row on.
data = pd.read_csv(
    "train/in.tsv.xz",
    sep="\t",
    on_bad_lines="skip",
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=200000,
)
train_labels = pd.read_csv(
    "train/expected.tsv",
    sep="\t",
    # The original call skipped bad lines but kept the warning enabled.
    on_bad_lines="warn",
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=200000,
)

# Columns 6 and 7 of the input are the two context texts; column 0 comes
# from the labels file after the concat.
train_data = data[[6, 7]]
train_data = pd.concat([train_data, train_labels], axis=1)
# Rebuild the full passage as: context 6, label, context 7. The separating
# spaces matter: without them the words at both seams fuse into one token.
train_data["final"] = train_data[6] + " " + train_data[0] + " " + train_data[7]

# (w1, w2) -> {w3: count, later probability}; filled in by train_model().
model = defaultdict(lambda: defaultdict(int))

# --- Evaluation inputs -----------------------------------------------------
dev_data = pd.read_csv(
    'dev-0/in.tsv.xz',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)
test_data = pd.read_csv(
    'test-A/in.tsv.xz',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)

# --- Train and write predictions -------------------------------------------
train_model(train_data)
prepare_output("dev-0/out.tsv")
prepare_output("test-A/out.tsv")

BIN
test-A/in.tsv.xz Normal file

Binary file not shown.

7414
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

432022
train/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
train/in.tsv.xz Normal file

Binary file not shown.