s430705

2022-04-03 18:44:40 +02:00 · 2022-04-03 18:44:40 +02:00 · 3d96a41f40
commit 3d96a41f40
parent add921bdc7
11 changed files with 457501 additions and 0 deletions
--- a/config.txt
+++ b/config.txt
@@ -0,0 +1 @@
+--metric PerplexityHashed --precision 2  --in-header in-header.tsv  --out-header out-header.tsv
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv.xz
+++ b/dev-0/in.tsv.xz
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/in-header.tsv
+++ b/in-header.tsv
@@ -0,0 +1 @@
+FileId	Year	LeftContext	RightContext
--- a/out-header.tsv
+++ b/out-header.tsv
@@ -0,0 +1 @@
+Word
--- a/run.py
+++ b/run.py
@@ -0,0 +1,129 @@
+import string
+import unicodedata
+
+from nltk.tokenize import word_tokenize
+from nltk import trigrams
+from collections import defaultdict, Counter
+import pandas as pd
+import csv
+import regex as re
+
+
+# Fallback output line, used when a context is too short or the model has
+# nothing for it. Same layout as the lines built by predict_probs():
+# space-separated `word:probability` pairs, where the trailing nameless `:0.1`
+# entry is the probability left over for every word not listed. The seven
+# values add up to 1.0.
+DEFAULT_PREDICTION = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
+
+
+def preprocess_text(text):
+    """Reduce raw text to lowercase ASCII letters separated by single spaces.
+
+    Accented characters are folded to their ASCII base letter, anything
+    between ``<`` and ``>`` is blanked out, punctuation is deleted outright
+    (``don't`` becomes ``dont``), and every other non-letter -- digits and all
+    whitespace, newlines included -- acts as a word separator.
+    """
+    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
+    cleaned = ascii_bytes.decode('utf-8', 'ignore')
+
+    # Tags must go before punctuation: '<' and '>' are themselves punctuation
+    # and would be gone by the time the tag pattern ran.
+    cleaned = re.sub('<.*?>', ' ', cleaned)
+
+    punctuation_table = str.maketrans(' ', ' ', string.punctuation)
+    cleaned = cleaned.translate(punctuation_table)
+
+    # Turns digits, newlines and any other leftover symbol into spaces, so no
+    # separate newline pass is needed afterwards.
+    cleaned = re.sub('[^a-zA-Z]', ' ', cleaned)
+
+    # split()/join() collapses the runs of spaces produced above.
+    return ' '.join(cleaned.lower().split())
+
+
+def predict_probs(word1, word2):
+    """Format the likeliest continuations of the context ``(word1, word2)``.
+
+    Looks the context up in the module-level ``model`` and returns one output
+    line: up to six ``word:probability`` pairs, most probable first, followed
+    by a nameless ``:probability`` entry for the mass not covered by those
+    words. That last entry is floored at 0.01, so the values on a line can
+    add up to slightly more than 1. Returns ``DEFAULT_PREDICTION`` when the
+    model has no (or only zero-probability) continuations for the context.
+    """
+    # Use .get() rather than indexing: `model` is a defaultdict, and indexing
+    # it with an unseen context would insert an empty entry on every miss,
+    # growing the model while predicting.
+    continuations = model.get((word1, word2))
+    if not continuations:
+        return DEFAULT_PREDICTION
+
+    top_words = Counter(continuations).most_common(6)
+
+    total_prob = 0.0
+    parts = []
+    for word, prob in top_words:
+        total_prob += prob
+        parts.append(f'{word}:{prob}')
+
+    if total_prob == 0.0:
+        return DEFAULT_PREDICTION
+
+    # Always leave some probability for words outside the top six.
+    remaining_prob = max(1 - total_prob, 0.01)
+    parts.append(f':{remaining_prob}')
+
+    return ' '.join(parts)
+
+
+def prepare_output(file_path, data=None):
+    """Write one prediction line per row of ``data`` to ``file_path``.
+
+    ``data`` is a DataFrame whose column 6 holds the text preceding the gap
+    (the training code joins column 6, the expected word and column 7 in that
+    order). When omitted, the module-level ``test_data`` is used, which is
+    what the function has always read.
+
+    The model stores P(next word | two previous words), so the prediction is
+    conditioned on the last two words before the gap. Rows with fewer than
+    two usable words get ``DEFAULT_PREDICTION``.
+    """
+    if data is None:
+        data = test_data
+    with open(file_path, 'w') as file:
+        for left_context in data[6]:
+            words = word_tokenize(preprocess_text(str(left_context)))
+            if len(words) < 2:
+                prediction = DEFAULT_PREDICTION
+            else:
+                prediction = predict_probs(words[-2], words[-1])
+            file.write(prediction + '\n')
+
+
+def train_model(training_data):
+    """Fill the module-level ``model`` with trigram continuation probabilities.
+
+    ``training_data`` must have a ``"final"`` column of text. Each text is
+    cleaned with ``preprocess_text`` and tokenized; for every three
+    consecutive words ``w1 w2 w3`` the count of ``w3`` after the context
+    ``(w1, w2)`` is incremented. The counts of each context are then divided
+    by their total, so ``model[(w1, w2)][w3]`` ends up as the relative
+    frequency of ``w3`` after ``w1 w2``.
+
+    Counts are added to whatever ``model`` already holds and everything is
+    normalized again, so this is meant to be called once on an empty model.
+    """
+    for text in training_data["final"]:
+        words = word_tokenize(preprocess_text(str(text)))
+        for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
+            # Padding yields None at the edges; keep only complete trigrams
+            # so None never becomes a context or a predicted word.
+            if w1 and w2 and w3:
+                model[(w1, w2)][w3] += 1
+
+    for continuations in model.values():
+        total_count = float(sum(continuations.values()))
+        for w3 in continuations:
+            continuations[w3] /= total_count
+
+
+# Training inputs: only the first 200000 rows are read. Malformed rows are
+# dropped via `on_bad_lines` (the older error_bad_lines / warn_bad_lines
+# switches no longer exist in pandas 2.x): silently for the inputs, with a
+# warning for the labels.
+data = pd.read_csv(
+    "train/in.tsv.xz",
+    sep="\t",
+    on_bad_lines="skip",
+    header=None,
+    quoting=csv.QUOTE_NONE,
+    nrows=200000,
+)
+train_labels = pd.read_csv(
+    "train/expected.tsv",
+    sep="\t",
+    on_bad_lines="warn",
+    header=None,
+    quoting=csv.QUOTE_NONE,
+    nrows=200000,
+)
+
+# Columns 6 and 7 are the text before and after the gap; column 0 (from the
+# labels file) is the word that fills it.
+train_data = data[[6, 7]]
+train_data = pd.concat([train_data, train_labels], axis=1)
+# Join with spaces so the gap word does not fuse with its neighbours
+# ("...quick" + "brown" + "fox..." must not become "quickbrownfox").
+train_data["final"] = (
+    train_data[6] + " " + train_data[0] + " " + train_data[7]
+)
+
+model = defaultdict(lambda: defaultdict(lambda: 0))
+
+
+dev_data = pd.read_csv(
+    'dev-0/in.tsv.xz',
+    sep='\t',
+    on_bad_lines='skip',
+    header=None,
+    quoting=csv.QUOTE_NONE,
+)
+test_a_data = pd.read_csv(
+    'test-A/in.tsv.xz',
+    sep='\t',
+    on_bad_lines='skip',
+    header=None,
+    quoting=csv.QUOTE_NONE,
+)
+
+
+train_model(train_data)
+
+# prepare_output() takes its rows from the module-level name `test_data`, so
+# point that name at each split before writing that split's predictions.
+# Without this, dev-0/out.tsv would be filled with test-A predictions.
+for out_path, split_data in (
+    ("dev-0/out.tsv", dev_data),
+    ("test-A/out.tsv", test_a_data),
+):
+    test_data = split_data
+    prepare_output(out_path)
--- a/test-A/in.tsv.xz
+++ b/test-A/in.tsv.xz
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/expected.tsv
+++ b/train/expected.tsv
--- a/train/in.tsv.xz
+++ b/train/in.tsv.xz
				`@ -0,0 +1 @@`
				`--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv`