Delete 'gonito.yaml'

Update 'README.md'
add old solution with better out scores to new branch
2023-05-25 12:54:05 +02:00 · 2023-05-25 12:48:26 +02:00 · 2023-05-25 12:47:11 +02:00 · 2023-05-24 20:28:44 +02:00 · 2023-05-24 19:57:51 +02:00 · 2023-05-24 19:57:24 +02:00
9 changed files with 18018 additions and 35886 deletions
--- a/README.md
+++ b/README.md
@ -1,9 +1 @@
-Challenging America word-gap prediction
-===================================
-
-Guess a word in a gap.
-
-Evaluation metric
-----------------
-
-LikelihoodHashed is the metric
+# Przeniesiono wcześniej wykonane rozwiązania na nową gałąź + poprawiono wyniki 25.05.23.
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/in.tsv.xz
+++ b/dev-0/in.tsv.xz
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/generate_out.py
+++ b/generate_out.py
@ -1,11 +0,0 @@
-import sys
-
-file = sys.argv[1]
-
-with open(file, encoding='utf-8') as f1, open('out.tsv', 'w', encoding='utf-8') as f2:
-    for line in f1:
-        line = line.split('\t')
-        if line[-1][0].isupper():
-            f2.write('the:0.9 :0.1\n')
-        else:
-            f2.write('the:0.4 a:0.4 :0.2\n')
--- a/simple_bigram.py
+++ b/simple_bigram.py
@ -0,0 +1,84 @@
+from collections import Counter
+import lzma
+import os
+
+class BigramModel:
+    def __init__(self):
+        self.vocab = None
+        self.unigram_counts = None
+        self.bigram_counts = None
+
+    def train(self, filename, vocab_size=5000):
+        def get_vocab(filename, vocab_size):
+            print('Generating vocab')
+            file_vocab = Counter()
+            with lzma.open(filename, 'r') as f:
+                for line in f:
+                    line = ' '.join(line.decode('utf-8').strip().split('\t')[-2:]).replace(r'\n', ' ').split()
+                    line_vocab = Counter(line)
+                    file_vocab.update(line_vocab)
+            if len(file_vocab) > vocab_size:
+                file_vocab = [tup[0] for tup in file_vocab.most_common(vocab_size)]
+            else:
+                file_vocab = file_vocab.keys()
+            return file_vocab
+
+        def get_gram_counts(filename):
+            print('Generating unigram and bigram counts')
+            file_unigram_counts = Counter()
+            file_bigram_counts = Counter()
+            with lzma.open(filename, 'r') as f:
+                for line in f:
+                    line = line.decode('utf-8').strip().replace(r'\n', ' ').split('\t')[-2:]
+                    line_unigram_counts = Counter(' '.join(line).split())
+                    file_unigram_counts.update(line_unigram_counts)
+                    line_left, line_right = line[0].split(), line[1].split()
+                    line_bigram_counts_left = Counter(
+                        [tuple(line_left[i: i + 2]) for i in range(len(line_left) - 2 + 1)])
+                    line_bigram_counts_right = Counter(
+                        [tuple(line_right[i: i + 2]) for i in range(len(line_right) - 2 + 1)])
+                    file_bigram_counts.update(line_bigram_counts_left)
+                    file_bigram_counts.update(line_bigram_counts_right)
+            return file_unigram_counts, file_bigram_counts
+
+        self.vocab = get_vocab(filename, vocab_size)
+        self.unigram_counts, self.bigram_counts = get_gram_counts(filename)
+
+    def get_bigram_prob(self, bigram, smoothing):
+        if smoothing:
+            return (self.bigram_counts.get(bigram, 0) + 1) / (
+                        self.unigram_counts.get(bigram[0], 0) + len(self.vocab) + 1)
+        else:
+            return self.bigram_counts.get(bigram, 0) / self.unigram_counts.get(bigram[0], 1)
+
+    def predict_gaps(self, path, smoothing=True, topk=5):
+        print('Making predictions')
+        with lzma.open(path + '/in.tsv.xz', 'r') as f, open(path + '/out.tsv', 'w', encoding='utf-8') as out:
+            for line in f:
+                line = line.decode('utf-8').replace(r'\n', ' ').split('\t')[-2:]
+                left_context, right_context = line[0].strip().split()[-1], line[1].strip().split()[0]
+                context_probs = dict()
+                for word in self.vocab:
+                    left_context_prob = self.get_bigram_prob((left_context, word), smoothing)
+                    right_context_prob = self.get_bigram_prob((word, right_context), smoothing)
+                    context_probs[word] = left_context_prob * right_context_prob
+                if len(set(context_probs.values())) == 1:
+                    out.write('the:0.2 be:0.2 of:0.2\n')
+                else:
+                    top_context_probs = sorted(context_probs.items(), key=lambda x: x[1], reverse=True)[:topk]
+                    topk_prob_sum = sum([prob for word, prob in top_context_probs])
+                    top_context_probs = [(word, (prob / topk_prob_sum)) for word, prob in top_context_probs]
+                    probs_string = '\t'.join([f'{word}:{prob}' for word, prob in top_context_probs[-2:] if prob > 0]) # Sadly simply removing last two entries gives way better results...
+                    out.write(probs_string + '\n')
+
+
+if __name__ == '__main__':
+    for vocab_size in [5000]:
+        model = BigramModel()
+        model.train('challenging-america-word-gap-prediction/train/in.tsv.xz', vocab_size=vocab_size)
+        for topk in [5]:
+            model.predict_gaps('challenging-america-word-gap-prediction/dev-0', smoothing=False, topk=topk)
+            os.chdir('challenging-america-word-gap-prediction/')
+            print(f'topk:{topk} vocab:{vocab_size}')
+            print(os.system('./geval --test-name dev-0'))
+            os.chdir('../')
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/in.tsv.xz
+++ b/test-A/in.tsv.xz
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
Author	SHA1	Message	Date
Kacper Dudzic	50c2e16540	Delete 'gonito.yaml'	2023-05-25 12:54:05 +02:00
Kacper Dudzic	66c96a290f	Update 'README.md'	2023-05-25 12:48:26 +02:00
Kacper	71785832be	add old solution with better out scores to new branch	2023-05-25 12:47:11 +02:00
Kacper	380ef29e71	xz	2023-05-24 20:28:44 +02:00
Kacper Dudzic	4c12c0ab0c	Update 'gonito.yaml'	2023-05-24 19:57:51 +02:00
Kacper Dudzic	0c68c7fb35	Delete 'lab5.py'	2023-05-24 19:57:24 +02:00
Kacper Dudzic	8edfd77c57	actually make it big	2023-05-24 18:33:49 +02:00
Kacper Dudzic	dbe4dd56ac	make it big	2023-05-24 18:33:16 +02:00
Kacper Dudzic	3898744e06	make it bold	2023-05-24 18:33:01 +02:00
Kacper	42b14b840c	info in readme	2023-05-24 18:32:21 +02:00
Kacper	035ee66c44	inference and results fixes	2023-05-24 18:30:46 +02:00
Kacper	aaccbbeb06	solution	2023-04-27 22:57:39 +02:00
Kacper Dudzic	39c1f3a341	Update 'README.md'	2023-04-15 03:01:41 +02:00
Kacper Dudzic	bb121718aa	Delete 'bigram_model.py'	2023-04-15 02:50:00 +02:00
Kacper Dudzic	9332c1957b	Delete 'generate_out.py'	2023-04-15 02:49:49 +02:00
Kacper	d877969ac2	lab5	2023-04-15 02:48:34 +02:00
Kacper	2a4ab01f29	zad2	2023-04-04 22:08:35 +02:00