Final model

Marcin Czerniak 2024-04-28 00:56:04 +02:00
parent 4470830adf
commit 0734c5d906
17 changed files with 918826 additions and 918826 deletions

.gitignore

@@ -1,8 +1,8 @@
-*~
-*.swp
-*.bak
-*.pyc
-*.o
-.DS_Store
-.token
+*~
+*.swp
+*.bak
+*.pyc
+*.o
+.DS_Store
+.token
+model.pkl


@@ -1,9 +1,15 @@
-Challenging America word-gap prediction
-===================================
-
-Guess a word in a gap.
-
-Evaluation metric
------------------
-
-LikelihoodHashed is the metric
+Challenging America word-gap prediction
+===================================
+
+The task is to predict the word in the gap between the left and right context.
+
+Evaluation
+-----------------
+
+PerplexityHashed is the metric used to check the performance of the model. The lower the perplexity, the better the model. To run the evaluation, use the following command:
+
+```bash
+./geval --metric PerplexityHashed --test-name dev-0
+```
+
+Perplexity calculated on `dev-0` equals `981.69`.
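
For intuition about the score above: a minimal sketch, assuming PerplexityHashed behaves like ordinary perplexity computed from the probability the model assigns to each gold word (geval's hashed variant buckets words, so it will not reproduce `981.69` exactly):

```python
import math

# Sketch under the assumption above: perplexity as the exponential of the mean
# negative log-probability assigned to the gold words.
def perplexity(gold_word_probs):
    return math.exp(-sum(math.log(p) for p in gold_word_probs) / len(gold_word_probs))

print(perplexity([0.2, 0.01, 0.001]))  # lower assigned probabilities -> higher perplexity
```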

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

geval (new binary file; contents not shown)

@@ -1 +1 @@
-FileId Year LeftContext RightContext
+FileId Year LeftContext RightContext


@@ -1 +1 @@
-Word
+Word


File diff suppressed because it is too large


@@ -1,48 +1,48 @@
-import sys
-import os
-import pandas as pd
-import csv
-from model import Model
-from tqdm import tqdm
-import re
-import numpy as np
-import math
-
-print("Loading model")
-
-dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
-model = Model.load(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
-
-print("Evaluating")
-
-dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', sys.argv[1], 'in.tsv.xz'))
-output_dir = os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'out.tsv'))
-
-df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
-df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
-
-final = ""
-
-for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):
-    text = ""
-    prob_sum = 0.0
-    probs = model.fill_gap(re.split(r"\s+", row['LeftContext']), re.split(r"\s+", row['RightContext']))
-
-    if len(probs) == 0:
-        text = ":1"
-    else:
-        prob_sum = sum([prob for _, prob in probs])
-        for word, prob in probs:
-            new_prob = math.floor(prob / prob_sum * 1000) / 1000
-            if new_prob == 1.0:
-                new_prob = 0.999
-            text += f"{word}:{new_prob} "
-        text += ":0.001"
-
-    final += text + "\n"
-
-with open(output_dir, 'w', encoding="UTF-8") as f:
-    f.write(final)
+import sys
+import os
+import pandas as pd
+import csv
+from model import Model
+from tqdm import tqdm
+import re
+import numpy as np
+import math
+
+print("Loading model")
+
+dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
+model = Model.load(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
+
+print("Evaluating")
+
+dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', sys.argv[1], 'in.tsv.xz'))
+output_dir = os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'out.tsv'))
+
+df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
+df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
+
+final = ""
+
+for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):
+    text = ""
+    prob_sum = 0.0
+    probs = model.fill_gap(re.split(r"\s+", row['LeftContext']), re.split(r"\s+", row['RightContext']))
+
+    if len(probs) == 0:
+        text = ":1"
+    else:
+        prob_sum = sum([prob for _, prob in probs])
+        for word, prob in probs:
+            new_prob = math.floor(prob / prob_sum * 100) / 100
+            if new_prob == 1.0:
+                new_prob = 0.99
+            text += f"{word}:{new_prob} "
+        text += ":0.01"
+
+    final += text + "\n"
+
+with open(output_dir, 'w', encoding="UTF-8") as f:
+    f.write(final)
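
As a concrete illustration (hypothetical probabilities), this is how the loop above turns the `(word, probability)` pairs from `fill_gap` into one line of `out.tsv`: probabilities are renormalised, floored to two decimal places, and the trailing `:0.01` reserves the leftover mass for all other words.

```python
import math

probs = [("the", 0.6), ("a", 0.3)]          # hypothetical fill_gap output
prob_sum = sum(prob for _, prob in probs)   # 0.9

text = ""
for word, prob in probs:
    new_prob = math.floor(prob / prob_sum * 100) / 100   # renormalise, floor to 2 decimals
    text += f"{word}:{new_prob} "
text += ":0.01"                             # leftover probability mass

print(text)  # the:0.66 a:0.33 :0.01
```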


@@ -1,108 +1,111 @@
-from collections import defaultdict, Counter
-from tqdm import tqdm
-import nltk
-import random
-import pickle
-import math
-
-
-class Model():
-    def __init__(self, UNK_token = '<UNK>', n = 3):
-        self.n = n
-        self.UNK_token = UNK_token
-        self.ngrams = defaultdict(defaultdict(int).copy)
-        self.contexts = defaultdict(int)
-        self.tokenizer = { UNK_token: 0 }
-        self.reverse_tokenizer = { 0: UNK_token }
-        self._tokenizer_index = 1
-        self.vocab = set()
-
-        self.n_split = self.n // 2
-
-    def train_tokenizer(self, corpus: list) -> list[int]:
-        for word in tqdm(corpus):
-            if word not in self.vocab:
-                self.vocab.add(word)
-                self.tokenizer[word] = self._tokenizer_index
-                self.reverse_tokenizer[self._tokenizer_index] = word
-                self._tokenizer_index += 1
-
-    def tokenize(self, corpus: list, verbose = False) -> list[int]:
-        result = []
-        for word in tqdm(corpus) if verbose else corpus:
-            if word not in self.vocab:
-                result.append(self.tokenizer[self.UNK_token])
-            else:
-                result.append(self.tokenizer[word])
-        return result
-
-    def train(self, corpus: list) -> None:
-        print("Training tokenizer")
-        self.train_tokenizer(corpus)
-
-        print("Tokenizing corpus")
-        corpus = self.tokenize(corpus, verbose = True)
-
-        print("Saving n-grams")
-        n_grams = list(nltk.ngrams(corpus, self.n))
-        for gram in tqdm(n_grams):
-            left_context = gram[:self.n_split]
-            right_context = gram[self.n_split + 1:]
-            word = gram[self.n_split]
-
-            if word == self.tokenizer[self.UNK_token]:  # skip n-grams whose target word is unknown
-                continue
-
-            self.ngrams[(left_context, right_context)][word] += 1
-            self.contexts[(left_context, right_context)] += 1
-
-    def get_conditional_probability_for_word(self, left_context: list, right_context: list, word: str) -> float:
-        left_context = tuple(left_context[-self.n_split:])
-        right_context = tuple(right_context[:self.n_split])
-
-        total_count = self.contexts[(left_context, right_context)]
-        if total_count == 0:
-            return 0.0
-        else:
-            word_count = self.ngrams[(left_context, right_context)][word]
-            return word_count / total_count
-
-    def get_probabilities(self, left_context: list, right_context: list) -> float:
-        left_context = tuple(left_context[-self.n_split:])
-        right_context = tuple(right_context[:self.n_split])
-
-        words = list(self.ngrams[(left_context, right_context)].keys())
-
-        probs = []
-        for word in words:
-            prob = self.get_conditional_probability_for_word(left_context, right_context, word)
-            probs.append((word, prob))
-
-        return sorted(probs, reverse = True, key = lambda x: x[0])[:10]
-
-    def fill_gap(self, left_context: list, right_context: list) -> list:
-        left_context = self.tokenize(left_context)
-        right_context = self.tokenize(right_context)
-
-        result = []
-        probabilities = self.get_probabilities(left_context, right_context)
-        for probability in probabilities:
-            word = self.reverse_tokenizer[probability[0]]
-            result.append((word, probability[1]))
-
-        return result
-
-    def save(self, output_dir: str) -> None:
-        with open(output_dir, 'wb') as f:
-            pickle.dump(self, f)
-
-    @staticmethod
-    def load(model_path: str) -> 'Model':
-        with open(model_path, 'rb') as f:
-            return pickle.load(f)
+from collections import defaultdict, Counter
+from tqdm import tqdm
+import nltk
+import random
+import pickle
+from multiprocessing import Pool
+import math
+from bidict import bidict
+
+
+class Model():
+    def __init__(self, UNK_token = '<UNK>', n = 3):
+        self.n = n
+        self.UNK_token = UNK_token
+        self.ngrams = defaultdict(defaultdict(int).copy)
+        self.contexts = defaultdict(int)
+        self.tokenizer = bidict({ UNK_token: 0 })
+        self._tokenizer_index = 1
+        self.vocab = set()
+
+        self.n_split = self.n // 2
+
+    def train_tokenizer(self, corpus: list) -> list[int]:
+        for word in tqdm(corpus):
+            if word not in self.vocab:
+                self.vocab.add(word)
+                self.tokenizer[word] = self._tokenizer_index
+                self._tokenizer_index += 1
+
+    def tokenize(self, corpus: list, verbose = False) -> list[int]:
+        result = []
+        for word in tqdm(corpus) if verbose else corpus:
+            if word not in self.vocab:
+                result.append(self.tokenizer[self.UNK_token])
+            else:
+                result.append(self.tokenizer[word])
+        return result
+
+    def process_gram(self, gram: tuple) -> tuple:
+        left_context = gram[:self.n_split]
+        right_context = gram[self.n_split + 1:]
+        word = gram[self.n_split]
+
+        if word == self.tokenizer[self.UNK_token]:  # skip n-grams whose target word is unknown
+            return
+
+        self.ngrams[(left_context, right_context)][word] += 1
+        self.contexts[(left_context, right_context)] += 1
+
+    def train(self, corpus: list) -> None:
+        print("Training tokenizer")
+        self.train_tokenizer(corpus)
+
+        print("Tokenizing corpus")
+        corpus = self.tokenize(corpus, verbose = True)
+
+        print("Saving n-grams")
+        n_grams = list(nltk.ngrams(corpus, self.n))
+        for gram in tqdm(n_grams):
+            self.process_gram(gram)
+
+    def get_conditional_probability_for_word(self, left_context: list, right_context: list, word: str) -> float:
+        left_context = tuple(left_context[-self.n_split:])
+        right_context = tuple(right_context[:self.n_split])
+
+        total_count = self.contexts[(left_context, right_context)]
+        if total_count == 0:
+            return 0.0
+        else:
+            word_count = self.ngrams[(left_context, right_context)][word]
+            return word_count / total_count
+
+    def get_probabilities(self, left_context: list, right_context: list) -> float:
+        left_context = tuple(left_context[-self.n_split:])
+        right_context = tuple(right_context[:self.n_split])
+
+        words = list(self.ngrams[(left_context, right_context)].keys())
+
+        probs = []
+        for word in words:
+            prob = self.get_conditional_probability_for_word(left_context, right_context, word)
+            probs.append((word, prob))
+
+        return sorted(probs, reverse = True, key = lambda x: x[1])[:10]
+
+    def fill_gap(self, left_context: list, right_context: list) -> list:
+        left_context = self.tokenize(left_context)
+        right_context = self.tokenize(right_context)
+
+        result = []
+        probabilities = self.get_probabilities(left_context, right_context)
+        for token, probability in probabilities:
+            word = self.tokenizer.inverse[token]
+            result.append((word, probability))
+
+        return result
+
+    def save(self, output_dir: str) -> None:
+        with open(output_dir, 'wb') as f:
+            pickle.dump(self, f)
+
+    @staticmethod
+    def load(model_path: str) -> 'Model':
+        with open(model_path, 'rb') as f:
+            return pickle.load(f)
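
A minimal usage sketch of the class above, on a toy corpus with hypothetical words (assumes `model.py` from this commit is on the import path):

```python
from model import Model

# Train a trigram model (n=3) and predict the gap word from one word of
# left context and one word of right context.
model = Model(n=3)
model.train("the cat sat on the mat the cat lay on the mat".split())

print(model.fill_gap(["the"], ["sat"]))  # e.g. [('cat', 1.0)]
```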


@@ -1,41 +1,32 @@
-from collections import Counter, defaultdict
-from tqdm import tqdm
-import re
-import nltk
-import random
-import os
-import sys
-import pickle
-import csv
-import pandas as pd
-from model import Model
-
-dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
-expected_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'expected.tsv'))
-
-model = Model(n = 3)
-
-df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
-expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
-
-print('Loading training corpus...')
-corpus = []
-for j, chunk in tqdm(enumerate(zip(df, expected_df)), total=4321):
-    df, expected_df = chunk
-    df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
-
-    for (_, row1), (_, row2) in zip(df.iterrows(), expected_df.iterrows()):
-        word = row2['Word']
-        left_context = row1['LeftContext']
-        right_context = row1['RightContext']
-
-        corpus.extend(left_context.split() + [word] + right_context.split())
-
-    # if j > 50:
-    #     break
-
-print('Training model...')
-model.train(corpus)
-
-print('Saving model...')
-model.save(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
+from collections import Counter, defaultdict
+from tqdm import tqdm
+import re
+import nltk
+import random
+import os
+import sys
+import pickle
+import csv
+import pandas as pd
+from model import Model
+
+dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
+expected_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'expected.tsv'))
+
+model = Model(n = 3)
+
+df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
+expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
+
+print('Loading training corpus...')
+corpus = []
+for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=432):
+    df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
+
+    for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):
+        corpus.extend(re.split(r"\s+", left_context.strip()) + [str(word).strip()] + re.split(r"\s+", right_context.strip()))
+
+print('Training model...')
+model.train(corpus)
+
+print('Saving model...')
+model.save(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
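
A tiny illustration (hypothetical contexts) of how the inner loop above flattens one training row into the token stream passed to `model.train`:

```python
import re

left_context, word, right_context = "the quick brown", "fox", "jumps over the lazy dog"

corpus = []
corpus.extend(re.split(r"\s+", left_context.strip())
              + [str(word).strip()]
              + re.split(r"\s+", right_context.strip()))

print(corpus)  # ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
```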

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large