Final model

Marcin Czerniak 2024-04-28 00:56:04 +02:00
parent 4470830adf
commit 0734c5d906
17 changed files with 918826 additions and 918826 deletions

.gitignore

@@ -1,8 +1,8 @@
 *~
 *.swp
 *.bak
 *.pyc
 *.o
 .DS_Store
 .token
+model.pkl

View File

@@ -1,9 +1,15 @@
 Challenging America word-gap prediction
 ===================================
-Guess a word in a gap.
+The task is to predict the word in the gap between two text fragments (a left and a right context).
 
-Evaluation metric
+Evaluation
 -----------------
-LikelihoodHashed is the metric
+PerplexityHashed is the metric used to evaluate the model. The lower the perplexity, the better the model. To run the evaluation, run the following command:
+
+```bash
+./geval --metric PerplexityHashed --test-name dev-0
+```
+
+The perplexity calculated on `dev-0` is equal to `981.69`.
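For reference, each line of the submitted `out.tsv` lists candidate words for the gap together with their probabilities, followed by a trailing `:p` entry that reserves the remaining probability mass for all other words, for example `the:0.75 a:0.25 :0.01` (the numbers here are made up for illustration; the exact format is what the prediction script later in this commit writes out).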

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

BIN
geval

Binary file not shown.

View File

@@ -1 +1 @@
 FileId Year LeftContext RightContext

View File

@@ -1 +1 @@
 Word

File diff suppressed because it is too large

View File

@@ -1,48 +1,48 @@
 import sys
 import os
 import pandas as pd
 import csv
 from model import Model
 from tqdm import tqdm
 import re
 import numpy as np
 import math
 
 print("Loading model")
 dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
 model = Model.load(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
 
 print("Evaluating")
 dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', sys.argv[1], 'in.tsv.xz'))
 output_dir = os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'out.tsv'))
 
 df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
-df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
+df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
 
 final = ""
 
 for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):
     text = ""
     prob_sum = 0.0
     probs = model.fill_gap(re.split(r"\s+", row['LeftContext']), re.split(r"\s+", row['RightContext']))
 
     if len(probs) == 0:
         text = ":1"
     else:
         prob_sum = sum([prob for _, prob in probs])
         for word, prob in probs:
-            new_prob = math.floor(prob / prob_sum * 1000) / 1000
+            new_prob = math.floor(prob / prob_sum * 100) / 100
             if new_prob == 1.0:
-                new_prob = 0.999
+                new_prob = 0.99
             text += f"{word}:{new_prob} "
-        text += ":0.001"
+        text += ":0.01"
 
     final += text + "\n"
 
 with open(output_dir, 'w', encoding="UTF-8") as f:
     f.write(final)
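To make the renormalization in this script concrete, here is a small standalone sketch (with made-up probabilities, not taken from the model): the pairs returned by `fill_gap` are renormalized to sum to 1, floored to two decimal places, capped below 1.0, and the final `:0.01` entry reserves mass for every other word.

```python
import math

# Made-up (word, probability) pairs standing in for model.fill_gap(...) output.
probs = [('the', 0.6), ('a', 0.2)]

prob_sum = sum(prob for _, prob in probs)                 # 0.8
text = ""
for word, prob in probs:
    new_prob = math.floor(prob / prob_sum * 100) / 100    # floor to two decimals
    if new_prob == 1.0:
        new_prob = 0.99                                   # never claim certainty
    text += f"{word}:{new_prob} "
text += ":0.01"                                           # leftover mass for unseen words

print(text)  # the:0.75 a:0.25 :0.01
```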

View File

@@ -1,108 +1,111 @@
 from collections import defaultdict, Counter
 from tqdm import tqdm
 import nltk
 import random
 import pickle
+from multiprocessing import Pool
 import math
+from bidict import bidict
 
 
 class Model():
 
     def __init__(self, UNK_token = '<UNK>', n = 3):
         self.n = n
         self.UNK_token = UNK_token
         self.ngrams = defaultdict(defaultdict(int).copy)
         self.contexts = defaultdict(int)
-        self.tokenizer = { UNK_token: 0 }
-        self.reverse_tokenizer = { 0: UNK_token }
+        self.tokenizer = bidict({ UNK_token: 0 })
         self._tokenizer_index = 1
         self.vocab = set()
 
         self.n_split = self.n // 2
 
     def train_tokenizer(self, corpus: list) -> list[int]:
         for word in tqdm(corpus):
             if word not in self.vocab:
                 self.vocab.add(word)
                 self.tokenizer[word] = self._tokenizer_index
-                self.reverse_tokenizer[self._tokenizer_index] = word
                 self._tokenizer_index += 1
 
     def tokenize(self, corpus: list, verbose = False) -> list[int]:
         result = []
         for word in tqdm(corpus) if verbose else corpus:
             if word not in self.vocab:
                 result.append(self.tokenizer[self.UNK_token])
             else:
                 result.append(self.tokenizer[word])
         return result
 
+    def process_gram(self, gram: tuple) -> tuple:
+        left_context = gram[:self.n_split]
+        right_context = gram[self.n_split + 1:]
+        word = gram[self.n_split]
+
+        if word == self.UNK_token:
+            return
+
+        self.ngrams[(left_context, right_context)][word] += 1
+        self.contexts[(left_context, right_context)] += 1
+
     def train(self, corpus: list) -> None:
 
         print("Training tokenizer")
         self.train_tokenizer(corpus)
 
         print("Tokenizing corpus")
         corpus = self.tokenize(corpus, verbose = True)
 
         print("Saving n-grams")
         n_grams = list(nltk.ngrams(corpus, self.n))
         for gram in tqdm(n_grams):
-            left_context = gram[:self.n_split]
-            right_context = gram[self.n_split + 1:]
-            word = gram[self.n_split]
-
-            if word == self.UNK_token:
-                continue
-
-            self.ngrams[(left_context, right_context)][word] += 1
-            self.contexts[(left_context, right_context)] += 1
+            self.process_gram(gram)
 
     def get_conditional_probability_for_word(self, left_context: list, right_context: list, word: str) -> float:
         left_context = tuple(left_context[-self.n_split:])
         right_context = tuple(right_context[:self.n_split])
 
         total_count = self.contexts[(left_context, right_context)]
 
         if total_count == 0:
             return 0.0
         else:
             word_count = self.ngrams[(left_context, right_context)][word]
 
             return word_count / total_count
 
     def get_probabilities(self, left_context: list, right_context: list) -> float:
         left_context = tuple(left_context[-self.n_split:])
         right_context = tuple(right_context[:self.n_split])
 
         words = list(self.ngrams[(left_context, right_context)].keys())
         probs = []
 
         for word in words:
             prob = self.get_conditional_probability_for_word(left_context, right_context, word)
             probs.append((word, prob))
 
-        return sorted(probs, reverse = True, key = lambda x: x[0])[:10]
+        return sorted(probs, reverse = True, key = lambda x: x[1])[:10]
 
     def fill_gap(self, left_context: list, right_context: list) -> list:
         left_context = self.tokenize(left_context)
         right_context = self.tokenize(right_context)
 
         result = []
         probabilities = self.get_probabilities(left_context, right_context)
-        for probability in probabilities:
-            word = self.reverse_tokenizer[probability[0]]
-            result.append((word, probability[1]))
+        for token, probability in probabilities:
+            word = self.tokenizer.inverse[token]
+            result.append((word, probability))
 
         return result
 
     def save(self, output_dir: str) -> None:
         with open(output_dir, 'wb') as f:
             pickle.dump(self, f)
 
     @staticmethod
     def load(model_path: str) -> 'Model':
         with open(model_path, 'rb') as f:
             return pickle.load(f)
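A minimal usage sketch of the `Model` class above (not part of the commit; the toy corpus and the expected output are illustrative). With `n = 3` the model keeps `n_split = 1` word of left context and one word of right context for each gap:

```python
from model import Model

# Toy corpus: whitespace-tokenized words, as produced by the training script.
corpus = "the cat sat on the mat . the dog sat on the rug .".split()

model = Model(n = 3)   # trigrams: one context word on each side of the gap
model.train(corpus)

# Candidates for the gap in "... sat _ the ...": expected to be [('on', 1.0)]
print(model.fill_gap(["sat"], ["the"]))

model.save("model.pkl")
restored = Model.load("model.pkl")
```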

View File

@@ -1,41 +1,32 @@
 from collections import Counter, defaultdict
 from tqdm import tqdm
 import re
 import nltk
 import random
 import os
 import sys
 import pickle
 import csv
 import pandas as pd
 from model import Model
 
 dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
 expected_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'expected.tsv'))
 
 model = Model(n = 3)
 
-df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
-expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
+df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
+expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
 
 print('Loading training corpus...')
 corpus = []
-for j, chunk in tqdm(enumerate(zip(df, expected_df)), total=4321):
-    df, expected_df = chunk
-    df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
-
-    for (_, row1), (_, row2) in zip(df.iterrows(), expected_df.iterrows()):
-        word = row2['Word']
-        left_context = row1['LeftContext']
-        right_context = row1['RightContext']
-        corpus.extend(left_context.split() + [word] + right_context.split())
-
-    # if j > 50:
-    #     break
+for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=432):
+    df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
+    for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):
+        corpus.extend(re.split(r"\s+", left_context.strip()) + [str(word).strip()] + re.split(r"\s+", right_context.strip()))
 
 print('Training model...')
 model.train(corpus)
 
 print('Saving model...')
 model.save(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
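For reference, each row of `in.tsv.xz` carries the `FileId`, `Year`, `LeftContext` and `RightContext` columns shown in the header file above, and the matching row of `expected.tsv` carries the gap `Word`; the training loop simply splices them into one flat token list, so a hypothetical pair with left context `the cat sat`, word `on` and right context `the mat` contributes the tokens `the cat sat on the mat` to `corpus`.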

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large