Final model

Marcin Czerniak 2024-04-28 00:56:04 +02:00
parent 4470830adf
commit 0734c5d906
17 changed files with 918826 additions and 918826 deletions

.gitignore

@@ -1,8 +1,8 @@
-*~
-*.swp
-*.bak
-*.pyc
-*.o
-.DS_Store
-.token
+*~
+*.swp
+*.bak
+*.pyc
+*.o
+.DS_Store
+.token
+model.pkl


@@ -1,9 +1,15 @@
-Challenging America word-gap prediction
-===================================
-
-Guess a word in a gap.
-
-Evaluation metric
------------------
-
-LikelihoodHashed is the metric
+Challenging America word-gap prediction
+===================================
+
+The task is to predict the word in the gap between the left and right context.
+
+Evaluation
+-----------------
+
+PerplexityHashed is the metric used to check the performance of the model. The lower the perplexity, the better the model. To run the evaluation, use the following command:
+
+```bash
+./geval --metric PerplexityHashed --test-name dev-0
+```
+
+Perplexity calculated on `dev-0` equals `981.69`.
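
For intuition about the score above: a minimal sketch, assuming PerplexityHashed behaves like ordinary perplexity computed from the probability the model assigns to each gold word (geval's hashed variant buckets words, so it will not reproduce `981.69` exactly):

```python
import math

# Sketch under the assumption above: perplexity as the exponential of the mean
# negative log-probability assigned to the gold words.
def perplexity(gold_word_probs):
    return math.exp(-sum(math.log(p) for p in gold_word_probs) / len(gold_word_probs))

print(perplexity([0.2, 0.01, 0.001]))  # lower assigned probabilities -> higher perplexity
```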

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

geval (new binary file; contents not shown)

@@ -1 +1 @@
-FileId Year LeftContext RightContext
+FileId Year LeftContext RightContext


@@ -1 +1 @@
-Word
+Word


File diff suppressed because it is too large


@@ -1,48 +1,48 @@
-import sys
-import os
-import pandas as pd
-import csv
-from model import Model
-from tqdm import tqdm
-import re
-import numpy as np
-import math
-
-print("Loading model")
-
-dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
-model = Model.load(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
-
-print("Evaluating")
-
-dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', sys.argv[1], 'in.tsv.xz'))
-output_dir = os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'out.tsv'))
-
-df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
-df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
-
-final = ""
-
-for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):
-    text = ""
-    prob_sum = 0.0
-    probs = model.fill_gap(re.split(r"\s+", row['LeftContext']), re.split(r"\s+", row['RightContext']))
-
-    if len(probs) == 0:
-        text = ":1"
-    else:
-        prob_sum = sum([prob for _, prob in probs])
-        for word, prob in probs:
-            new_prob = math.floor(prob / prob_sum * 1000) / 1000
-            if new_prob == 1.0:
-                new_prob = 0.999
-            text += f"{word}:{new_prob} "
-        text += ":0.001"
-
-    final += text + "\n"
-
-with open(output_dir, 'w', encoding="UTF-8") as f:
-    f.write(final)
+import sys
+import os
+import pandas as pd
+import csv
+from model import Model
+from tqdm import tqdm
+import re
+import numpy as np
+import math
+
+print("Loading model")
+
+dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
+model = Model.load(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
+
+print("Evaluating")
+
+dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', sys.argv[1], 'in.tsv.xz'))
+output_dir = os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'out.tsv'))
+
+df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
+df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
+
+final = ""
+
+for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):
+    text = ""
+    prob_sum = 0.0
+    probs = model.fill_gap(re.split(r"\s+", row['LeftContext']), re.split(r"\s+", row['RightContext']))
+
+    if len(probs) == 0:
+        text = ":1"
+    else:
+        prob_sum = sum([prob for _, prob in probs])
+        for word, prob in probs:
+            new_prob = math.floor(prob / prob_sum * 100) / 100
+            if new_prob == 1.0:
+                new_prob = 0.99
+            text += f"{word}:{new_prob} "
+        text += ":0.01"
+
+    final += text + "\n"
+
+with open(output_dir, 'w', encoding="UTF-8") as f:
+    f.write(final)
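
As a concrete illustration (hypothetical probabilities), this is how the loop above turns the `(word, probability)` pairs from `fill_gap` into one line of `out.tsv`: probabilities are renormalised, floored to two decimal places, and the trailing `:0.01` reserves the leftover mass for all other words.

```python
import math

probs = [("the", 0.6), ("a", 0.3)]          # hypothetical fill_gap output
prob_sum = sum(prob for _, prob in probs)   # 0.9

text = ""
for word, prob in probs:
    new_prob = math.floor(prob / prob_sum * 100) / 100   # renormalise, floor to 2 decimals
    text += f"{word}:{new_prob} "
text += ":0.01"                             # leftover probability mass

print(text)  # the:0.66 a:0.33 :0.01
```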


@@ -1,108 +1,111 @@
-from collections import defaultdict, Counter
-from tqdm import tqdm
-import nltk
-import random
-import pickle
-import math
-
-
-class Model():
-    def __init__(self, UNK_token = '<UNK>', n = 3):
-        self.n = n
-        self.UNK_token = UNK_token
-        self.ngrams = defaultdict(defaultdict(int).copy)
-        self.contexts = defaultdict(int)
-        self.tokenizer = { UNK_token: 0 }
-        self.reverse_tokenizer = { 0: UNK_token }
-        self._tokenizer_index = 1
-        self.vocab = set()
-
-        self.n_split = self.n // 2
-
-    def train_tokenizer(self, corpus: list) -> list[int]:
-        for word in tqdm(corpus):
-            if word not in self.vocab:
-                self.vocab.add(word)
-                self.tokenizer[word] = self._tokenizer_index
-                self.reverse_tokenizer[self._tokenizer_index] = word
-                self._tokenizer_index += 1
-
-    def tokenize(self, corpus: list, verbose = False) -> list[int]:
-        result = []
-        for word in tqdm(corpus) if verbose else corpus:
-            if word not in self.vocab:
-                result.append(self.tokenizer[self.UNK_token])
-            else:
-                result.append(self.tokenizer[word])
-        return result
-
-    def train(self, corpus: list) -> None:
-        print("Training tokenizer")
-        self.train_tokenizer(corpus)
-
-        print("Tokenizing corpus")
-        corpus = self.tokenize(corpus, verbose = True)
-
-        print("Saving n-grams")
-        n_grams = list(nltk.ngrams(corpus, self.n))
-        for gram in tqdm(n_grams):
-            left_context = gram[:self.n_split]
-            right_context = gram[self.n_split + 1:]
-            word = gram[self.n_split]
-
-            if word == self.tokenizer[self.UNK_token]:  # skip n-grams whose target word is unknown
-                continue
-
-            self.ngrams[(left_context, right_context)][word] += 1
-            self.contexts[(left_context, right_context)] += 1
-
-    def get_conditional_probability_for_word(self, left_context: list, right_context: list, word: str) -> float:
-        left_context = tuple(left_context[-self.n_split:])
-        right_context = tuple(right_context[:self.n_split])
-
-        total_count = self.contexts[(left_context, right_context)]
-        if total_count == 0:
-            return 0.0
-        else:
-            word_count = self.ngrams[(left_context, right_context)][word]
-            return word_count / total_count
-
-    def get_probabilities(self, left_context: list, right_context: list) -> float:
-        left_context = tuple(left_context[-self.n_split:])
-        right_context = tuple(right_context[:self.n_split])
-
-        words = list(self.ngrams[(left_context, right_context)].keys())
-
-        probs = []
-        for word in words:
-            prob = self.get_conditional_probability_for_word(left_context, right_context, word)
-            probs.append((word, prob))
-
-        return sorted(probs, reverse = True, key = lambda x: x[0])[:10]
-
-    def fill_gap(self, left_context: list, right_context: list) -> list:
-        left_context = self.tokenize(left_context)
-        right_context = self.tokenize(right_context)
-
-        result = []
-        probabilities = self.get_probabilities(left_context, right_context)
-        for probability in probabilities:
-            word = self.reverse_tokenizer[probability[0]]
-            result.append((word, probability[1]))
-
-        return result
-
-    def save(self, output_dir: str) -> None:
-        with open(output_dir, 'wb') as f:
-            pickle.dump(self, f)
-
-    @staticmethod
-    def load(model_path: str) -> 'Model':
-        with open(model_path, 'rb') as f:
-            return pickle.load(f)
+from collections import defaultdict, Counter
+from tqdm import tqdm
+import nltk
+import random
+import pickle
+from multiprocessing import Pool
+import math
+from bidict import bidict
+
+
+class Model():
+    def __init__(self, UNK_token = '<UNK>', n = 3):
+        self.n = n
+        self.UNK_token = UNK_token
+        self.ngrams = defaultdict(defaultdict(int).copy)
+        self.contexts = defaultdict(int)
+        self.tokenizer = bidict({ UNK_token: 0 })
+        self._tokenizer_index = 1
+        self.vocab = set()
+
+        self.n_split = self.n // 2
+
+    def train_tokenizer(self, corpus: list) -> list[int]:
+        for word in tqdm(corpus):
+            if word not in self.vocab:
+                self.vocab.add(word)
+                self.tokenizer[word] = self._tokenizer_index
+                self._tokenizer_index += 1
+
+    def tokenize(self, corpus: list, verbose = False) -> list[int]:
+        result = []
+        for word in tqdm(corpus) if verbose else corpus:
+            if word not in self.vocab:
+                result.append(self.tokenizer[self.UNK_token])
+            else:
+                result.append(self.tokenizer[word])
+        return result
+
+    def process_gram(self, gram: tuple) -> tuple:
+        left_context = gram[:self.n_split]
+        right_context = gram[self.n_split + 1:]
+        word = gram[self.n_split]
+
+        if word == self.tokenizer[self.UNK_token]:  # skip n-grams whose target word is unknown
+            return
+
+        self.ngrams[(left_context, right_context)][word] += 1
+        self.contexts[(left_context, right_context)] += 1
+
+    def train(self, corpus: list) -> None:
+        print("Training tokenizer")
+        self.train_tokenizer(corpus)
+
+        print("Tokenizing corpus")
+        corpus = self.tokenize(corpus, verbose = True)
+
+        print("Saving n-grams")
+        n_grams = list(nltk.ngrams(corpus, self.n))
+        for gram in tqdm(n_grams):
+            self.process_gram(gram)
+
+    def get_conditional_probability_for_word(self, left_context: list, right_context: list, word: str) -> float:
+        left_context = tuple(left_context[-self.n_split:])
+        right_context = tuple(right_context[:self.n_split])
+
+        total_count = self.contexts[(left_context, right_context)]
+        if total_count == 0:
+            return 0.0
+        else:
+            word_count = self.ngrams[(left_context, right_context)][word]
+            return word_count / total_count
+
+    def get_probabilities(self, left_context: list, right_context: list) -> float:
+        left_context = tuple(left_context[-self.n_split:])
+        right_context = tuple(right_context[:self.n_split])
+
+        words = list(self.ngrams[(left_context, right_context)].keys())
+
+        probs = []
+        for word in words:
+            prob = self.get_conditional_probability_for_word(left_context, right_context, word)
+            probs.append((word, prob))
+
+        return sorted(probs, reverse = True, key = lambda x: x[1])[:10]
+
+    def fill_gap(self, left_context: list, right_context: list) -> list:
+        left_context = self.tokenize(left_context)
+        right_context = self.tokenize(right_context)
+
+        result = []
+        probabilities = self.get_probabilities(left_context, right_context)
+        for token, probability in probabilities:
+            word = self.tokenizer.inverse[token]
+            result.append((word, probability))
+
+        return result
+
+    def save(self, output_dir: str) -> None:
+        with open(output_dir, 'wb') as f:
+            pickle.dump(self, f)
+
+    @staticmethod
+    def load(model_path: str) -> 'Model':
+        with open(model_path, 'rb') as f:
+            return pickle.load(f)
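
A minimal usage sketch of the class above, on a toy corpus with hypothetical words (assumes `model.py` from this commit is on the import path):

```python
from model import Model

# Train a trigram model (n=3) and predict the gap word from one word of
# left context and one word of right context.
model = Model(n=3)
model.train("the cat sat on the mat the cat lay on the mat".split())

print(model.fill_gap(["the"], ["sat"]))  # e.g. [('cat', 1.0)]
```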


@@ -1,41 +1,32 @@
-from collections import Counter, defaultdict
-from tqdm import tqdm
-import re
-import nltk
-import random
-import os
-import sys
-import pickle
-import csv
-import pandas as pd
-from model import Model
-
-dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
-expected_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'expected.tsv'))
-
-model = Model(n = 3)
-
-df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
-expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
-
-print('Loading training corpus...')
-corpus = []
-for j, chunk in tqdm(enumerate(zip(df, expected_df)), total=4321):
-    df, expected_df = chunk
-    df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
-
-    for (_, row1), (_, row2) in zip(df.iterrows(), expected_df.iterrows()):
-        word = row2['Word']
-        left_context = row1['LeftContext']
-        right_context = row1['RightContext']
-
-        corpus.extend(left_context.split() + [word] + right_context.split())
-
-    # if j > 50:
-    #     break
-
-print('Training model...')
-model.train(corpus)
-
-print('Saving model...')
-model.save(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
+from collections import Counter, defaultdict
+from tqdm import tqdm
+import re
+import nltk
+import random
+import os
+import sys
+import pickle
+import csv
+import pandas as pd
+from model import Model
+
+dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
+expected_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'expected.tsv'))
+
+model = Model(n = 3)
+
+df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
+expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
+
+print('Loading training corpus...')
+corpus = []
+for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=432):
+    df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
+
+    for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):
+        corpus.extend(re.split(r"\s+", left_context.strip()) + [str(word).strip()] + re.split(r"\s+", right_context.strip()))
+
+print('Training model...')
+model.train(corpus)
+
+print('Saving model...')
+model.save(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
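
A tiny illustration (hypothetical contexts) of how the inner loop above flattens one training row into the token stream passed to `model.train`:

```python
import re

left_context, word, right_context = "the quick brown", "fox", "jumps over the lazy dog"

corpus = []
corpus.extend(re.split(r"\s+", left_context.strip())
              + [str(word).strip()]
              + re.split(r"\s+", right_context.strip()))

print(corpus)  # ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
```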

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large