Final model

This commit is contained in:
Marcin Czerniak 2024-04-28 00:56:04 +02:00
parent 4470830adf
commit 0734c5d906
17 changed files with 918826 additions and 918826 deletions

.gitignore
View File

@@ -1,4 +1,3 @@
 *~
 *.swp
 *.bak
@@ -6,3 +5,4 @@
 *.o
 .DS_Store
 .token
+model.pkl

View File

@@ -1,9 +1,15 @@
 Challenging America word-gap prediction
 ===================================
-Guess a word in a gap.
+This task is to predict the word in the gap between the left and the right context.
-Evaluation metric
+Evaluation
 -----------------
-LikelihoodHashed is the metric
+PerplexityHashed is the metric used to check the performance of the model. The lower the perplexity, the better the model. To run the evaluation, use the following command:
+
+```bash
+./geval --metric PerplexityHashed --test-name dev-0
+```
+
+Perplexity calculated on `dev-0` is equal to `981.69`.
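For reference, a minimal sketch of the line format that `geval` scores here, as produced by the inference script further down: each output line lists `word:probability` candidates for the gap, followed by a bare `:mass` entry holding the remaining probability mass. The words and numbers below are made up for illustration:

```python
# Hypothetical out.tsv line for one gap, following the format built by the
# inference script below (probabilities floored to two decimals, trailing
# ":0.01" reserving the leftover probability mass for all other words).
predictions = [("the", 0.45), ("a", 0.30), ("his", 0.24)]

line = " ".join(f"{word}:{prob:.2f}" for word, prob in predictions) + " :0.01"
print(line)  # the:0.45 a:0.30 his:0.24 :0.01
```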

File diff suppressed because it is too large.

geval (new binary file, contents not shown)

View File

@@ -17,7 +17,7 @@ dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', sys.
 output_dir = os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'out.tsv'))
 df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
-df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
+df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
 final = ""
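A side note on the replacement change above: because the backslashes are doubled inside the raw string, the pattern matches the literal two-character escapes `\r`, `\n` and `\t` stored inside the TSV text fields (an assumption about the dataset), not real control characters, and now collapses them to a space instead of deleting them. A small self-contained illustration with a made-up row:

```python
import pandas as pd

# Made-up sample row containing literal "\n" and "\t" escape sequences.
sample = pd.DataFrame({"LeftContext": [r"end of one line\nstart of the next\tcolumn"]})

# r'\\n+' matches a literal backslash followed by 'n', so the stored escapes
# are replaced with single spaces rather than removed outright.
cleaned = sample.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
print(cleaned.loc[0, "LeftContext"])  # end of one line start of the next column
```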
@@ -33,14 +33,14 @@ for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):
     prob_sum = sum([prob for _, prob in probs])
     for word, prob in probs:
-        new_prob = math.floor(prob / prob_sum * 1000) / 1000
+        new_prob = math.floor(prob / prob_sum * 100) / 100
         if new_prob == 1.0:
-            new_prob = 0.999
+            new_prob = 0.99
         text += f"{word}:{new_prob} "
-    text += ":0.001"
+    text += ":0.01"
     final += text + "\n"
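The same normalization, pulled out as a standalone sketch (the function name and the example scores are not from the repository): scores are rescaled to sum to one, floored to two decimal places, capped below 1.0, and the remaining mass goes to the catch-all `:0.01` entry.

```python
import math

def format_predictions(probs):
    """probs: list of (word, raw_score) pairs, mirroring the loop above."""
    prob_sum = sum(prob for _, prob in probs)
    text = ""
    for word, prob in probs:
        new_prob = math.floor(prob / prob_sum * 100) / 100  # floor to 2 decimals
        if new_prob == 1.0:                                  # never claim the full mass
            new_prob = 0.99
        text += f"{word}:{new_prob} "
    text += ":0.01"                                          # leftover probability mass
    return text

print(format_predictions([("the", 8.0), ("a", 5.0), ("his", 2.0)]))
# the:0.53 a:0.33 his:0.13 :0.01
```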

View File

@@ -3,7 +3,9 @@ from tqdm import tqdm
 import nltk
 import random
 import pickle
+from multiprocessing import Pool
 import math
+from bidict import bidict
 class Model():
@@ -12,8 +14,7 @@ class Model():
         self.UNK_token = UNK_token
         self.ngrams = defaultdict(defaultdict(int).copy)
         self.contexts = defaultdict(int)
-        self.tokenizer = { UNK_token: 0 }
-        self.reverse_tokenizer = { 0: UNK_token }
+        self.tokenizer = bidict({ UNK_token: 0 })
         self._tokenizer_index = 1
         self.vocab = set()
@@ -24,7 +25,6 @@ class Model():
             if word not in self.vocab:
                 self.vocab.add(word)
                 self.tokenizer[word] = self._tokenizer_index
-                self.reverse_tokenizer[self._tokenizer_index] = word
                 self._tokenizer_index += 1
@@ -39,6 +39,17 @@ class Model():
         return result
+    def process_gram(self, gram: tuple) -> tuple:
+        left_context = gram[:self.n_split]
+        right_context = gram[self.n_split + 1:]
+        word = gram[self.n_split]
+
+        if word == self.UNK_token:
+            return
+
+        self.ngrams[(left_context, right_context)][word] += 1
+        self.contexts[(left_context, right_context)] += 1
+
     def train(self, corpus: list) -> None:
         print("Training tokenizer")
@@ -50,15 +61,7 @@ class Model():
         print("Saving n-grams")
         n_grams = list(nltk.ngrams(corpus, self.n))
         for gram in tqdm(n_grams):
-            left_context = gram[:self.n_split]
-            right_context = gram[self.n_split + 1:]
-            word = gram[self.n_split]
-            if word == self.UNK_token:
-                continue
-            self.ngrams[(left_context, right_context)][word] += 1
-            self.contexts[(left_context, right_context)] += 1
+            self.process_gram(gram)
     def get_conditional_probability_for_word(self, left_context: list, right_context: list, word: str) -> float:
         left_context = tuple(left_context[-self.n_split:])
@@ -84,7 +87,7 @@ class Model():
             prob = self.get_conditional_probability_for_word(left_context, right_context, word)
             probs.append((word, prob))
-        return sorted(probs, reverse = True, key = lambda x: x[0])[:10]
+        return sorted(probs, reverse = True, key = lambda x: x[1])[:10]
     def fill_gap(self, left_context: list, right_context: list) -> list:
         left_context = self.tokenize(left_context)
@@ -92,9 +95,9 @@ class Model():
         result = []
         probabilities = self.get_probabilities(left_context, right_context)
-        for probability in probabilities:
-            word = self.reverse_tokenizer[probability[0]]
-            result.append((word, probability[1]))
+        for token, probability in probabilities:
+            word = self.tokenizer.inverse[token]
+            result.append((word, probability))
         return result
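A minimal sketch of the `bidict` pattern that replaces the two parallel dictionaries (toy vocabulary, not from the repository): a single mapping provides both word → id and, via `.inverse`, id → word, which is what `fill_gap` now uses.

```python
from bidict import bidict

UNK_token = "<UNK>"
tokenizer = bidict({UNK_token: 0})

# Forward inserts, as in the tokenizer-building loop above.
for index, word in enumerate(["the", "cat", "sat"], start=1):
    tokenizer[word] = index

print(tokenizer["cat"])       # 2
print(tokenizer.inverse[2])   # cat -- replaces the separate reverse_tokenizer dict
```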

View File

@@ -15,25 +15,16 @@ expected_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tr
 model = Model(n = 3)
-df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
-expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
+df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
+expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
 print('Loading training corpus...')
 corpus = []
-for j, chunk in tqdm(enumerate(zip(df, expected_df)), total=4321):
-    df, expected_df = chunk
-    df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
-    for (_, row1), (_, row2) in zip(df.iterrows(), expected_df.iterrows()):
-        word = row2['Word']
-        left_context = row1['LeftContext']
-        right_context = row1['RightContext']
-        corpus.extend(left_context.split() + [word] + right_context.split())
-    # if j > 50:
-    #     break
+for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=432):
+    df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
+    for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):
+        corpus.extend(re.split(r"\s+", left_context.strip()) + [str(word).strip()] + re.split(r"\s+", right_context.strip()))
 print('Training model...')
 model.train(corpus)
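The hunk ends at `model.train(corpus)`; given the `pickle` import in the model module and the new `model.pkl` entry in `.gitignore`, the script presumably serializes the trained model afterwards. A hedged sketch of that step and of querying the model (the file name and the example contexts are assumptions, not shown in this diff):

```python
import pickle

# Continuation of the training script above, where `model` is the trained Model.
# Assumed persistence step; "model.pkl" matches the new .gitignore entry.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Assumed usage: per model.py above, fill_gap takes left/right context word
# lists and returns up to ten (word, probability) candidates for the gap.
candidates = model.fill_gap(['he', 'sat', 'on'], ['chair', 'by', 'the'])
print(candidates)
```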

File diff suppressed because it is too large.