Final model

This commit is contained in:
Marcin Czerniak 2024-04-28 00:56:04 +02:00
parent 4470830adf
commit 0734c5d906
17 changed files with 918826 additions and 918826 deletions

2
.gitignore vendored
View File

@ -1,4 +1,3 @@
*~
*.swp
*.bak
@ -6,3 +5,4 @@
*.o
.DS_Store
.token
model.pkl

View File

@ -1,9 +1,15 @@
Challenging America word-gap prediction
===================================
Guess a word in a gap.
This task is to predict the word that fills the gap between the left and the right context of a text.
Evaluation metric
Evaluation
-----------------
LikelihoodHashed is the metric
PerplexityHashed is the metric used to check the performance of the model. The lower the perplexity, the better the model. To run the evaluation, run the following command:
```bash
./geval --metric PerplexityHashed --test-name dev-0
```
Perplexity calculated on `dev-0` is equal to `981.69`.
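For intuition, here is a minimal sketch (not the actual `geval` implementation) of how perplexity relates to the probabilities the model assigns to the gold words; lower perplexity means the model was less surprised on average:

```python
import math

# Hypothetical probabilities the model assigned to the gold word of each gap.
gold_probs = [0.12, 0.03, 0.27, 0.01]

# Perplexity is the exponential of the average negative log-probability.
perplexity = math.exp(-sum(math.log(p) for p in gold_probs) / len(gold_probs))
print(round(perplexity, 2))  # about 17.9 for this toy list
```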

File diff suppressed because it is too large

BIN
geval Normal file

Binary file not shown.

View File

@ -17,7 +17,7 @@ dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', sys.
output_dir = os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'out.tsv'))
df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
final = ""
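As a side note on the pattern above: since the literals are raw strings, `\\n+` matches the two-character sequence backslash plus `n` rather than a real newline, which is presumably how the challenge TSV encodes line breaks. A hedged, self-contained illustration of why replacing with a space (instead of the empty string) keeps neighbouring words separated:

```python
import pandas as pd

# Toy frame mimicking a context field with escaped "\n" and "\t" sequences.
df = pd.DataFrame({'LeftContext': [r'first line\nsecond\tpart']})

# r'\\n+' matches a literal backslash followed by 'n'; replacing it with a
# space avoids gluing "line" and "second" into one token.
df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
print(df.loc[0, 'LeftContext'])  # -> 'first line second part'
```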
@ -33,14 +33,14 @@ for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):
prob_sum = sum([prob for _, prob in probs])
for word, prob in probs:
new_prob = math.floor(prob / prob_sum * 1000) / 1000
new_prob = math.floor(prob / prob_sum * 100) / 100
if new_prob == 1.0:
new_prob = 0.999
new_prob = 0.99
text += f"{word}:{new_prob} "
text += ":0.001"
text += ":0.01"
final += text + "\n"
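For reference, a minimal sketch of the output line this formatting produces (toy `(word, probability)` pairs, not real model output). Each gap becomes a line of `word:prob` pairs, truncated to two decimals, with a trailing bare `:0.01` reserving probability mass for all other words:

```python
import math

probs = [('the', 0.41), ('a', 0.22), ('his', 0.08)]  # hypothetical candidates

prob_sum = sum(prob for _, prob in probs)
text = ""
for word, prob in probs:
    # Truncate to two decimal places and cap at 0.99 so the remainder stays positive.
    new_prob = math.floor(prob / prob_sum * 100) / 100
    if new_prob == 1.0:
        new_prob = 0.99
    text += f"{word}:{new_prob} "
text += ":0.01"
print(text)  # -> 'the:0.57 a:0.3 his:0.11 :0.01'
```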

View File

@ -3,7 +3,9 @@ from tqdm import tqdm
import nltk
import random
import pickle
from multiprocessing import Pool
import math
from bidict import bidict
class Model():
@ -12,8 +14,7 @@ class Model():
self.UNK_token = UNK_token
self.ngrams = defaultdict(defaultdict(int).copy)
self.contexts = defaultdict(int)
self.tokenizer = { UNK_token: 0 }
self.reverse_tokenizer = { 0: UNK_token }
self.tokenizer = bidict({ UNK_token: 0 })
self._tokenizer_index = 1
self.vocab = set()
@ -24,7 +25,6 @@ class Model():
if word not in self.vocab:
self.vocab.add(word)
self.tokenizer[word] = self._tokenizer_index
self.reverse_tokenizer[self._tokenizer_index] = word
self._tokenizer_index += 1
@ -39,6 +39,17 @@ class Model():
return result
def process_gram(self, gram: tuple) -> tuple:
left_context = gram[:self.n_split]
right_context = gram[self.n_split + 1:]
word = gram[self.n_split]
if word == self.UNK_token:
return
self.ngrams[(left_context, right_context)][word] += 1
self.contexts[(left_context, right_context)] += 1
def train(self, corpus: list) -> None:
print("Training tokenizer")
@ -50,15 +61,7 @@ class Model():
print("Saving n-grams")
n_grams = list(nltk.ngrams(corpus, self.n))
for gram in tqdm(n_grams):
left_context = gram[:self.n_split]
right_context = gram[self.n_split + 1:]
word = gram[self.n_split]
if word == self.UNK_token:
continue
self.ngrams[(left_context, right_context)][word] += 1
self.contexts[(left_context, right_context)] += 1
self.process_gram(gram)
def get_conditional_probability_for_word(self, left_context: list, right_context: list, word: str) -> float:
left_context = tuple(left_context[-self.n_split:])
@ -84,7 +87,7 @@ class Model():
prob = self.get_conditional_probability_for_word(left_context, right_context, word)
probs.append((word, prob))
return sorted(probs, reverse = True, key = lambda x: x[0])[:10]
return sorted(probs, reverse = True, key = lambda x: x[1])[:10]
def fill_gap(self, left_context: list, right_context: list) -> list:
left_context = self.tokenize(left_context)
@ -92,9 +95,9 @@ class Model():
result = []
probabilities = self.get_probabilities(left_context, right_context)
for probability in probabilities:
word = self.reverse_tokenizer[probability[0]]
result.append((word, probability[1]))
for token, probability in probabilities:
word = self.tokenizer.inverse[token]
result.append((word, probability))
return result
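A short, hedged illustration of the bidict change used in `fill_gap`: the forward mapping goes from word to token id, and `.inverse` gives the id-to-word direction without maintaining a second dictionary by hand (the `<UNK>` token name here is only an example):

```python
from bidict import bidict

tokenizer = bidict({'<UNK>': 0, 'the': 1, 'quick': 2})

assert tokenizer['quick'] == 2          # word -> token id
assert tokenizer.inverse[2] == 'quick'  # token id -> word, kept in sync automatically

# With a trained model (see the training script), filling a gap might look like:
# model.fill_gap("He saw the".split(), "fox jump over".split())
# -> [("quick", 0.31), ("brown", 0.12), ...]   (hypothetical output)
```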

View File

@ -15,25 +15,16 @@ expected_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tr
model = Model(n = 3)
df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
print('Loading training corpus...')
corpus = []
for j, chunk in tqdm(enumerate(zip(df, expected_df)), total=4321):
df, expected_df = chunk
for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=432):
df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
for (_, row1), (_, row2) in zip(df.iterrows(), expected_df.iterrows()):
word = row2['Word']
left_context = row1['LeftContext']
right_context = row1['RightContext']
corpus.extend(left_context.split() + [word] + right_context.split())
# if j > 50:
# break
for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):
corpus.extend(re.split(r"\s+", left_context.strip()) + [str(word).strip()] + re.split(r"\s+", right_context.strip()))
print('Training model...')
model.train(corpus)
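The commit also adds `model.pkl` to `.gitignore` and imports `pickle` in the model module, so presumably the trained model is serialized after training. A minimal sketch of that step (the filename and location are assumptions):

```python
import pickle

# Persist the trained model; model.pkl is listed in .gitignore above.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# The prediction script can later restore it with:
# with open('model.pkl', 'rb') as f:
#     model = pickle.load(f)
```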

File diff suppressed because it is too large