Final model
This commit is contained in:
parent
4470830adf
commit
0734c5d906
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,4 +1,3 @@
|
||||
|
||||
*~
|
||||
*.swp
|
||||
*.bak
|
||||
@ -6,3 +5,4 @@
|
||||
*.o
|
||||
.DS_Store
|
||||
.token
|
||||
model.pkl
|
||||
|
12
README.md
12
README.md
@ -1,9 +1,15 @@
|
||||
Challenging America word-gap prediction
|
||||
===================================
|
||||
|
||||
Guess a word in a gap.
|
||||
The task is to predict the word that fills the gap between a left and a right context.
|
||||
|
||||
Evaluation metric
|
||||
Evaluation
|
||||
-----------------
|
||||
|
||||
LikelihoodHashed is the metric
|
||||
PerplexityHashed is the metric used to check the performance of the model. The lower the perplexity, the better the model. To run the evaluation, use the following command:
|
||||
|
||||
```bash
|
||||
./geval --metric PerplexityHashed --test-name dev-0
|
||||
```
|
||||
|
||||
The perplexity calculated on `dev-0` is equal to `981.69`.
|
||||
|
9936
dev-0/out.tsv
9936
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
@ -17,7 +17,7 @@ dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', sys.
|
||||
output_dir = os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'out.tsv'))
|
||||
|
||||
df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
|
||||
df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
|
||||
df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
|
||||
|
||||
final = ""
|
||||
|
||||
@ -33,14 +33,14 @@ for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):
|
||||
prob_sum = sum([prob for _, prob in probs])
|
||||
|
||||
for word, prob in probs:
|
||||
new_prob = math.floor(prob / prob_sum * 1000) / 1000
|
||||
new_prob = math.floor(prob / prob_sum * 100) / 100
|
||||
|
||||
if new_prob == 1.0:
|
||||
new_prob = 0.999
|
||||
new_prob = 0.99
|
||||
|
||||
text += f"{word}:{new_prob} "
|
||||
|
||||
text += ":0.001"
|
||||
text += ":0.01"
|
||||
|
||||
final += text + "\n"
|
||||
|
||||
|
35
src/model.py
35
src/model.py
@ -3,7 +3,9 @@ from tqdm import tqdm
|
||||
import nltk
|
||||
import random
|
||||
import pickle
|
||||
from multiprocessing import Pool
|
||||
import math
|
||||
from bidict import bidict
|
||||
|
||||
class Model():
|
||||
|
||||
@ -12,8 +14,7 @@ class Model():
|
||||
self.UNK_token = UNK_token
|
||||
self.ngrams = defaultdict(defaultdict(int).copy)
|
||||
self.contexts = defaultdict(int)
|
||||
self.tokenizer = { UNK_token: 0 }
|
||||
self.reverse_tokenizer = { 0: UNK_token }
|
||||
self.tokenizer = bidict({ UNK_token: 0 })
|
||||
self._tokenizer_index = 1
|
||||
self.vocab = set()
|
||||
|
||||
@ -24,7 +25,6 @@ class Model():
|
||||
if word not in self.vocab:
|
||||
self.vocab.add(word)
|
||||
self.tokenizer[word] = self._tokenizer_index
|
||||
self.reverse_tokenizer[self._tokenizer_index] = word
|
||||
|
||||
self._tokenizer_index += 1
|
||||
|
||||
@ -39,6 +39,17 @@ class Model():
|
||||
|
||||
return result
|
||||
|
||||
def process_gram(self, gram: tuple) -> None:
    """Record one n-gram as an observation of its gap word in context.

    The token at index ``self.n_split`` is treated as the gap word. The
    tokens before it form the left context and the tokens after it form
    the right context. N-grams whose gap word equals ``self.UNK_token``
    are skipped, so the unknown token is never counted as a candidate.

    Args:
        gram: Tuple of tokens; must be indexable at ``self.n_split``.
    """
    word = gram[self.n_split]
    if word == self.UNK_token:
        return

    # Both counters are keyed by the same (left, right) pair, so build the
    # key once and reuse it.
    context = (gram[:self.n_split], gram[self.n_split + 1:])
    self.ngrams[context][word] += 1
    self.contexts[context] += 1
|
||||
|
||||
def train(self, corpus: list) -> None:
|
||||
|
||||
print("Training tokenizer")
|
||||
@ -50,15 +61,7 @@ class Model():
|
||||
print("Saving n-grams")
|
||||
n_grams = list(nltk.ngrams(corpus, self.n))
|
||||
for gram in tqdm(n_grams):
|
||||
left_context = gram[:self.n_split]
|
||||
right_context = gram[self.n_split + 1:]
|
||||
word = gram[self.n_split]
|
||||
|
||||
if word == self.UNK_token:
|
||||
continue
|
||||
|
||||
self.ngrams[(left_context, right_context)][word] += 1
|
||||
self.contexts[(left_context, right_context)] += 1
|
||||
self.process_gram(gram)
|
||||
|
||||
def get_conditional_probability_for_word(self, left_context: list, right_context: list, word: str) -> float:
|
||||
left_context = tuple(left_context[-self.n_split:])
|
||||
@ -84,7 +87,7 @@ class Model():
|
||||
prob = self.get_conditional_probability_for_word(left_context, right_context, word)
|
||||
probs.append((word, prob))
|
||||
|
||||
return sorted(probs, reverse = True, key = lambda x: x[0])[:10]
|
||||
return sorted(probs, reverse = True, key = lambda x: x[1])[:10]
|
||||
|
||||
def fill_gap(self, left_context: list, right_context: list) -> list:
|
||||
left_context = self.tokenize(left_context)
|
||||
@ -92,9 +95,9 @@ class Model():
|
||||
|
||||
result = []
|
||||
probabilities = self.get_probabilities(left_context, right_context)
|
||||
for probability in probabilities:
|
||||
word = self.reverse_tokenizer[probability[0]]
|
||||
result.append((word, probability[1]))
|
||||
for token, probability in probabilities:
|
||||
word = self.tokenizer.inverse[token]
|
||||
result.append((word, probability))
|
||||
|
||||
return result
|
||||
|
||||
|
21
src/train.py
21
src/train.py
@ -15,25 +15,16 @@ expected_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tr
|
||||
|
||||
model = Model(n = 3)
|
||||
|
||||
df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
|
||||
expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
|
||||
df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
|
||||
expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
|
||||
|
||||
print('Loading training corpus...')
|
||||
corpus = []
|
||||
for j, chunk in tqdm(enumerate(zip(df, expected_df)), total=4321):
|
||||
df, expected_df = chunk
|
||||
for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=432):
|
||||
df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
|
||||
|
||||
df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
|
||||
|
||||
for (_, row1), (_, row2) in zip(df.iterrows(), expected_df.iterrows()):
|
||||
word = row2['Word']
|
||||
left_context = row1['LeftContext']
|
||||
right_context = row1['RightContext']
|
||||
|
||||
corpus.extend(left_context.split() + [word] + right_context.split())
|
||||
|
||||
# if j > 50:
|
||||
# break
|
||||
# Rebuild each training sentence as: left context + gap word + right context.
# The right-hand tokens must come from the 'RightContext' column; reading
# 'LeftContext' twice would duplicate the left side and drop the real right
# context from the corpus.
for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):
    corpus.extend(re.split(r"\s+", left_context.strip()) + [str(word).strip()] + re.split(r"\s+", right_context.strip()))
|
||||
|
||||
print('Training model...')
|
||||
model.train(corpus)
|
||||
|
14828
test-A/out.tsv
14828
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user