Final model

parent 4470830adf
commit 0734c5d906

.gitignore (vendored), 16 lines changed
@@ -1,8 +1,8 @@
-
 *~
 *.swp
 *.bak
 *.pyc
 *.o
 .DS_Store
 .token
+model.pkl

README.md, 24 lines changed
@@ -1,9 +1,15 @@
 Challenging America word-gap prediction
 ===================================
 
-Guess a word in a gap.
+This task is to predict the word in the gap between two sentences.
 
-Evaluation metric
+Evaluation
 -----------------
 
-LikelihoodHashed is the metric
+PerplexityHashed is the metric used to check the performance of the model. The lower the perplexity, the better the model. To run the evaluation, run the following command:
+
+```bash
+./geval --metric PerplexityHashed --test-name dev-0
+```
+
+Perplexity calculated on `dev-0` is equal to `981.69`.
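
For orientation, PerplexityHashed is, roughly, the exponential of the mean negative log-probability that the submitted distributions assign to the true gap words; geval's hashed variant additionally buckets the vocabulary, which is not reproduced here. A rough, self-contained sketch with invented numbers:

```python
import math

# Rough, unhashed sketch: perplexity as the exponential of the mean negative
# log-probability assigned to the true gap words. geval's PerplexityHashed
# additionally hashes words into buckets; that detail is not reproduced here.
# All numbers below are invented.
predicted = [
    {"the": 0.4, "a": 0.25},   # distribution submitted for gap 1
    {"of": 0.6},               # gap 2
    {},                        # gap 3: no candidates at all
]
true_words = ["the", "of", "hundred"]

eps = 1e-9  # tiny floor for words that received no probability mass
log_loss = -sum(math.log(d.get(w, eps)) for d, w in zip(predicted, true_words)) / len(true_words)
print(math.exp(log_loss))  # lower is better
```
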

dev-0/expected.tsv, 21038 lines changed (file diff suppressed because it is too large)

dev-0/out.tsv, 21038 lines changed (file diff suppressed because it is too large)

@@ -1 +1 @@
-FileId Year LeftContext RightContext
+FileId Year LeftContext RightContext

@@ -1 +1 @@
-Word
+Word

File diff suppressed because it is too large.
@@ -1,48 +1,48 @@
 import sys
 import os
 import pandas as pd
 import csv
 from model import Model
 from tqdm import tqdm
 import re
 import numpy as np
 import math
 
 print("Loading model")
 dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
 model = Model.load(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
 
 print("Evaluating")
 dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', sys.argv[1], 'in.tsv.xz'))
 output_dir = os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'out.tsv'))
 
 df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
-df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
+df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
 
 final = ""
 
 for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):
     text = ""
     prob_sum = 0.0
 
     probs = model.fill_gap(re.split(r"\s+", row['LeftContext']), re.split(r"\s+", row['RightContext']))
 
     if len(probs) == 0:
         text = ":1"
     else:
         prob_sum = sum([prob for _, prob in probs])
 
         for word, prob in probs:
-            new_prob = math.floor(prob / prob_sum * 1000) / 1000
+            new_prob = math.floor(prob / prob_sum * 100) / 100
 
             if new_prob == 1.0:
-                new_prob = 0.999
+                new_prob = 0.99
 
             text += f"{word}:{new_prob} "
 
-        text += ":0.001"
+        text += ":0.01"
 
     final += text + "\n"
 
 with open(output_dir, 'w', encoding="UTF-8") as f:
     f.write(final)
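
The prediction loop above normalizes each candidate's probability, floors it to two decimal places, and reserves a catch-all `:0.01` entry for words the model did not propose. A small stand-alone sketch of that formatting step, with invented probabilities:

```python
import math

# Invented normalized probabilities for one gap, mimicking the loop above.
probs = [("the", 0.5), ("a", 0.3), ("an", 0.2)]
prob_sum = sum(prob for _, prob in probs)

text = ""
for word, prob in probs:
    new_prob = math.floor(prob / prob_sum * 100) / 100   # floor to two decimals
    if new_prob == 1.0:
        new_prob = 0.99                                   # never output full certainty
    text += f"{word}:{new_prob} "
text += ":0.01"                                           # leftover mass for unseen words

print(text)  # the:0.5 a:0.3 an:0.2 :0.01
```
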

src/model.py, 217 lines changed
@@ -1,108 +1,111 @@
 from collections import defaultdict, Counter
 from tqdm import tqdm
 import nltk
 import random
 import pickle
+from multiprocessing import Pool
 import math
+from bidict import bidict
 
 class Model():
 
     def __init__(self, UNK_token = '<UNK>', n = 3):
         self.n = n
         self.UNK_token = UNK_token
         self.ngrams = defaultdict(defaultdict(int).copy)
         self.contexts = defaultdict(int)
-        self.tokenizer = { UNK_token: 0 }
-        self.reverse_tokenizer = { 0: UNK_token }
+        self.tokenizer = bidict({ UNK_token: 0 })
         self._tokenizer_index = 1
         self.vocab = set()
 
         self.n_split = self.n // 2
 
     def train_tokenizer(self, corpus: list) -> list[int]:
         for word in tqdm(corpus):
             if word not in self.vocab:
                 self.vocab.add(word)
                 self.tokenizer[word] = self._tokenizer_index
-                self.reverse_tokenizer[self._tokenizer_index] = word
 
                 self._tokenizer_index += 1
 
     def tokenize(self, corpus: list, verbose = False) -> list[int]:
         result = []
 
         for word in tqdm(corpus) if verbose else corpus:
             if word not in self.vocab:
                 result.append(self.tokenizer[self.UNK_token])
             else:
                 result.append(self.tokenizer[word])
 
         return result
 
+    def process_gram(self, gram: tuple) -> tuple:
+        left_context = gram[:self.n_split]
+        right_context = gram[self.n_split + 1:]
+        word = gram[self.n_split]
+
+        if word == self.UNK_token:
+            return
+
+        self.ngrams[(left_context, right_context)][word] += 1
+        self.contexts[(left_context, right_context)] += 1
+
     def train(self, corpus: list) -> None:
 
         print("Training tokenizer")
         self.train_tokenizer(corpus)
 
         print("Tokenizing corpus")
         corpus = self.tokenize(corpus, verbose = True)
 
         print("Saving n-grams")
         n_grams = list(nltk.ngrams(corpus, self.n))
         for gram in tqdm(n_grams):
-            left_context = gram[:self.n_split]
-            right_context = gram[self.n_split + 1:]
-            word = gram[self.n_split]
-
-            if word == self.UNK_token:
-                continue
-
-            self.ngrams[(left_context, right_context)][word] += 1
-            self.contexts[(left_context, right_context)] += 1
+            self.process_gram(gram)
 
     def get_conditional_probability_for_word(self, left_context: list, right_context: list, word: str) -> float:
         left_context = tuple(left_context[-self.n_split:])
         right_context = tuple(right_context[:self.n_split])
 
         total_count = self.contexts[(left_context, right_context)]
 
         if total_count == 0:
             return 0.0
         else:
             word_count = self.ngrams[(left_context, right_context)][word]
 
             return word_count / total_count
 
     def get_probabilities(self, left_context: list, right_context: list) -> float:
         left_context = tuple(left_context[-self.n_split:])
         right_context = tuple(right_context[:self.n_split])
 
         words = list(self.ngrams[(left_context, right_context)].keys())
         probs = []
 
         for word in words:
             prob = self.get_conditional_probability_for_word(left_context, right_context, word)
             probs.append((word, prob))
 
-        return sorted(probs, reverse = True, key = lambda x: x[0])[:10]
+        return sorted(probs, reverse = True, key = lambda x: x[1])[:10]
 
     def fill_gap(self, left_context: list, right_context: list) -> list:
         left_context = self.tokenize(left_context)
         right_context = self.tokenize(right_context)
 
         result = []
         probabilities = self.get_probabilities(left_context, right_context)
-        for probability in probabilities:
-            word = self.reverse_tokenizer[probability[0]]
-            result.append((word, probability[1]))
+        for token, probability in probabilities:
+            word = self.tokenizer.inverse[token]
+            result.append((word, probability))
 
         return result
 
     def save(self, output_dir: str) -> None:
         with open(output_dir, 'wb') as f:
             pickle.dump(self, f)
 
     @staticmethod
     def load(model_path: str) -> 'Model':
         with open(model_path, 'rb') as f:
             return pickle.load(f)
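
A minimal usage sketch of the class above on an invented toy corpus; in the repository the class is driven by src/train.py and the prediction script rather than called like this:

```python
from model import Model

# Toy corpus: in the real pipeline this comes from train/in.tsv.xz + expected.tsv.
corpus = "the cat sat on the mat the cat sat on the sofa".split()

model = Model(n = 3)          # trigrams: one word of context on each side of the gap
model.train(corpus)

# Predict the gap in "the ___ sat": returns up to 10 (word, probability) pairs.
print(model.fill_gap(["the"], ["sat"]))   # e.g. [('cat', 1.0)]

model.save("model.pkl")
reloaded = Model.load("model.pkl")
```
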

src/train.py, 71 lines changed
@@ -1,41 +1,32 @@
 from collections import Counter, defaultdict
 from tqdm import tqdm
 import re
 import nltk
 import random
 import os
 import sys
 import pickle
 import csv
 import pandas as pd
 from model import Model
 
 dataset_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'in.tsv.xz'))
 expected_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'train', 'expected.tsv'))
 
 model = Model(n = 3)
 
-df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
-expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10 ** 2)
+df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
+expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
 
 print('Loading training corpus...')
 corpus = []
-for j, chunk in tqdm(enumerate(zip(df, expected_df)), total=4321):
-    df, expected_df = chunk
-
-    df = df.replace(r'\\r+|\\n+|\\t+','', regex=True)
-
-    for (_, row1), (_, row2) in zip(df.iterrows(), expected_df.iterrows()):
-        word = row2['Word']
-        left_context = row1['LeftContext']
-        right_context = row1['RightContext']
-
-        corpus.extend(left_context.split() + [word] + right_context.split())
-
-    # if j > 50:
-    #     break
+for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=432):
+    df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
+
+    for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):
+        corpus.extend(re.split(r"\s+", left_context.strip()) + [str(word).strip()] + re.split(r"\s+", right_context.strip()))
 
 print('Training model...')
 model.train(corpus)
 print('Saving model...')
 model.save(os.path.abspath(os.path.join(os.path.dirname(dataset_dir), 'model.pkl')))
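
For each training row, the left context, the expected gap word, and the right context are flattened into one token stream that feeds Model.train. A toy illustration of how a single (invented) row is folded into the corpus:

```python
import re

# One invented training example: LeftContext/RightContext from in.tsv.xz and
# the expected gap word from expected.tsv.
left_context = "came from the last place to this"
right_context = "called the riverside hotel"
word = "place"

corpus = []
# Mirrors the corpus.extend(...) call in train.py: whitespace-split left context,
# then the gap word, then the whitespace-split right context, as one flat list.
corpus.extend(
    re.split(r"\s+", left_context.strip())
    + [str(word).strip()]
    + re.split(r"\s+", right_context.strip())
)
print(corpus)  # ['came', 'from', ..., 'this', 'place', 'called', ..., 'hotel']
```
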

File diff suppressed because it is too large.

test-A/in.tsv, 14828 lines changed (file diff suppressed because it is too large)

test-A/out.tsv, 14828 lines changed (file diff suppressed because it is too large)

train/expected.tsv, 864044 lines changed (file diff suppressed because it is too large)

train/hate-speech-info.tsv, 864044 lines changed (file diff suppressed because it is too large)