In [28]:
import pandas as pd
import numpy as np
import csv
import os
import re
import random
from collections import Counter, defaultdict
import nltk
import math
from tqdm import tqdm

In [21]:
directory = "train/in.tsv.xz"
directory_dev_0 = "dev-0/in.tsv.xz"
directory_test_A = "test-A/in.tsv.xz"

### MODEL N-GRAM

In [45]:
class Model():
 
 def __init__(self, vocab_size=30_000, UNK_token= '', n=3):
 if (n <= 1 or n % 2 == 0):
 raise "change N value !!!"
 self.n = n
 self.vocab_size = vocab_size
 self.UNK_token = UNK_token
 
 def train(self, corpus:list) -> None:
 if(self.n > 1):
 self.n_grams = list(nltk.ngrams(corpus, n=self.n))
 else:
 self.n_grams = corpus
 self.counter = Counter(self.n_grams)
 self.words_counter = Counter(corpus)
 self.all_quantities = Counter([gram[:math.floor(self.n/2)]+gram[math.ceil(self.n/2):] for gram in self.n_grams])

 self.all_grams = defaultdict(set)

 for gram in tqdm(self.n_grams):
 previous_words = tuple(gram[:math.floor(self.n/2)])
 next_words = tuple(gram[math.ceil(self.n/2):])
 word = gram[math.floor(self.n/2)]
 self.all_grams[(previous_words, next_words)].add(word)

 def get_conditional_prob_for_word(self, left_text: list, right_text: list, word: str) -> float:
 previous_words = tuple(left_text[-math.floor(self.n/2):])
 next_words = tuple(right_text[:math.floor(self.n/2)])
 quantity = self.counter[previous_words + tuple([word]) + next_words]
 all_quantity = self.all_quantities[previous_words + next_words]
 if (all_quantity <= 0):
 return 0
 return quantity/all_quantity
 
 def get_prob_for_text(self, text: list) -> float:
 prob = 1
 for gram in list(nltk.ngrams(text, self.n)):
 prob *= self.get_conditional_prob_for_word(gram[:math.floor(self.n/2)], gram[math.ceil(self.n/2):], gram[math.floor(self.n/2)])
 return prob
 
 def most_probable_words(self, left_text: list, right_text: list) -> str:
 previous_words = tuple(left_text[-math.floor(self.n/2):])
 next_words = tuple(right_text[:math.floor(self.n/2)])
 all_words = self.all_grams[(previous_words, next_words)]
 best_words = []
 for word in all_words:
 probability = self.get_conditional_prob_for_word(list(previous_words), list(next_words), word)
 best_words.append((word, probability))
 return sorted(best_words, key=(lambda l: l[1]), reverse=True)[:20]
 
 def generate_text(self, text_beggining:list, text_ending:list, greedy: bool) -> list:
 words = self.most_probable_words(text_beggining, text_ending)
 return words


### DATASET

In [46]:
dataframeList = pd.read_csv(directory, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], escapechar='\\', quoting=csv.QUOTE_NONE, chunksize=10000)

expectedList = pd.read_csv(directory, sep='\t', header=None, names=['Word'], escapechar='\\', quoting=csv.QUOTE_NONE, chunksize=10000)

DATASET = ""

for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList)):
 left_text = dataframe['LeftContext'].to_list()
 right_text = dataframe['RightContext'].to_list()
 word = expected['Word'].to_list()

 lines = zip(left_text, word, right_text)
 lines = list(map(lambda l: " ".join(l), lines))
 DATASET = DATASET + " ".join(lines)

FINAL_DATASET = re.split(r"\s+", DATASET)
print(FINAL_DATASET[:100])

['came', 'fiom', 'the', 'last', 'place', 'to', 'thisnplace,', 'and', 'this', 'place', 'is', 'Where', 'WenWere,', 'this', 'is', 'the', 'first', 'road', 'I', 'evernwas', 'on', 'where', 'you', 'can', 'ride', 'elsewherenfrom', 'anywhere', 'and', 'be', 'nowhere.nHe', 'says,', 'while', 'this', 'train', 'stops', 'every-nwhere,', 'it', 'never', 'stops', 'anywhere', 'un-nless', 'its', 'somewhere.', 'Well,', 'I', "says,nI'm", 'glad', 'to', 'hear', 'that,', 'but,', 'accord-ning', 'to', 'your', 'figures,', 'I', 'left', 'myselfnwhere', '1', 'was,', 'which', 'is', 'five', 'miles', 'near-ner', 'to', 'myself', 'than', 'I', 'was', 'when', 'wenwere', 'where', 'we', 'are', 'now.nWe', 'have', 'now', 'reached', "Slidell.nThat's", 'a', 'fine', 'place.', 'The', 'peoplendown', 'there', 'remind', 'me', 'of', 'bananas-nthey', 'come', 'and', 'go', 'in', 'bunches.', '811-ndell', 'used', 'to', 'be', 'noted']


### TRAIN

In [47]:
model_3gram = Model(n = 3)
model_3gram.train(FINAL_DATASET)

100%|██████████| 180304236/180304236 [13:57<00:00, 215160.70it/s] 


In [48]:
model = model_3gram

### PREDICTION

In [62]:
def convert_predictions(line):
 sum_predictions = np.sum([pred[1] for pred in line])
 result = ""
 all_pred = 0
 for word, pred in line:
 new_pred = math.floor(pred / sum_predictions * 100) / 100
 if(new_pred == 1.0):
 new_pred = 0.99
 all_pred = all_pred + new_pred
 result = result + word + ":" + str(new_pred) + " "
 if(round(all_pred, 2) < 1):
 result = result + ":" + str(round(1 - all_pred, 2))
 else:
 result = result + ":" + str(0.01)
 return result

In [58]:
# PREDICTION FOR DEV-0

dataframe = pd.read_csv(directory_dev_0, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], escapechar='\\', quoting=csv.QUOTE_NONE)

left_text = dataframe['LeftContext'].apply(lambda l: re.split(r"\s+", l)).to_list()
right_text = dataframe['RightContext'].apply(lambda l: re.split(r"\s+", l)).to_list()

lines = zip(left_text, right_text)
lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))
print(lines[:100])

10519it [00:31, 330.85it/s]

[[], [('passage', 0.005712530712530713), ('growth', 0.0049754299754299755), ('use,', 0.004545454545454545), ('functions,', 0.003931203931203931), ('successors', 0.0036855036855036856), ('place,', 0.0035626535626535625), ('own,', 0.0031941031941031942), ('own', 0.0031941031941031942), ('head', 0.00300982800982801), ('power', 0.0029484029484029483), ('action,', 0.002764127764127764), ('work', 0.0025798525798525797), ('members', 0.0025184275184275185), ('value,', 0.0025184275184275185), ('value', 0.002334152334152334), ('vicinity,', 0.002334152334152334), ('name', 0.002334152334152334), ('place', 0.0022727272727272726), ('beauty', 0.0022113022113022115), ('strength', 0.0022113022113022115)], [], [], [('undertook', 1.0)], [('a', 0.2926829268292683), ('two', 0.08536585365853659), ('goodnand', 0.07317073170731707), ('him', 0.054878048780487805), ('means', 0.036585365853658534), ('money', 0.03048780487804878), ('all', 0.024390243902439025), ('force', 0.024390243902439025), ('just', 0.01829268




In [63]:
with open("dev-0/out.tsv", "w", encoding="UTF-8") as file:
 result = "\n".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))
 file.write(result)
 file.close()

100%|██████████| 10519/10519 [00:00<00:00, 111905.55it/s]


In [64]:
# PREDICTION FOR TEST-A

dataframe = pd.read_csv(directory_test_A, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], escapechar='\\', quoting=csv.QUOTE_NONE)

left_text = dataframe['LeftContext'].apply(lambda l: re.split(r"\s+", l)).to_list()
right_text = dataframe['RightContext'].apply(lambda l: re.split(r"\s+", l)).to_list()

lines = zip(left_text, right_text)
lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))
print(lines[:100])

7414it [00:17, 422.07it/s]

[[], [('home', 0.08333333333333333), ('decline', 0.08333333333333333), ('or', 0.08333333333333333), ('spread', 0.08333333333333333), ('is', 0.08333333333333333), ('numerous', 0.08333333333333333), ('road', 0.08333333333333333), ('owned', 0.08333333333333333), ('resides', 0.08333333333333333), ('taxes', 0.08333333333333333), ('whitely', 0.08333333333333333), ('water', 0.08333333333333333)], [], [('man', 0.01770717393503997), ('plan', 0.009106546595163412), ('trial', 0.006779318020843873), ('living', 0.00647576646767176), ('statement', 0.005868663361327532), ('law', 0.005868663361327532), ('vote', 0.005261560254983305), ('class', 0.005059192552868562), ('year', 0.00485682485075382), ('sensation', 0.0043509055954669635), ('question', 0.004148537893352221), ('single', 0.004148537893352221), ('bill', 0.0040473540422948494), ('day', 0.0040473540422948494), ('government', 0.0034402509359506223), ('time', 0.0034402509359506223), ('paper', 0.003339067084893251), ('means', 0.002934331680663766),




In [65]:
with open("test-A/out.tsv", "w", encoding="UTF-8") as file:
 result = "\n".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))
 file.write(result)
 file.close()

100%|██████████| 7414/7414 [00:00<00:00, 128642.81it/s]
