In [1]:
import pandas as pd
import numpy as np
import csv
import re
from collections import Counter, defaultdict
import nltk
import math
from tqdm import tqdm

In [2]:
directory = "train/in.tsv.xz"
directory_expected = "train/expected.tsv"
directory_dev_0 = "dev-0/in.tsv.xz"
directory_test_A = "test-A/in.tsv.xz"

### MODEL N-GRAM

In [3]:
class Model():
    
    def __init__(self, vocab_size=30_000, UNK_token= '<UNK>', n=3):
        if (n <= 1 or n % 2 == 0):
            raise "change N value !!!"
        self.n = n
        self.vocab_size = vocab_size
        self.UNK_token = UNK_token
    
    def train(self, corpus:list) -> None:
        if(self.n > 1):
            self.n_grams = list(nltk.ngrams(corpus, n=self.n))
        else:
            self.n_grams = corpus
        self.counter = Counter(self.n_grams)
        self.words_counter = Counter(corpus)
        self.all_quantities = Counter([gram[:math.floor(self.n/2)]+gram[math.ceil(self.n/2):] for gram in self.n_grams])

        self.all_grams = defaultdict(set)

        for gram in tqdm(self.n_grams):
            previous_words = tuple(gram[:math.floor(self.n/2)])
            next_words = tuple(gram[math.ceil(self.n/2):])
            word = gram[math.floor(self.n/2)]
            self.all_grams[(previous_words, next_words)].add(word)

    def get_conditional_prob_for_word(self, left_text: list, right_text: list, word: str) -> float:
        previous_words = tuple(left_text[-math.floor(self.n/2):])
        next_words = tuple(right_text[:math.floor(self.n/2)])
        quantity = self.counter[previous_words + tuple([word]) + next_words]
        all_quantity = self.all_quantities[previous_words + next_words]
        if (all_quantity <= 0):
            return 0
        return quantity/all_quantity
    
    def get_prob_for_text(self, text: list) -> float:
        prob = 1
        for gram in list(nltk.ngrams(text, self.n)):
            prob *= self.get_conditional_prob_for_word(gram[:math.floor(self.n/2)], gram[math.ceil(self.n/2):], gram[math.floor(self.n/2)])
        return prob
    
    def most_probable_words(self, left_text: list, right_text: list) -> str:
        previous_words = tuple(left_text[-math.floor(self.n/2):])
        next_words = tuple(right_text[:math.floor(self.n/2)])
        all_words = self.all_grams[(previous_words, next_words)]
        best_words = []
        for word in all_words:
            probability = self.get_conditional_prob_for_word(list(previous_words), list(next_words), word)
            best_words.append((word, probability))
        return sorted(best_words, key=(lambda l: l[1]), reverse=True)[:20]
    
    def generate_text(self, text_beggining:list, text_ending:list, greedy: bool) -> list:
        words = self.most_probable_words(text_beggining, text_ending)
        return words


### DATASET

In [4]:
dataframeList = pd.read_csv(directory, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10000)

expectedList = pd.read_csv(directory_expected, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10000)

DATASET = ""

for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList)):
    dataframe = dataframe.reset_index()
    dataframe = dataframe.replace(r'\\r|\\n|\n|\\t', ' ', regex=True)

    expected['Word'] = expected['Word'].apply(lambda x: str(x).strip())
    word = expected['Word']

    left_text = dataframe['LeftContext'].to_list()
    right_text = dataframe['RightContext'].to_list()
    word = expected['Word'].to_list()

    lines = zip(left_text, word, right_text)
    lines = list(map(lambda l: " ".join(l), lines))
    DATASET = DATASET + " ".join(lines)

FINAL_DATASET = re.split(r"\s+", DATASET)
print(FINAL_DATASET[:100])

['came', 'fiom', 'the', 'last', 'place', 'to', 'this', 'place,', 'and', 'this', 'place', 'is', 'Where', 'We', 'Were,', 'this', 'is', 'the', 'first', 'road', 'I', 'ever', 'was', 'on', 'where', 'you', 'can', 'ride', 'elsewhere', 'from', 'anywhere', 'and', 'be', 'nowhere.', 'He', 'says,', 'while', 'this', 'train', 'stops', 'every-', 'where,', 'it', 'never', 'stops', 'anywhere', 'un-', 'less', 'its', 'somewhere.', 'Well,', 'I', 'says,', "I'm", 'glad', 'to', 'hear', 'that,', 'but,', 'accord-', 'ing', 'to', 'your', 'figures,', 'I', 'left', 'myself', 'where', '1', 'was,', 'which', 'is', 'five', 'miles', 'near-', 'er', 'to', 'myself', 'than', 'I', 'was', 'when', 'we', 'were', 'where', 'we', 'are', 'now.', 'We', 'have', 'now', 'reached', 'Slidell.', "That's", 'a', 'fine', 'place.', 'The', 'people', 'down']


### TRAIN

In [5]:
model_3gram = Model(n = 3)
model_3gram.train(FINAL_DATASET)

100%|██████████| 139475976/139475976 [04:39<00:00, 498903.89it/s]


In [6]:
model = model_3gram

### PREDICTION

In [7]:
def convert_predictions(line):
    sum_predictions = np.sum([pred[1] for pred in line])
    result = ""
    all_pred = 0
    for word, pred in line:
        new_pred = math.floor(pred / sum_predictions * 100) / 100
        if(new_pred == 1.0):
            new_pred = 0.99
        all_pred = all_pred + new_pred
        result = result + word + ":" + str(new_pred) + " "
    if(round(all_pred, 2) < 1):
        result = result + ":" + str(round(1 - all_pred, 2))
    else:
        result = result + ":" + str(0.01)
    return result

In [8]:
# PREDICTION FOR DEV-0

dataframe = pd.read_csv(directory_dev_0, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
dataframe = dataframe.replace(r'\\r|\\n|\n|\\t', ' ', regex=True)

left_text = dataframe['LeftContext'].apply(lambda l: re.split(r"\s+", l)).to_list()
right_text = dataframe['RightContext'].apply(lambda l: re.split(r"\s+", l)).to_list()

lines = zip(left_text, right_text)
lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))
print(lines[:100])

10519it [00:27, 385.35it/s]

[[], [('successors', 0.006017228274638966), ('passage', 0.005193818089688371), ('place,', 0.005067139599695972), ('growth', 0.004813782619711173), ('use,', 0.004117050924752977), ('head', 0.003737015454775779), ('functions,', 0.0034836584747909806), ('power', 0.0034836584747909806), ('place', 0.003356979984798581), ('own,', 0.0032936407398023817), ('own', 0.0032936407398023817), ('members', 0.0032936407398023817), ('work', 0.003230301494806182), ('principles', 0.0031669622498099823), ('strength', 0.003040283759817583), ('value', 0.003040283759817583), ('beauty', 0.0026602482898403852), ('business', 0.0025969090448441853), ('size', 0.0025969090448441853), ('history', 0.0025969090448441853)], [('a', 0.5), ('lha', 0.25), ('the', 0.25)], [], [], [('a', 0.32934131736526945), ('him', 0.0718562874251497), ('two', 0.0718562874251497), ('only', 0.029940119760479042), ('just', 0.029940119760479042), ('means', 0.023952095808383235), ('money', 0.017964071856287425), ('force', 0.017964071856287425)




In [9]:
with open("dev-0/out.tsv", "w", encoding="UTF-8") as file:
    result = "\n".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))
    file.write(result)
    file.close()

100%|██████████| 10519/10519 [00:00<00:00, 106254.34it/s]


In [10]:
# PREDICTION FOR TEST-A

dataframe = pd.read_csv(directory_test_A, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
dataframe = dataframe.replace(r'\\r|\\n|\n|\\t', ' ', regex=True)

left_text = dataframe['LeftContext'].apply(lambda l: re.split(r"\s+", l)).to_list()
right_text = dataframe['RightContext'].apply(lambda l: re.split(r"\s+", l)).to_list()

lines = zip(left_text, right_text)
lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))
print(lines[:100])

7414it [00:16, 457.43it/s]

[[], [], [('the', 0.9), ('tho', 0.1)], [('man', 0.02725228204788993), ('plan', 0.012567799973541474), ('trial', 0.010715703135335361), ('living', 0.009921947347532743), ('statement', 0.009525069453631433), ('law', 0.008334435771927504), ('class', 0.008202143140627068), ('time', 0.007937557878026195), ('government', 0.005953168408519645), ('bill', 0.0054239978833179), ('year', 0.0054239978833179), ('question', 0.005291705252017462), ('sensation', 0.005291705252017462), ('day', 0.005159412620717026), ('corporation,', 0.005159412620717026), ('little', 0.0050271199894165895), ('vote', 0.004894827358116153), ('single', 0.004762534726815717), ('means', 0.00423336420161397), ('speech', 0.004101071570313534)], [], [('to', 0.16666666666666666), ('here', 0.16666666666666666), ('youngsters,', 0.08333333333333333), ('vines', 0.08333333333333333), ('material', 0.08333333333333333), ('plaster,', 0.08333333333333333), ('fabrics', 0.08333333333333333), ('mist', 0.08333333333333333), ('arms,', 0.083333




In [11]:
with open("test-A/out.tsv", "w", encoding="UTF-8") as file:
    result = "\n".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))
    file.write(result)
    file.close()

100%|██████████| 7414/7414 [00:00<00:00, 112933.06it/s]
