#%%
import pandas as pd
from collections import defaultdict, Counter
from sqlalchemy import true  # NOTE(review): not referenced anywhere in this file
from nltk import trigrams, word_tokenize, bigrams
import csv

# Fallback output used when the model has nothing to say about a context.
# NOTE(review): "of" appears twice in this distribution; the text is kept
# byte-for-byte because the scorer's handling of duplicates is not visible
# here -- confirm before changing it.
DEFAULT_PREDICTION = "a:0.2 the:0.2 to:0.2 of:0.1 and:0.1 of:0.1 :0.1"

# Ordered (old, new) substitutions applied by Model.clean; order matters
# because later patterns operate on the output of earlier ones.
_CLEAN_REPLACEMENTS = (
    ("’", "'"),
    ('\\n', " "),
    ("'t", " not"),
    ("'s", " is"),
    ("'ll", " will"),
    ("'m", " am"),
    ("'ve", " have"),
    (",", ""),
    ("-", ""),
)


#%%
class Model:
    """Interpolated trigram/bigram next-word model built from TSV training data."""

    def __init__(self, in_path="train/in.tsv.xz",
                 expected_path="train/expected.tsv", nrows=300000):
        """Load and clean the training text.

        Args:
            in_path: TSV file whose columns 6 and 7 are read (presumably the
                text before and after the gap -- verify against the dataset).
            expected_path: TSV file whose first column is read (presumably
                the word that fills the gap).
            nrows: Maximum number of rows read from each file.
        """
        # trigram table: (w1, w2) -> {w3: count, later probability}
        self.model = defaultdict(lambda: defaultdict(int))
        # bigram table: w1 -> {w2: count, later probability}
        self.model_bi = defaultdict(lambda: defaultdict(int))

        read_options = dict(sep='\t', header=None, encoding="UTF-8",
                            on_bad_lines="skip", quoting=csv.QUOTE_NONE,
                            nrows=nrows)
        train_in = pd.read_csv(in_path, **read_options)[[6, 7]]
        train_expected = pd.read_csv(expected_path, **read_options)
        data = pd.concat([train_in, train_expected], axis=1)

        # Join the three pieces with spaces so the words at the seams are not
        # fused into one token; surplus whitespace is harmless because the
        # text is stripped in clean() and split by the tokenizer.
        self.data = data[6] + " " + data[0] + " " + data[7]
        self.data = self.data.apply(self.clean)

    def clean(self, text):
        """Return *text* lower-cased, stripped and with simple normalisations.

        The value is converted with ``str()`` first, so non-string cells
        (e.g. NaN) are accepted. Contraction suffixes are expanded, a literal
        backslash-n sequence becomes a space, and commas and hyphens are
        removed.
        """
        text = str(text).lower().strip()
        for old, new in _CLEAN_REPLACEMENTS:
            text = text.replace(old, new)
        return text

    def train(self, alpha=0.6):
        """Count n-grams over ``self.data`` and convert counts to weights.

        Args:
            alpha: Interpolation weight. Trigram probabilities are scaled by
                ``alpha`` and bigram probabilities by ``1 - alpha``.
        """
        # Iterate this instance's data (not a module-level global).
        for text in self.data:
            words = word_tokenize(text)
            for w1, w2, w3 in trigrams(words):
                self.model[w1, w2][w3] += 1
            for w1, w2 in bigrams(words):
                self.model_bi[w1][w2] += 1

        # Normalise each context's counts to probabilities, pre-multiplied by
        # the interpolation weight so predict() can simply add the tables.
        for followers in self.model.values():
            total_count = float(sum(followers.values()))
            for w in followers:
                followers[w] = (followers[w] / total_count) * alpha
        for followers in self.model_bi.values():
            total_count = float(sum(followers.values()))
            for w in followers:
                followers[w] = (followers[w] / total_count) * (1 - alpha)

    def predict(self, words):
        """Return the top candidates following *words* as a prediction line.

        Args:
            words: Tuple of the two preceding tokens ``(w1, w2)``; the last
                element is also used as the bigram context.

        Returns:
            Space-separated ``word:prob`` entries for up to six candidates,
            followed by ``:p`` with an empty word holding the remaining mass
            (at least 0.01), or ``DEFAULT_PREDICTION`` when no candidate
            exists.
        """
        # .get() instead of indexing: indexing a defaultdict would insert an
        # empty entry for every unseen context and grow the model while
        # predicting.
        trigram_scores = Counter(self.model.get(words, {}))
        bigram_scores = Counter(self.model_bi.get(words[-1], {}))
        best = (trigram_scores + bigram_scores).most_common(6)
        if not best:
            return DEFAULT_PREDICTION

        total_prob = 0
        entries = []
        for word, prob in best:
            total_prob += prob
            entries.append(f"{word}:{prob}")
        return " ".join(entries) + " " + f":{max(1-total_prob, 0.01)}"


model = Model()

#%%
model.data
model.train()

#%%
def predict(model, path, result_path):
    """Write one prediction line per input row of *path* to *result_path*.

    Column 7 of the TSV at *path* is cleaned and tokenised; its last two
    tokens are the context passed to ``model.predict``. Rows with fewer than
    two tokens get ``DEFAULT_PREDICTION``.
    """
    data = pd.read_csv(path, sep='\t', header=None, encoding="UTF-8",
                       on_bad_lines="skip", quoting=csv.QUOTE_NONE)[7]
    with open(result_path, "w+", encoding="UTF-8") as f:
        for text in data:
            words = word_tokenize(model.clean(text))
            if len(words) < 2:
                prediction = DEFAULT_PREDICTION
            else:
                prediction = model.predict((words[-2], words[-1]))
            f.write(prediction + "\n")


predict(model, "dev-0/in.tsv.xz", "dev-0/out.tsv")
predict(model, "test-A/in.tsv.xz", "test-A/out.tsv")