import pandas as pd
import csv
from collections import Counter, defaultdict
from nltk.tokenize import RegexpTokenizer
from nltk import trigrams
import regex as re
import lzma
import kenlm
from math import log10
from english_words import english_words_set


class WordPred:
    """Word-gap prediction using a pre-trained KenLM language model.

    For each input row of an xz-compressed TSV, predicts the most likely
    word to fill a gap, scoring every candidate from ``english_words_set``
    with the KenLM model and emitting the top candidates in the
    'word:score ...' challenge output format.
    """

    def __init__(self):
        # Tokenizer that keeps only word characters (drops punctuation).
        self.tokenizer = RegexpTokenizer(r"\w+")
        # Pre-trained KenLM binary model; must exist in the working directory.
        self.model = kenlm.Model("model.binary")
        # Vocabulary collected by fill_words()/read_words().
        self.words = set()

    def read_file(self, file):
        """Yield normalized text built from TSV columns 6 and 7 of each line.

        Normalization: join the two context columns, unescape literal
        ``\\n`` sequences, strip real newlines, lowercase, collapse
        repeated spaces, and remove everything except word characters,
        digits, apostrophes and whitespace.
        """
        for line in file:
            text = line.split("\t")
            joined = ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower()
            yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', joined))

    def read_file_7(self, file):
        """Yield normalized text from TSV column 7 only (same cleanup as read_file)."""
        for line in file:
            text = line.split("\t")
            cleaned = text[7].replace("\\n", " ").replace("\n", "").lower()
            yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', cleaned))

    def fill_words(self, file_path, output_file):
        """Collect the unique vocabulary of an xz-compressed TSV corpus.

        Writes each previously unseen word to *output_file* (one per line)
        and adds it to ``self.words``.
        """
        with open(output_file, 'w') as out:
            with lzma.open(file_path, mode='rt') as file:
                for text in self.read_file(file):
                    for mword in text.split(" "):
                        if mword not in self.words:
                            out.write(mword + "\n")
                            self.words.add(mword)

    def read_words(self, file_path):
        """Load a previously saved vocabulary file into ``self.words``."""
        with open(file_path, 'r') as fin:
            # Iterate the file object directly instead of readlines():
            # avoids materializing the whole vocabulary in memory at once.
            for word in fin:
                word = word.replace("\n", "")
                if word:
                    self.words.add(word)

    def create_train_file(self, file_path, output_path, rows=10000):
        """Extract up to *rows* normalized lines into a plain-text train file.

        The ``with`` block closes the output file; the original's extra
        explicit close() call after it was redundant and has been removed.
        """
        with open(output_path, 'w') as outputfile:
            with lzma.open(file_path, mode='rt') as file:
                for index, text in enumerate(self.read_file(file)):
                    outputfile.write(text)
                    if index == rows:
                        break

    def generate_outputs(self, input_file, output_file):
        """Write one prediction line per input row of an xz-compressed TSV.

        Rows whose context has fewer than 4 tokens get a fixed fallback
        distribution of the most common English words.
        """
        with open(output_file, 'w') as outputf:
            with lzma.open(input_file, mode='rt') as file:
                for index, text in enumerate(self.read_file_7(file)):
                    tokens = self.tokenizer.tokenize(text)
                    if len(tokens) < 4:
                        prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
                    else:
                        # BUG FIX: was wp.predict_probs(...), which reached out
                        # to the module-level global instance; use self so the
                        # method works on any WordPred instance.
                        prediction = self.predict_probs(tokens[0], tokens[1])
                    outputf.write(prediction + '\n')

    def predict_probs(self, word1, word2):
        """Score every candidate word placed between *word1* and *word2*.

        Returns the 12 best candidates (by KenLM log10 score, descending)
        formatted as 'word:score ... :<residual>' as required by the
        challenge output format. Scores are raw KenLM log10 values.
        """
        preds = []
        for word in english_words_set:
            sentence = word1 + ' ' + word + ' ' + word2
            words_score = self.model.score(sentence, bos=False, eos=False)
            if len(preds) < 12:
                preds.append((word, words_score))
            else:
                # Keep only the 12 best-scoring candidates seen so far.
                min_score = min(preds, key=lambda sc: sc[1])
                if min_score[1] < words_score:
                    preds.remove(min_score)
                    preds.append((word, words_score))
        probs = sorted(preds, key=lambda sc: sc[1], reverse=True)
        str_prediction = ''
        for word, prob in probs:
            str_prediction += f'{word}:{prob} '
        # Residual mass for "any other word", as log10 probability.
        str_prediction += f':{log10(0.99)}'
        return str_prediction


if __name__ == "__main__":
    wp = WordPred()
    # wp.create_train_file("train/in.tsv.xz", "train/in.txt")
    # wp.fill_words("train/in.tsv.xz", "words.txt")
    # wp.read_words("words.txt")
    wp.generate_outputs("dev-0/in.tsv.xz", "dev-0/out3.tsv")
    wp.generate_outputs("test-A/in.tsv.xz", "test-A/out3.tsv")