#!/usr/bin/env python
# coding: utf-8
"""Word-gap prediction with a 4-gram KenLM language model.

The script builds a plain-text corpus from the training split, trains an ARPA
model with KenLM's ``lmplz`` binary, and then writes one prediction line per
row for the ``dev-0`` and ``test-A`` splits.
"""

import csv
import heapq
import os
import subprocess
from collections import Counter, defaultdict
from math import log10
from operator import itemgetter

import kenlm
import pandas as pd
import regex as re
from english_words import english_words_alpha_set
from nltk import trigrams, word_tokenize

# --- Training data ----------------------------------------------------------

train_set = pd.read_csv(
    'train/in.tsv.xz',
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=35000)

train_labels = pd.read_csv(
    'train/expected.tsv',
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=35000)

# NOTE(review): the original built the corpus from train_set[0] (a column of
# in.tsv) and never used the labels it had just loaded; the gap word from
# expected.tsv is presumably what belongs between the two contexts -- confirm
# against the data. Spaces are inserted so neighbouring words cannot fuse, and
# astype(str) mirrors the str() conversion done in make_prediction.
data = (
    train_set[6].astype(str)
    + ' ' + train_labels[0].astype(str)
    + ' ' + train_set[7].astype(str)
)


def data_preprocessing(text):
    """Normalise raw text before it is fed to the language model.

    The text is lower-cased, hyphenated line breaks (a hyphen followed by a
    literal backslash-n sequence) are joined, remaining literal backslash-n
    sequences become spaces, a few English contractions are expanded, hyphens
    are dropped and finally all Unicode punctuation is removed.

    Args:
        text: Raw text string.

    Returns:
        The cleaned string.
    """
    cleaned = (
        text.lower()
        .replace('-\\n', '')
        .replace('\\n', ' ')
        .replace("'ll", " will")
        .replace("-", "")
        .replace("'ve", " have")
        .replace("'s", " is")
    )
    # \p{P} (any Unicode punctuation) needs the third-party `regex` module,
    # which this file imports under the name `re`.
    return re.sub(r'\p{P}', '', cleaned)


data = data.apply(data_preprocessing)

# Fallback line written when a row has too little context to query the model.
prediction = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'

# --- Language model training -------------------------------------------------

with open("train_file.txt", "w", encoding="utf-8") as f:
    f.writelines(text + "\n" for text in data)

KENLM_BUILD_PATH = '../kenlm/build/bin/lmplz'

# Equivalent of the notebook shell escape
#   $KENLM_BUILD_PATH -o 4 < train_file.txt > kenlm_model.arpa
# but runnable as a plain Python script (no get_ipython) and without a shell.
with open("train_file.txt", "rb") as corpus, \
        open("kenlm_model.arpa", "wb") as arpa:
    subprocess.run(
        [KENLM_BUILD_PATH, '-o', '4'],
        stdin=corpus,
        stdout=arpa,
        check=True,  # fail here rather than later on an incomplete ARPA file
    )

print(os.getcwd())
model = kenlm.Model('kenlm_model.arpa')

# --- Prediction ---------------------------------------------------------------


def predict(before, after, top_k=12):
    """Return the best gap candidates between two context words.

    Every word in ``english_words_alpha_set`` is placed between ``before`` and
    ``after`` and the resulting three-word string is scored with the KenLM
    model (no sentence-boundary markers).

    Args:
        before: Word immediately preceding the gap.
        after: Word immediately following the gap.
        top_k: Number of best-scoring candidates to keep (default 12).

    Returns:
        A string of space-separated ``word:score`` pairs, best first, followed
        by a final ``:<log10(0.99)>`` entry with an empty word.
    """
    # NOTE(review): the scores are written exactly as KenLM returns them
    # (log10 probabilities per the KenLM Python API), while the fallback
    # `prediction` line uses plain probabilities -- confirm that the consumer
    # of the output accepts both forms.
    scored = (
        (word, model.score(' '.join([before, word, after]), bos=False, eos=False))
        for word in english_words_alpha_set
    )
    top = heapq.nlargest(top_k, scored, key=itemgetter(1))
    entries = [f'{word}:{score}' for word, score in top]
    entries.append(f':{log10(0.99)}')
    return ' '.join(entries)


def make_prediction(path, result_path):
    """Write one prediction line per row of the TSV file at ``path``.

    Columns 6 and 7 of each row are read as the left and right context. Both
    are preprocessed and tokenised; if either side has fewer than two tokens
    the fixed fallback ``prediction`` line is written, otherwise the model is
    queried with the last left token and the first right token.

    Args:
        path: Input TSV file (read with pandas, so ``.xz`` is accepted).
        result_path: Output file, written as UTF-8 with one line per row.
    """
    frame = pd.read_csv(path, sep='\t', header=None, quoting=csv.QUOTE_NONE)
    with open(result_path, 'w', encoding='utf-8') as file_out:
        for _, row in frame.iterrows():
            before = word_tokenize(data_preprocessing(str(row[6])))
            after = word_tokenize(data_preprocessing(str(row[7])))
            if len(before) < 2 or len(after) < 2:
                pred = prediction
            else:
                pred = predict(before[-1], after[0])
            file_out.write(pred + '\n')


make_prediction("dev-0/in.tsv.xz", "dev-0/out.tsv")

make_prediction("test-A/in.tsv.xz", "test-A/out.tsv")