#!/usr/bin/env python
# coding: utf-8

# In[136]:

import csv

import nltk
import pandas as pd
from collections import defaultdict, Counter
from nltk import ngrams, trigrams
from nltk.tokenize import word_tokenize

# Uncomment on the first run if the NLTK data packages are missing:
# nltk.download('punkt')
# nltk.download('stopwords')

# model[(w2, w3)][w1] counts how often w1 appears directly before the pair (w2, w3)
model = defaultdict(lambda: defaultdict(int))
setOf = set()  # vocabulary, used for add-alpha smoothing
alpha = 0.02   # smoothing constant


# In[173]:

# error_bad_lines/warn_bad_lines were removed in pandas 2.0;
# on_bad_lines='skip' (pandas >= 1.3) is the current equivalent.
train_file_in = pd.read_csv("train/in.tsv.xz", sep="\t", on_bad_lines="skip",
                            header=None, quoting=csv.QUOTE_NONE, nrows=200000)
train_file_out = pd.read_csv("train/expected.tsv", sep="\t", on_bad_lines="skip",
                             header=None, quoting=csv.QUOTE_NONE, nrows=200000)
print("read train file")


# In[174]:

stop_words = nltk.corpus.stopwords.words('english')


# Unused exploratory helpers; despite their names they return all n-grams,
# not the 20 most common bigrams.
def get_20common_2grams(text, n):
    return list(ngrams(word_tokenize(text), n))


def get_20common_2grams_no_stop(text, n):
    tokenized_words = word_tokenize(text)
    tokenized_no_stop = [w for w in tokenized_words if w not in stop_words]
    return ngrams(tokenized_no_stop, n)


def predict(first_after, second_after):
    """Predict the gap word given the two tokens directly to its right."""
    predictions = []
    prob_sum = 0.0
    for word, prob in Counter(model[(first_after, second_after)]).most_common(6):
        prob_sum += prob
        predictions.append(f'{word}:{prob}')
    if prob_sum == 0.0:
        # unseen context: fall back to a fixed distribution of frequent words
        return 'the:0.2 be:0.2 to:0.2 of:0.15 and:0.15 :0.1'
    # always reserve the leftover probability mass (at least 0.01) for unseen words
    remaining_prob = max(1 - prob_sum, 0.01)
    predictions.append(f':{remaining_prob}')
    return ' '.join(predictions)


# In[175]:

# Columns 6 and 7 of in.tsv are the left and right contexts of the gap; the
# expected.tsv column (0) holds the gap word. Join them with spaces so tokens
# do not fuse at the seams.
train = train_file_in[[6, 7]]
train = pd.concat([train, train_file_out], axis=1)
train["result"] = train[6] + " " + train[0] + " " + train[7]


# In[ ]:

for index, row in train.iterrows():
    # the corpus encodes line breaks as the literal two characters "\n"
    text = str(row["result"]).lower().replace('-\\n', '').replace('\\n', ' ')
    words = word_tokenize(text)
    for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
        if w1 and w2 and w3:
            model[(w2, w3)][w1] += 1
            setOf.add(w1)
            setOf.add(w2)
            setOf.add(w3)

# add-alpha smoothing over the full vocabulary
for context in model:
    num_n_grams = float(sum(model[context].values()))
    for word in model[context]:
        model[context][word] = (model[context][word] + alpha) / (num_n_grams + alpha * len(setOf))


# In[ ]:

print("train model")
# renormalise each context so its observed continuations sum to 1
for context in model:
    total_count = float(sum(model[context].values()))
    for word in model[context]:
        model[context][word] /= total_count


# In[ ]:

dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\t', on_bad_lines='skip',
                       header=None, quoting=csv.QUOTE_NONE)
test_a_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', on_bad_lines='skip',
                          header=None, quoting=csv.QUOTE_NONE)


# In[ ]:

def write_predictions(data, out_path):
    with open(out_path, 'w', encoding="utf-8") as file:
        for index, row in data.iterrows():
            # column 7 holds the right context of the gap
            text = str(row[7]).lower().replace('-\\n', '').replace('\\n', ' ')
            words = word_tokenize(text)
            if len(words) < 4:
                print(words)
                prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
            else:
                prediction = predict(words[0], words[1])
            file.write(prediction + '\n')


print("dev_0")
write_predictions(dev_data, 'dev-0/out.tsv')

print("test_A")
write_predictions(test_a_data, 'test-A/out.tsv')


# In[ ]:

# test for alpha = 0.01: 918.59
# test for alpha = 0.001: 919.14
# test for alpha = 0.02: 917.99
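

# In[ ]:

# Quick sanity check (a minimal sketch, not part of the pipeline above): query
# the trained model directly for one right-hand context. The pair ('of', 'the')
# is an arbitrary example chosen for illustration; any two tokens that directly
# follow a gap would do. The result is one line in the challenge output format:
# up to six "word:probability" candidates plus ":rest" for the leftover mass.
print(predict('of', 'the'))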