import string
import csv
from collections import defaultdict, Counter

import pandas as pd
import regex as re
from nltk import trigrams
from nltk.tokenize import word_tokenize

# Fallback distribution used when the model has nothing for a context.
DEFAULT_PREDICTION = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'

# Trigram counts: model[(context_word, context_word)][predicted_word] -> count,
# normalized into probabilities at the end of training.
model = defaultdict(lambda: defaultdict(lambda: 0))


def preprocess_text(text):
    # delete punctuation characters
    text = text.translate(str.maketrans('', '', string.punctuation))
    # replace any remaining non-letter characters (digits, newlines, etc.) with spaces
    text = re.sub('[^a-zA-Z]', ' ', text)
    # lower-case and collapse runs of whitespace into single spaces
    text = text.lower()
    text = ' '.join(text.split())
    return text


def predict_probs(word1, word2):
    # Take the six most frequent words seen with the (word1, word2) context.
    raw_prediction = dict(model[word1, word2])
    prediction = dict(Counter(raw_prediction).most_common(6))

    total_prob = 0.0
    str_prediction = ''
    for word, prob in prediction.items():
        total_prob += prob
        str_prediction += f'{word}:{prob} '

    # Unseen context: fall back to the fixed default distribution.
    if total_prob == 0.0:
        return DEFAULT_PREDICTION

    # Reserve the leftover probability mass (at least 0.01) for unknown words.
    remaining_prob = max(1 - total_prob, 0.01)
    str_prediction += f':{remaining_prob}'
    return str_prediction


def train_model(training_data):
    for _, row in training_data.iterrows():
        text = preprocess_text(str(row['final']))
        words = word_tokenize(text)
        # Count each word both as the continuation of the bigram before it
        # and as the word preceding the bigram after it.
        for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
            if w1 and w2 and w3:
                model[(w2, w3)][w1] += 1
                model[(w1, w2)][w3] += 1

    # Normalize the counts for each context into probabilities.
    for word_pair in model:
        num_n_grams = float(sum(model[word_pair].values()))
        for word in model[word_pair]:
            model[word_pair][word] /= num_n_grams


def write_predictions(input_data, output_path):
    with open(output_path, 'w') as file:
        for _, row in input_data.iterrows():
            # Column 7 holds the right context; predict from its first two words.
            text = preprocess_text(str(row[7]))
            words = word_tokenize(text)
            if len(words) < 3:
                prediction = DEFAULT_PREDICTION
            else:
                prediction = predict_probs(words[0], words[1])
            file.write(prediction + '\n')


# on_bad_lines='skip' replaces the deprecated error_bad_lines/warn_bad_lines flags.
data = pd.read_csv(
    'train/in.tsv.xz',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=100000,
)
train_labels = pd.read_csv(
    'train/expected.tsv',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=100000,
)

# Columns 6 and 7 hold the left and right contexts; column 0 of the labels
# holds the gap word. Join with spaces so the gap word does not fuse with its context.
train_data = data[[6, 7]].copy()
train_data = pd.concat([train_data, train_labels], axis=1)
train_data['final'] = train_data[6] + ' ' + train_data[0] + ' ' + train_data[7]

dev_data = pd.read_csv(
    'dev-0/in.tsv.xz',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)
test_data = pd.read_csv(
    'test-A/in.tsv.xz',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)

train_model(train_data)

write_predictions(dev_data, 'dev-0/out.tsv')
write_predictions(test_data, 'test-A/out.tsv')