From ca0a2feda4d68967c7c6144e30fcc77a519894b3 Mon Sep 17 00:00:00 2001 From: Adrian Charkiewicz Date: Wed, 26 Apr 2023 18:03:21 +0200 Subject: [PATCH] added trigram --- dev-0/lm0.py | 37 ------------------ trigram.py | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 37 deletions(-) delete mode 100644 dev-0/lm0.py create mode 100644 trigram.py diff --git a/dev-0/lm0.py b/dev-0/lm0.py deleted file mode 100644 index dd41624..0000000 --- a/dev-0/lm0.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch -from transformers import AutoTokenizer, AutoModelForMaskedLM -import sys - -tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased") - -for line in sys.stdin: - line_splitted = line.split("\t") - left_context = line_splitted[6].split(" ")[-1] - right_context = line_splitted[7].split(" ")[0] - - word = "[MASK]" - - text = f"{left_context} {word} {right_context}" - - input_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt", max_length=512, truncation=True) - - mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1][0] - - with torch.inference_mode(): - outputs = model(input_ids) - predictions = outputs[0][0, mask_token_index].softmax(dim=0) - - top_k = 500 - top_k_tokens = torch.topk(predictions, top_k).indices.tolist() - result = '' - prob_sum = 0 - for token in top_k_tokens: - word = tokenizer.convert_ids_to_tokens([token])[0] - prob = predictions[token].item() - prob_sum += prob - result += f"{word}:{prob} " - diff = 1.0 - prob_sum - result += f":{diff}" - print(result) - diff --git a/trigram.py b/trigram.py new file mode 100644 index 0000000..d181033 --- /dev/null +++ b/trigram.py @@ -0,0 +1,107 @@ +import csv +import pandas as pd +import regex as re +import nltk +import tqdm +from nltk import trigrams, word_tokenize +from collections import Counter, defaultdict +import string + +nltk.download("punkt") + 
# Fallback answer used when the model has no statistics for a context.
# Format: space-separated "word:probability" pairs; the final entry with an
# empty word carries the probability mass left for all other words.
most_common_en_word = "the:0.3 be:0.2 to:0.15 of:0.1 and:0.025 a:0.0125 :0.2125"
train_count = 150000

# Number of candidate words emitted per prediction.
_TOP_K = 6
# Minimum probability mass always reserved for words outside the top-k list.
_MIN_REMAINING_PROB = 0.01

# NOTE(review): on_bad_lines="skip" silently drops malformed rows; if that
# happens in train/in.tsv.xz but not in train/expected.tsv, contexts and
# labels end up misaligned - confirm the data has no such rows.

# train set
train_data = pd.read_csv(
    "train/in.tsv.xz",
    sep="\t",
    on_bad_lines="skip",
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=train_count,
)

# training labels
train_labels = pd.read_csv(
    "train/expected.tsv",
    sep="\t",
    on_bad_lines="skip",
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=train_count,
)

dev_data = pd.read_csv(
    "dev-0/in.tsv.xz",
    sep="\t",
    on_bad_lines="skip",
    header=None,
    quoting=csv.QUOTE_NONE,
)

test_data = pd.read_csv(
    "test-A/in.tsv.xz",
    sep="\t",
    on_bad_lines="skip",
    header=None,
    quoting=csv.QUOTE_NONE,
)


def prepare_text(text):
    """Normalize raw text before tokenization.

    Lower-cases the text, joins words hyphenated across an escaped line
    break ("-\\n"), turns remaining escaped line breaks into spaces and
    strips every Unicode punctuation character.
    """
    text = text.lower().replace("-\\n", "").replace("\\n", " ")
    # \p{P} (Unicode punctuation) requires the third-party `regex` module,
    # which the file imports under the name `re`.
    return re.sub(r"\p{P}", "", text)


def train_trigrams():
    """Fill the global ``model`` with P(word | two following words).

    Reads the ``final`` column of the global ``train_data`` and, for every
    trigram (w1, w2, w3), counts w1 under the key (w2, w3).  The counts are
    then normalized in place so each context maps to a probability
    distribution over preceding words.
    """
    for text in tqdm.tqdm(train_data["final"]):
        words = word_tokenize(prepare_text(str(text)))
        # Unpadded trigrams: padded ones would contain None and carry no
        # usable context.
        for w1, w2, w3 in trigrams(words):
            # Only the "preceding word" direction is stored, because
            # prediction looks the gap word up by the two words that follow
            # it.  Counting followers under the same key would mix two
            # unrelated distributions.
            model[(w2, w3)][w1] += 1

    for counts in model.values():
        ngram_count = float(sum(counts.values()))
        for word in counts:
            counts[word] /= ngram_count


def predict_probs(word1, word2):
    """Return the formatted prediction for the word preceding (word1, word2).

    The result lists up to ``_TOP_K`` most probable words as
    "word:probability" pairs followed by ":rest", where rest is the mass
    left for all other words (at least ``_MIN_REMAINING_PROB``).  When the
    context is unknown, ``most_common_en_word`` is returned instead.
    """
    # .get() avoids inserting an empty entry into the defaultdict for every
    # unseen context.
    candidates = model.get((word1, word2))
    if not candidates:
        return most_common_en_word

    top = Counter(candidates).most_common(_TOP_K)
    total_prob = sum(prob for _, prob in top)
    if total_prob <= 0.0:
        return most_common_en_word

    remaining_prob = 1.0 - total_prob
    if remaining_prob < _MIN_REMAINING_PROB:
        # Rescale the listed words so that, together with the reserved
        # remainder, the distribution still sums to 1.
        scale = (1.0 - _MIN_REMAINING_PROB) / total_prob
        top = [(word, prob * scale) for word, prob in top]
        remaining_prob = _MIN_REMAINING_PROB

    parts = [f"{word}:{prob}" for word, prob in top]
    parts.append(f":{remaining_prob}")
    return " ".join(parts)


def _write_predictions(data, path):
    """Write one prediction line to ``path`` per row of ``data``.

    Column 7 of ``data`` is normalized and tokenized, and its first two
    tokens are used as the context for ``predict_probs``.
    """
    with open(path, "w", encoding="utf-8") as file:
        for text in data[7]:
            words = word_tokenize(prepare_text(str(text)))
            # Only two tokens are needed to form the context.
            if len(words) < 2:
                prediction = most_common_en_word
            else:
                prediction = predict_probs(words[0], words[1])
            file.write(prediction + "\n")


def write_output():
    """Write predictions for the dev-0 and test-A sets to their out.tsv files."""
    _write_predictions(dev_data, "dev-0/out.tsv")
    _write_predictions(test_data, "test-A/out.tsv")


if __name__ == "__main__":
    # Prepare train data
    print("Preparing data...")
    train_data = train_data[[6, 7]]
    train_data = pd.concat([train_data, train_labels], axis=1)
    # Join column 6, the label (column 0) and column 7 with explicit spaces
    # so the label is not glued to the neighbouring words.
    train_data["final"] = (
        train_data[6] + " " + train_data[0] + " " + train_data[7]
    )

    # declare model: context pair -> {word: count, later probability}
    print("Preparing model...")
    model = defaultdict(lambda: defaultdict(int))

    # train model
    print("Model training...")
    train_trigrams()

    # write outputs
    print("Writing outputs...")
    write_output()