From cb4381bb17379a9965252282ffc80640282d3b77 Mon Sep 17 00:00:00 2001
From: s444417
Date: Wed, 26 Apr 2023 13:16:55 +0000
Subject: [PATCH] add removed solutions

---
 bigram/bigram.py   | 115 +++++++++++++++++++++++++++++++++++++++++++
 trigram/trigram.py | 116 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 231 insertions(+)
 create mode 100644 bigram/bigram.py
 create mode 100644 trigram/trigram.py

diff --git a/bigram/bigram.py b/bigram/bigram.py
new file mode 100644
index 0000000..82e92ca
--- /dev/null
+++ b/bigram/bigram.py
@@ -0,0 +1,115 @@
+import csv
+import pandas as pd
+import regex as re
+import nltk
+import tqdm
+from nltk import bigrams, word_tokenize
+from collections import Counter, defaultdict
+import string
+
+nltk.download("punkt")
+
+# Fallback output line used when the context yields no prediction.
+most_common_en_word = "the:0.4 be:0.2 to:0.1 of:0.05 and:0.025 a:0.0125 :0.2125"
+train_count = 125000
+# train set
+train_data = pd.read_csv("train/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, nrows=train_count)
+
+# training labels
+train_labels = pd.read_csv("train/expected.tsv", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE,nrows=train_count)
+
+dev_data = pd.read_csv("dev-0/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
+
+test_data = pd.read_csv("test-A/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
+
+
+def prepare_text(text):
+    """Lowercase text, resolve escaped line breaks and strip punctuation."""
+    # Line breaks are handled as a literal backslash followed by "n";
+    # a "-" right before one is removed together with it, joining the halves.
+    text = text.lower().replace("-\\n", "").replace("\\n", " ")
+    text = re.sub(r"\p{P}", "", text)
+    return text
+
+
+def train_bigrams():
+    """Fill the global ``model`` with P(previous word | following word).
+
+    Counts are keyed by the following word because prediction starts from
+    the first word after the gap.
+    """
+    for _, row in tqdm.tqdm(train_data.iterrows()):
+        text = prepare_text(str(row["final"]))
+        words = word_tokenize(text)
+        for w1, w2 in bigrams(words, pad_right=True, pad_left=True):
+            # Padding yields None at both ends; skip those pairs.
+            if w1 and w2:
+                model[w2][w1] += 1
+
+    # Turn raw counts into probabilities for each following word.
+    for following in model:
+        ngram_count = float(sum(model[following].values()))
+        for previous in model[following]:
+            model[following][previous] /= ngram_count
+
+
+def predict_probs(word):
+    """Return an output line with the six most probable words before ``word``.
+
+    The line holds ``word:prob`` pairs separated by spaces and ends with
+    ``:prob``, the probability mass left for every other word.
+    """
+    # .get() keeps the defaultdict from growing an entry per unseen word.
+    prediction = Counter(model.get(word, {})).most_common(6)
+    if not prediction:
+        return most_common_en_word
+
+    total_prob = sum(prob for _, prob in prediction)
+    # Always leave at least 0.01 for words outside the top six.
+    remaining_prob = max(1 - total_prob, 0.01)
+
+    parts = [f"{w}:{prob}" for w, prob in prediction]
+    parts.append(f":{remaining_prob}")
+    return " ".join(parts)
+
+
+def _write_predictions(data, path):
+    """Write one prediction line to ``path`` for every row of ``data``."""
+    with open(path, "w", encoding="utf-8") as file:
+        for _, row in data.iterrows():
+            # Column 7 is the text that follows the gap.
+            text = prepare_text(str(row[7]))
+            words = word_tokenize(text)
+            # A bigram lookup needs only the first following word.
+            if words:
+                prediction = predict_probs(words[0])
+            else:
+                prediction = most_common_en_word
+            file.write(prediction + "\n")
+
+
+def write_output():
+    """Write predictions for the dev-0 and test-A inputs."""
+    _write_predictions(dev_data, "dev-0/out.tsv")
+    _write_predictions(test_data, "test-A/out.tsv")
+
+
+if __name__ == "__main__":
+    # Prepare train data
+    print("Preparing data...")
+    train_data = train_data[[6, 7]]
+    train_data = pd.concat([train_data, train_labels], axis=1)
+    # Join with spaces so the expected word does not fuse with its neighbours.
+    train_data["final"] = train_data[6] + " " + train_data[0] + " " + train_data[7]
+
+    # declare model
+    print("Preparing model...")
+    model = defaultdict(lambda: defaultdict(lambda: 0))
+
+    # train model
+    print("Model training...")
+    train_bigrams()
+
+    # write outputs
+    print("Writing outputs...")
+    write_output()
diff --git a/trigram/trigram.py b/trigram/trigram.py
new file mode 100644
index 0000000..bef859d
--- /dev/null
+++ b/trigram/trigram.py
@@ -0,0 +1,116 @@
+import csv
+import pandas as pd
+import regex as re
+import nltk
+import tqdm
+from nltk import trigrams, word_tokenize
+from collections import Counter, defaultdict
+import string
+
+nltk.download("punkt")
+
+# Fallback output line used when the context yields no prediction.
+most_common_en_word = "the:0.4 be:0.2 to:0.1 of:0.05 and:0.025 a:0.0125 :0.2125"
+train_count = 125000
+# train set
+train_data = pd.read_csv("train/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, nrows=train_count)
+
+# training labels
+train_labels = pd.read_csv("train/expected.tsv", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE,nrows=train_count)
+
+dev_data = pd.read_csv("dev-0/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
+
+test_data = pd.read_csv("test-A/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
+
+
+def prepare_text(text):
+    """Lowercase text, resolve escaped line breaks and strip punctuation."""
+    # Line breaks are handled as a literal backslash followed by "n";
+    # a "-" right before one is removed together with it, joining the halves.
+    text = text.lower().replace("-\\n", "").replace("\\n", " ")
+    text = re.sub(r"\p{P}", "", text)
+    return text
+
+
+def train_trigrams():
+    """Fill the global ``model`` with P(previous word | two following words).
+
+    Only this direction is counted: prediction looks up the first two words
+    after the gap, so counts of the word *after* a pair under the same key
+    would mix two unrelated distributions.
+    """
+    for _, row in tqdm.tqdm(train_data.iterrows()):
+        text = prepare_text(str(row["final"]))
+        words = word_tokenize(text)
+        for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
+            # Padding yields None at both ends; skip those triples.
+            if w1 and w2 and w3:
+                model[(w2, w3)][w1] += 1
+
+    # Turn raw counts into probabilities for each following pair.
+    for following in model:
+        ngram_count = float(sum(model[following].values()))
+        for previous in model[following]:
+            model[following][previous] /= ngram_count
+
+
+def predict_probs(word1, word2):
+    """Return an output line with the six most probable words before the pair.
+
+    The line holds ``word:prob`` pairs separated by spaces and ends with
+    ``:prob``, the probability mass left for every other word.
+    """
+    # .get() keeps the defaultdict from growing an entry per unseen pair.
+    prediction = Counter(model.get((word1, word2), {})).most_common(6)
+    if not prediction:
+        return most_common_en_word
+
+    total_prob = sum(prob for _, prob in prediction)
+    # Always leave at least 0.01 for words outside the top six.
+    remaining_prob = max(1 - total_prob, 0.01)
+
+    parts = [f"{word}:{prob}" for word, prob in prediction]
+    parts.append(f":{remaining_prob}")
+    return " ".join(parts)
+
+
+def _write_predictions(data, path):
+    """Write one prediction line to ``path`` for every row of ``data``."""
+    with open(path, "w", encoding="utf-8") as file:
+        for _, row in data.iterrows():
+            # Column 7 is the text that follows the gap.
+            text = prepare_text(str(row[7]))
+            words = word_tokenize(text)
+            # The lookup needs the first two following words.
+            if len(words) >= 2:
+                prediction = predict_probs(words[0], words[1])
+            else:
+                prediction = most_common_en_word
+            file.write(prediction + "\n")
+
+
+def write_output():
+    """Write predictions for the dev-0 and test-A inputs."""
+    _write_predictions(dev_data, "dev-0/out.tsv")
+    _write_predictions(test_data, "test-A/out.tsv")
+
+
+if __name__ == "__main__":
+    # Prepare train data
+    print("Preparing data...")
+    train_data = train_data[[6, 7]]
+    train_data = pd.concat([train_data, train_labels], axis=1)
+    # Join with spaces so the expected word does not fuse with its neighbours.
+    train_data["final"] = train_data[6] + " " + train_data[0] + " " + train_data[7]
+
+    # declare model
+    print("Preparing model...")
+    model = defaultdict(lambda: defaultdict(lambda: 0))
+
+    # train model
+    print("Model training...")
+    train_trigrams()
+
+    # write outputs
+    print("Writing outputs...")
+    write_output()