diff --git a/run.py b/run.py
index 2e707f1..faaff13 100644
--- a/run.py
+++ b/run.py
@@ -1,111 +1,66 @@
-from encodings import search_function
-import lzma
-from re import L
-import regex as re
-import string
-import queue
-# text = lzma.open('train/in.tsv.xz').read()
-def read_file(file):
-    for line in file:
-        yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', line.split("\t")[7].replace("\\n"," ").replace("\n","").lower())).split(" ")
-
-def get_words(file):
-    for words in read_file(file):
-        yield from words
-
-def set_bigram_count(first_word, second_word, bigrams):
-    if f"{first_word}_{second_word}" not in bigrams:
-        bigrams[f"{first_word}_{second_word}"] = 1
-    else:
-        bigrams[f"{first_word}_{second_word}"] += 1
-
-def set_trigram_count(first_word, second_word, third_word, trigrams):
-    if f"{first_word}_{second_word}_{third_word}" not in trigrams:
-        trigrams[f"{first_word}_{second_word}_{third_word}"] = 1
-    else:
-        trigrams[f"{first_word}_{second_word}_{third_word}"] += 1
-
-def load_train():
-    trigrams = {}
-    bigrams = {}
-    index = 0
-    expected = open('train/expected.tsv', 'r')
-    with lzma.open('train/in.tsv.xz', mode='rt') as file:
-        for words in read_file(file):
-            expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
-            mv = 0
-            if not words[0]:
-                mv = 1
-            set_bigram_count(words[0+mv], words[1+mv], bigrams)
-            set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams)
-    print(bigrams)
-    print(trigrams)
+import pandas as pd
+import csv
+from collections import Counter, defaultdict
+from nltk.tokenize import RegexpTokenizer
+from nltk import trigrams
+
+class WordPred:
-def predict(search_for_words):
-    trigrams = {}
-    bigrams = {}
-    index = 0
-    expected = open('train/expected.tsv', 'r')
-    with lzma.open('train/in.tsv.xz', mode='rt') as file:
-        for words in read_file(file):
-            expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
-            mv = 0
-            if not words[0]:
-                mv = 1
-            for search_for_word in search_for_words:
-                if search_for_word[0] == words[0+mv] and search_for_word[1] == words[1+mv]:
-                    set_bigram_count(words[0+mv], words[1+mv], bigrams)
-                    set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams)
-
-            if index == 100000:
-                break
-            index += 1
-
-    print(len(search_for_words))
-    print(len(bigrams))
-    print(len(trigrams))
-
-    left_context_search_for_word = {}
-    for bigram in bigrams:
-        max_count = 0
-        for trigram in trigrams:
-            if bigram == '_'.join(trigram.split("_")[1:3]) and trigrams[trigram] > max_count:
-                max_count = trigrams[trigram]
-                left_context = trigram.split("_")[0]
-        left_context_search_for_word[bigram] = left_context
+
+    def __init__(self):
+        self.tokenizer = RegexpTokenizer(r"\w+")
+        self.model = defaultdict(lambda: defaultdict(lambda: 0))
-
-    for index, search_for_word in enumerate(search_for_words):
-        hash_search_for_word = '_'.join(search_for_word)
-        if hash_search_for_word in left_context_search_for_word:
-            left_context = left_context_search_for_word[hash_search_for_word]
-            print(f"{index+1}: {left_context} {' '.join(search_for_word)} {trigrams['_'.join([left_context]+search_for_word)]/bigrams[hash_search_for_word]}")
+
+    def read_train_data(self, file):
+        data = pd.read_csv(file, compression='xz', sep="\t", error_bad_lines=False, index_col=0, header=None)
+        for row in data[:140000].itertuples():
+            if len(row) < 8:
+                continue
+            text = str(row[6]) + ' ' + str(row[7])
+            tokens = self.tokenizer.tokenize(text)
+            for w1, w2, w3 in trigrams(tokens, pad_right=True, pad_left=True):
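Note on the new model above: read_train_data builds a reversed trigram model for gap filling. It maps the two tokens that follow the gap, (w2, w3), to counts of the gap word w1, then normalizes each count table into conditional probabilities. A minimal self-contained sketch of the same scheme (the toy corpus and the use of Counter here are illustrative, not part of the patch):

    # Sketch of the (w2, w3) -> w1 trigram model used by WordPred.read_train_data.
    from collections import Counter, defaultdict
    from nltk import trigrams
    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r"\w+")
    model = defaultdict(Counter)

    corpus = ["the cat sat on the mat", "the dog sat on the rug"]  # toy data
    for text in corpus:
        tokens = tokenizer.tokenize(text)
        # pad_left/pad_right yield None at the edges; skip triples touching padding
        for w1, w2, w3 in trigrams(tokens, pad_left=True, pad_right=True):
            if w1 and w2 and w3:
                model[(w2, w3)][w1] += 1

    # Normalize counts to probabilities, as read_train_data does.
    for pair, counts in model.items():
        total = sum(counts.values())
        for w in counts:
            counts[w] /= total

    print(model[("sat", "on")])  # Counter({'cat': 0.5, 'dog': 0.5})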
+                if w1 and w2 and w3:
+                    self.model[(w2, w3)][w1] += 1
+
+        for word_pair in self.model:
+            num_n_grams = float(sum(self.model[word_pair].values()))
+            for word in self.model[word_pair]:
+                self.model[word_pair][word] /= num_n_grams
+
+    def generate_outputs(self, input_file, output_file):
+        data = pd.read_csv(input_file, compression='xz', sep='\t', error_bad_lines=False, index_col=0, header=None, quoting=csv.QUOTE_NONE)
+        with open(output_file, 'w') as f:
+            for row in data.itertuples():
+                text = str(row[7])
+                tokens = self.tokenizer.tokenize(text)
+                if len(tokens) < 4:
+                    prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
+                else:
+                    prediction = self.predict_probs(tokens[0], tokens[1])
+                f.write(prediction + '\n')
+
+    def predict_probs(self, word1, word2):
+        predictions = dict(self.model[word1, word2])
+        most_common = dict(Counter(predictions).most_common(6))
+
+        total_prob = 0.0
+        str_prediction = ''
+
+        for word, prob in most_common.items():
+            total_prob += prob
+            str_prediction += f'{word}:{prob} '
+
+        if total_prob == 0.0:
+            return 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
+
+        if 1 - total_prob >= 0.01:
+            str_prediction += f":{1-total_prob}"
         else:
-            print(f"{index+1}: ??? {' '.join(search_for_word)}")
+            str_prediction += ":0.01"
+
+        return str_prediction
-
-def load_dev():
-    search_for_words = []
-    with lzma.open('dev-0/in.tsv.xz', mode='rt') as file:
-        index = 0
-        for words in read_file(file):
-            if words[0]:
-                search_for_words.append([words[0], words[1]])
-            else:
-                search_for_words.append([words[1], words[2]])
-            if index == 100:
-                break
-            index += 1
-    print(search_for_words)
-    return search_for_words
-
-if __name__ == "__main__":
-    # load_train()
-    # load_dev()
-    predict(load_dev())
-    # with lzma.open('train/in.tsv.xz', mode='rt') as file:
-    #     index = 0
-    #     for _ in get_words(file):
-    #         index += 1
-    #     print(index) # 141820215
-    
\ No newline at end of file
+word_gap_prediction = WordPred()
+word_gap_prediction.read_train_data('./train/in.tsv.xz')
+# word_gap_prediction.generate_outputs('dev-0/in.tsv.xz', 'dev-0/out.tsv')
+# word_gap_prediction.generate_outputs('test-A/in.tsv.xz', 'test-A/out.tsv')
\ No newline at end of file
diff --git a/run_nc.py b/run_nc.py
index 1671f5a..3616fa7 100644
--- a/run_nc.py
+++ b/run_nc.py
@@ -6,8 +6,9 @@ import string
 import queue
 # text = lzma.open('train/in.tsv.xz').read()
 def read_file(file):
-    for line in file:
-        yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', line.split("\t")[7].replace("\\n"," ").replace("\n","").lower())).split(" ")
+    for line in file:
+        text = line.split("\t")
+        yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n"," ").replace("\n","").lower())).split(" ")
 def get_words(file):
     for words in read_file(file):
         yield from words
@@ -26,10 +27,7 @@ def set_trigram_count(first_word, second_word, third_word, trigrams):
         trigrams[f"{first_word}_{second_word}_{third_word}"] += 1
 
 def load_train():
-    trigrams = {}
-    bigrams = {}
     index = 0
-    expected = open('train/expected.tsv', 'r')
     with lzma.open('train/in.tsv.xz', mode='rt') as file:
         for words in read_file(file):
             expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
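The read_file change above joins TSV columns 6 and 7 (the text before and after the gap) instead of reading column 7 alone, then normalizes the text: literal "\n" sequences become spaces, real newlines are dropped, everything is lowercased, runs of spaces collapse, and punctuation other than apostrophes is stripped. A quick check of that cleanup on a fabricated row (the sample line is made up; only the column layout matches the dataset):

    import regex as re

    line = "id\ta\tb\tc\td\te\tHe said,  'Hello\\nworld'\tand then LEFT!\n"
    text = line.split("\t")
    # Same chain of replacements and substitutions as the updated read_file.
    words = re.sub(r"[^\w\d'\s]+", '',
                   re.sub(' +', ' ',
                          ' '.join([text[6], text[7]])
                          .replace("\\n", " ")
                          .replace("\n", "")
                          .lower())).split(" ")
    print(words)  # ['he', 'said', "'hello", "world'", 'and', 'then', 'left']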
line.split("\t") + yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n"," ").replace("\n","").lower())).split(" ") + +def get_words(file): + for words in read_file(file): + yield from words + +def set_bigram_count(first_word, second_word, bigrams): + if f"{first_word}_{second_word}" not in bigrams: + bigrams[f"{first_word}_{second_word}"] = 1 + else: + bigrams[f"{first_word}_{second_word}"] += 1 + +def set_trigram_count(first_word, second_word, third_word, trigrams): + if f"{first_word}_{second_word}_{third_word}" not in trigrams: + trigrams[f"{first_word}_{second_word}_{third_word}"] = 1 + else: + trigrams[f"{first_word}_{second_word}_{third_word}"] += 1 + +def load_train(): + trigrams = {} + bigrams = {} + index = 0 + expected = open('train/expected.tsv', 'r') + with lzma.open('train/in.tsv.xz', mode='rt') as file: + for words in read_file(file): + expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower()) + mv = 0 + if not words[0]: + mv = 1 + set_bigram_count(words[0+mv], words[1+mv], bigrams) + set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams) + print(bigrams) + print(trigrams) + + + +def predict(search_for_words): + trigrams = {} + bigrams = {} + trigrams_nc = {} + bigrams_nc = {} + index = 0 + expected = open('train/expected.tsv', 'r') + with lzma.open('train/in.tsv.xz', mode='rt') as file: + for words in read_file(file): + expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower()) + mv = 0 + if not words[0]: + mv = 1 + for search_for_word in search_for_words: + if search_for_word[0] == words[0+mv] and search_for_word[1] == words[1+mv]: + set_bigram_count(words[0+mv], words[1+mv], bigrams) + set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams) + elif search_for_word[0] == words[0+mv]: + set_bigram_count(words[0+mv], words[1+mv], bigrams_nc) + set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams_nc) + + if index == 100000: + break + index += 1 + + print(len(search_for_words)) + print(len(bigrams)) + print(len(trigrams)) + print(len(bigrams_nc)) + print(len(trigrams_nc)) + + left_context_search_for_word = {} + for bigram in bigrams: + max_count = 0 + for trigram in trigrams: + if bigram == '_'.join(trigram.split("_")[1:3]) and trigrams[trigram] > max_count: + max_count = trigrams[trigram] + left_context = trigram.split("_")[0] + left_context_search_for_word[bigram] = left_context + + left_context_search_for_word_nc = {} + for bigram in bigrams_nc: + max_count = 0 + for trigram in trigrams_nc: + if bigram == '_'.join(trigram.split("_")[1:3]) and trigrams_nc[trigram] > max_count: + max_count = trigrams_nc[trigram] + left_context = trigram.split("_")[0] + left_context_search_for_word_nc[bigram] = left_context + + for index, search_for_word in enumerate(search_for_words): + hash_search_for_word = '_'.join(search_for_word) + if hash_search_for_word in left_context_search_for_word: + left_context = left_context_search_for_word[hash_search_for_word] + print(f"{index+1}: {left_context} {' '.join(search_for_word)} {trigrams['_'.join([left_context]+search_for_word)]/bigrams[hash_search_for_word]}") + else: + for lfc in left_context_search_for_word_nc: + if search_for_word[0] == lfc.split("_")[0]: + left_context = left_context_search_for_word[lfc] + print(f"{index+1}: {left_context} {' '.join(search_for_word)} {trigrams_nc['_'.join([left_context]+lfc)]/bigrams_nc[lfc]}") + else: + print(f"{index+1}: ??? 
+def load_dev():
+    search_for_words = []
+    with lzma.open('dev-0/in.tsv.xz', mode='rt') as file:
+        index = 0
+        for words in read_file(file):
+            if words[0]:
+                search_for_words.append([words[0], words[1]])
+            else:
+                search_for_words.append([words[1], words[2]])
+            if index == 100:
+                break
+            index += 1
+    print(search_for_words)
+    return search_for_words
+
+if __name__ == "__main__":
+    # load_train()
+    # load_dev()
+    predict(load_dev())
+    # with lzma.open('train/in.tsv.xz', mode='rt') as file:
+    #     index = 0
+    #     for _ in get_words(file):
+    #         index += 1
+    #     print(index) # 141820215
+    
\ No newline at end of file
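For reference, run.py as committed only trains; the generate_outputs calls that would write the submission files are still commented out. A sketch of the intended wiring once they are enabled, with the output format each call produces (the example distribution is illustrative):

    # Train, then emit one probability line per input row.
    word_gap_prediction = WordPred()
    word_gap_prediction.read_train_data('./train/in.tsv.xz')
    word_gap_prediction.generate_outputs('dev-0/in.tsv.xz', 'dev-0/out.tsv')
    word_gap_prediction.generate_outputs('test-A/in.tsv.xz', 'test-A/out.tsv')

    # Each line in out.tsv is a sparse distribution over gap-word candidates,
    # e.g. "the:0.4 a:0.25 his:0.1 :0.25", where the trailing ":p" entry
    # assigns the leftover probability mass to all unlisted words.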