Add non-complete n-grams

2022-04-03 22:59:35 +02:00 · 2022-04-03 22:59:35 +02:00 · 81f09b68d1
commit 81f09b68d1
parent e806e44383
1 changed files with 22 additions and 1 deletions
--- a/run.py
+++ b/run.py
@ -46,6 +46,8 @@ def load_train():
 def predict(search_for_words):    
    trigrams = {}
    bigrams = {}
    trigrams_nc = {}
    bigrams_nc = {}
    index = 0
    expected = open('train/expected.tsv', 'r')
    with lzma.open('train/in.tsv.xz', mode='rt') as file:
@ -58,6 +60,9 @@ def predict(search_for_words):
                if search_for_word[0] == words[0+mv] and search_for_word[1] == words[1+mv]:
                    set_bigram_count(words[0+mv], words[1+mv], bigrams)
                    set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams)
                elif search_for_word[0] == words[0+mv]:
                    set_bigram_count(words[0+mv], words[1+mv], bigrams_nc)
                    set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams_nc)
            if index == 100000:
                break
@ -66,6 +71,8 @@ def predict(search_for_words):
    print(len(search_for_words))
    print(len(bigrams))
    print(len(trigrams))
    print(len(bigrams_nc))
    print(len(trigrams_nc))
    left_context_search_for_word = {}
    for bigram in bigrams:
@ -76,13 +83,27 @@ def predict(search_for_words):
                left_context = trigram.split("_")[0]
                left_context_search_for_word[bigram] = left_context
    left_context_search_for_word_nc = {}
    for bigram in bigrams_nc:
        max_count = 0
        for trigram in trigrams_nc:
            if bigram == '_'.join(trigram.split("_")[1:3]) and trigrams_nc[trigram] > max_count:
                max_count = trigrams_nc[trigram]
                left_context = trigram.split("_")[0]
                left_context_search_for_word_nc[bigram] = left_context
    for index, search_for_word in enumerate(search_for_words):
        hash_search_for_word = '_'.join(search_for_word)
        if hash_search_for_word in left_context_search_for_word:
            left_context = left_context_search_for_word[hash_search_for_word]
            print(f"{index+1}: {left_context} {' '.join(search_for_word)} {trigrams['_'.join([left_context]+search_for_word)]/bigrams[hash_search_for_word]}")
        else:
-            print(f"{index+1}: ??? {' '.join(search_for_word)}")
+            for lfc in left_context_search_for_word_nc:
                if search_for_word[0] == lfc.split("_")[0]:
                    left_context = left_context_search_for_word[lfc]
                    print(f"{index+1}: {left_context} {' '.join(search_for_word)} {trigrams_nc['_'.join([left_context]+lfc)]/bigrams_nc[lfc]}")
                else:
                    print(f"{index+1}: ??? {' '.join(search_for_word)}")
 def load_dev():
    search_for_words = []