Zrobione szukanie leftcontext dla dokładnie występujących dwóch słów.

2022-04-03 17:43:04 +02:00 · 2022-04-03 17:43:04 +02:00 · 9725eb4b41
commit 9725eb4b41
parent 993eaaa168
1 changed files with 149 additions and 34 deletions
--- a/run.py
+++ b/run.py
@@ -1,10 +1,10 @@
+from encodings import search_function
 import lzma
+from re import L
 import regex as re
 import string
 import queue
 # text = lzma.open('train/in.tsv.xz').read()
-trigrams = {}
-bigrams = {}
 def read_file(file):
    for line in file:
        yield re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")
@@ -25,35 +25,150 @@ def set_trigram_count(first_word, second_word, third_word, trigrams):
    else:
        trigrams[f"{first_word}_{second_word}_{third_word}"] += 1

-with lzma.open('train/in.tsv.xz', mode='rt') as file:
-    wordNo = 1
-    word_bi_last = ""
-    words = ["", "", ""]
-    for i_, word in enumerate(get_words(file)):  
-        if len(word_bi_last) > 0:
-            set_bigram_count(word_bi_last, word, bigrams) 
-        if i_ == 1:            
-            words[0]=word_bi_last
-            words[1]=word   
-        elif i_ == 2:
-            words[2]=word                    
-            set_trigram_count(words[0], words[1], words[2], trigrams) 
-        elif i_ > 2:
-            words[0]=words[1]
-            words[1]=words[2]
-            words[2]=word
-            set_trigram_count(words[0], words[1], words[2], trigrams)              
-        word_bi_last = word
-        
-        if i_ == 10000:
-            break
-        
-text = "one of the"
-print(bigrams["political_thirst"])
-print(trigrams["to_political_thirst"])
-for trigram in trigrams:
-    if trigrams[trigram] > 1:
-        print(trigram, trigrams[trigram])
-for bigram in bigrams:
-    if bigrams[bigram] > 6:
-        print(bigram, bigrams[bigram])
+def load_train():  # Count every bigram and trigram of lower-cased words read from train/in.tsv.xz.
+    with lzma.open('train/in.tsv.xz', mode='rt') as file:
+        wordNo = 1  # not read anywhere in this function
+        word_bi_last = ""  # previous word; stays empty until one word has been consumed
+        words = ["", "", ""]  # sliding three-word window for the current trigram
+        for i_, word in enumerate(get_words(file)):  # get_words is defined outside the visible hunks
+            word = word.lower()
+            if len(word_bi_last) > 0:
+                set_bigram_count(word_bi_last, word, bigrams)  # NOTE(review): this change deletes the module-level `bigrams` dict; unless it is defined elsewhere this name is unbound - confirm
+            if i_ == 1:  # second word: seed the first two window slots
+                words[0]=word_bi_last
+                words[1]=word   
+            elif i_ == 2:  # third word: the window is full, so the first trigram is counted
+                words[2]=word  
+                set_trigram_count(words[0], words[1], words[2], trigrams)  # NOTE(review): same concern for `trigrams` - confirm
+            elif i_ > 2:  # afterwards: shift the window left by one word, then count
+                words[0]=words[1]
+                words[1]=words[2]
+                words[2]=word
+                set_trigram_count(words[0], words[1], words[2], trigrams)              
+            word_bi_last = word
+
+def predict(search_for_words):  # Print, for each "second_third" key in search_for_words, the word that most often precedes that pair in the scanned training text.
+    trigrams_complete = {}
+    bigrams_complete = {}
+    search_for_words_complete = [] # Search keys whose two words occurred adjacently, in that order, in the training text (appended once per occurrence)
+    # Scan the training text for bigrams and trigrams that match a key from search_for_words.
+    # When the word order matches, the occurrences of that bigram / trigram are counted.
+    # For trigrams only the last two words are compared with the key; afterwards the most frequent trigram per key is selected.
+    # The first word of that trigram is the word being looked for (the left context).
+    # Trigrams are stored in the dict as one string with the words joined by "_"; bigram keys are presumably built the same way (set_bigram_count is outside this view).
+    with lzma.open('train/in.tsv.xz', mode='rt') as file:
+        wordNo = 1  # not read anywhere in this function
+        word_bi_last = ""
+        words = ["", "", ""]
+        for i_, word in enumerate(get_words(file)): # Walk the text word by word; presumably get_words yields punctuation-free words (it is defined outside this view)
+            word = word.lower() # normalize to lower case
+            if len(word_bi_last) > 0: # A previous word is buffered (false on the first iteration), so the pair can be checked
+                for search_for_word in search_for_words:
+                    search_for_word_s = search_for_word.split("_")
+                    if search_for_word_s[0] == word_bi_last and search_for_word_s[1] == word: # The key's two words form a bigram present in the training text, so its count is increased
+                        search_for_words_complete.append(search_for_word)
+                        set_bigram_count(word_bi_last, word, bigrams_complete) 
+            if i_ == 1:  # Second word: buffer the first two slots of the three-word window
+                words[0]=word_bi_last
+                words[1]=word   
+            elif i_ == 2: # Three words are buffered, so the first trigram can be checked
+                words[2]=word  # This is the third word
+                for search_for_word in search_for_words:
+                    search_for_word = search_for_word.split("_")
+                    if search_for_word[0] == words[1] and search_for_word[1] == words[2]:  # The key matches the middle and last word of the trigram, so that trigram's count is increased
+                        set_trigram_count(words[0], words[1], words[2], trigrams_complete) 
+            elif i_ > 2: # Past the warm-up: shift the window by one word, like a queue
+                words[0]=words[1]
+                words[1]=words[2]
+                words[2]=word
+                for search_for_word in search_for_words:
+                    search_for_word = search_for_word.split("_")
+                    if search_for_word[0] == words[1] and search_for_word[1] == words[2]:  # same check as for the first trigram
+                        set_trigram_count(words[0], words[1], words[2], trigrams_complete)             
+            word_bi_last = word
+            
+            if i_ == 500000:  # stop once word index 500000 is reached, so only a prefix of the training text is scanned
+                break
+            
+    print (len(bigrams_complete))
+    print (len(trigrams_complete))
+    # For every matched key, find the trigram with the highest count.
+    # Because the keys are exactly the two searched words, the dict of found bigrams can be indexed with them.
+    search_for_word_complete_bicounts = {}
+    left_context_search_for_word = {}
+    for search_for_word_complete in search_for_words_complete:
+        search_for_word_complete_bicounts[search_for_word_complete] = bigrams_complete[search_for_word_complete]
+    for search_for_word_complete_bicount in search_for_word_complete_bicounts:
+        max_count = 0
+        for trigram in trigrams_complete:
+            if search_for_word_complete_bicount in trigram and trigrams_complete[trigram] > max_count:  # NOTE(review): `in` is a substring test on the joined key, so a key can also match inside a different trigram - confirm this is acceptable
+                max_count = trigrams_complete[trigram]
+                left_context = trigram.split("_")[0]
+                left_context_search_for_word[search_for_word_complete_bicount] = left_context
+    for search_for_word in left_context_search_for_word:
+        left_context = left_context_search_for_word[search_for_word]
+        print(f"{left_context} {' '.join(search_for_word.split('_'))}")
+
+    # max_count_t = 0
+    # max_bi_key = ""
+    # max_count_b = 0
+    # for key in bigrams:
+    #     for key_t in trigrams:
+    #         if key in key_t:                
+    #             if bigrams[key]>max_count_b:
+    #                 if key[0] != "_":
+    #                     max_count_b = bigrams[key]
+    #                     max_bi_key = key
+    #                 if trigrams[key_t]>max_count_t:
+    #                     if key_t[0] != "_":
+    #                         max_count_t = trigrams[key_t]
+    #                         max_key = key_t
+    # print(max_bi_key)
+    # print(max_key)
+
+def load_dev():  # Return "second_third" keys built from the first two words after each tab in the first 100 lines of dev-0/in.tsv.xz.
+    search_for_words = []
+    with lzma.open('dev-0/in.tsv.xz', mode='rt') as file:
+        index = 0  # number of lines processed so far
+        second_word = ""
+        third_word = ""
+        was_tab = False  # True while the words right after a tab are being collected
+        word_index_watch = 0  # tokens handled since the last tab (only advanced while was_tab is set)
+        for line in file:
+            for word in line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation)).replace("\t", " \t ").split(" "):  # drop newlines and punctuation, then pad tabs with spaces so each tab becomes its own token
+                word = word.lower()
+                if word == '\t':  # a tab starts a new collection and discards any partial pair
+                    was_tab = True
+                    word_index_watch = 0
+                    second_word = ""
+                    third_word = ""
+                elif was_tab:
+                    if not second_word:  # first word after the tab
+                        second_word = word
+                    elif word_index_watch == 1:  # second word after the tab: the pair is complete
+                        third_word = word
+                        search_for_words.append(f"{second_word}_{third_word}")
+                        was_tab = False
+                    else:  # a first word is set but the counter is not 1 (e.g. an empty token followed the tab): give up on this tab
+                        was_tab = False
+                        second_word = ""
+                        third_word = ""
+
+                    word_index_watch += 1
+
+            # print(line)
+            index += 1
+            if index == 100:  # only the first 100 lines are read
+                break 
+    print(search_for_words)
+    return search_for_words
+
+if __name__ == "__main__":  # script entry point
+    # load_train()
+    predict(load_dev())  # build the search keys from the dev set, then print the left-context word found for them
+    # with lzma.open('train/in.tsv.xz', mode='rt') as file:
+    #     index = 0
+    #     for _ in get_words(file):
+    #         index += 1
+    #     print(index) # 141820215
+