Fix to correct version of mind.

2022-04-03 22:59:04 +02:00 · 2022-04-03 22:59:04 +02:00 · e806e44383
commit e806e44383
parent ecccea3cc4
1 changed files with 55 additions and 131 deletions
--- a/run.py
+++ b/run.py
@ -6,8 +6,8 @@ import string
 import queue
 # text = lzma.open('train/in.tsv.xz').read()
 def read_file(file):
-    for line in file:
-        yield re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")
+    for line in file:      
+        yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', line.split("\t")[7].replace("\\n"," ").replace("\n","").lower())).split(" ")

 def get_words(file):
    for words in read_file(file):
@ -28,156 +28,80 @@ def set_trigram_count(first_word, second_word, third_word, trigrams):
 def load_train():
    trigrams = {}
    bigrams = {}
+    index = 0
+    expected = open('train/expected.tsv', 'r')
    with lzma.open('train/in.tsv.xz', mode='rt') as file:
-        wordNo = 1
-        word_bi_last = ""
-        words = ["", "", ""]
-        for i_, word in enumerate(get_words(file)):  
-            word = word.lower()
-            if len(word_bi_last) > 0:
-                set_bigram_count(word_bi_last, word, bigrams) 
-            if i_ == 1:            
-                words[0]=word_bi_last
-                words[1]=word   
-            elif i_ == 2:
-                words[2]=word  
-                set_trigram_count(words[0], words[1], words[2], trigrams) 
-            elif i_ > 2:
-                words[0]=words[1]
-                words[1]=words[2]
-                words[2]=word
-                set_trigram_count(words[0], words[1], words[2], trigrams)              
-            word_bi_last = word
+        for words in read_file(file):
+            expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
+            mv = 0
+            if not words[0]:
+                mv = 1
+            set_bigram_count(words[0+mv], words[1+mv], bigrams)
+            set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams)
+    print(bigrams)
+    print(trigrams)

-def predict(search_for_words):
-    trigrams_complete = {} # Tablica trigramów szukanych słów które wystąpiły w tekście z dokładnie tymi samymi szukanymi słowami w tej samej kolejności
-    bigrams_complete = {} # Tablica bigramów szukanych słów które wystąpiły w tekście z dokładnie tymi samymi szukanymi słowami w tej samej kolejności
-    # bigrams_not_complete = {}
-    # trigrams_not_complete = {}
-    # search_for_words_complete = [] # Tablica szukanych słów które wystąpiły w tekście z dokładnie tymi samymi szukanymi słowami w tej samej kolejności
-    # Szukanie bigramów i trigramów które zawierają szukaną lukę dla słowa z tablicy search_for_words.
-    # Jeżeli kolejność słów się zgadza liczona jest ilość wystąpień takich bigramów i trigramów z tymi słowami.
-    # Przy czym dla trigramów sprawdzane są tylk odwa ostatnie słowa bo to logiczne. I potem sprawdzane jest który trigram dla danych słów najczęściej występuje.
-    # I to pierwsze słowo z tego trigramu dla tych danych dwóch słów jest tym słowem szukanym (leftcontext).
-    # bigramy i trigramy zapisywane są w dict jako jeden cały string a słowa odzielone są _ (podłogą).
+
+
+def predict(search_for_words):    
+    trigrams = {}
+    bigrams = {}
+    index = 0
+    expected = open('train/expected.tsv', 'r')
    with lzma.open('train/in.tsv.xz', mode='rt') as file:
-        wordNo = 1
-        word_bi_last = ""
-        words = ["", "", ""]
-        for i_, word in enumerate(get_words(file)): # lecimy po kolei słowo po słowie. Słow ma usunięte wszelkie interpunkcja 
-            word = word.lower() # normalizowanie na małe znaki
-            if not word:
-                continue
-            if len(word_bi_last) > 0: # Mamy już pierwsze słow zbuforowane (szczególnie potrzebne dla pierwszego przebiegu) możemy więc zapisać 
-                for search_for_word in search_for_words:
-                    search_for_word_s = search_for_word.split("_")
-                    if search_for_word_s[0] == word_bi_last and search_for_word_s[1] == word: # Jeżeli szukane słowa tworzą bigram występujący w tekście trenującym to zwiększamy liczbę jego wystąpień
-                        set_bigram_count(word_bi_last, word, bigrams_complete) 
-                    # elif search_for_word_s[0] == word_bi_last:
-                    #     set_bigram_count(word_bi_last, word, bigrams_not_complete)
-            if i_ == 1:  # If potrzebny aby zbuforować min 3 wyrazy dla trigramu w początkowej fazie przebiegu pętli.          
-                words[0]=word_bi_last
-                words[1]=word   
-            elif i_ == 2: # są już zbuforowane 3 słowa więc można zacząć zliczać trigramy tylko w początkowej fazie przebiegu pętli.
-                words[2]=word  # To jest to 3 słowo
-                for search_for_word in search_for_words:
-                    search_for_word = search_for_word.split("_")
-                    if search_for_word[0] == words[1] and search_for_word[1] == words[2]:  # Jeżeli szukane słowa należą do przedostatniego i ostatniego słowa trigramu to jest zwiększana liczba wystąpień tego trigramu.           
-                        set_trigram_count(words[0], words[1], words[2], trigrams_complete) 
-                    # elif search_for_word[0] == words[1]:
-                    #     set_trigram_count(words[0], words[1], words[2], trigrams_not_complete) 
-            elif i_ > 2: # Jest to już ponad 2 przebieg pętli więc możemy rotować wyrazy jak w kolecje. Dla trigramów.
-                words[0]=words[1]
-                words[1]=words[2]
-                words[2]=word
-                for search_for_word in search_for_words:
-                    search_for_word = search_for_word.split("_")
-                    if search_for_word[0] == words[1] and search_for_word[1] == words[2]:  
-                        set_trigram_count(words[0], words[1], words[2], trigrams_complete)       
-                    # elif search_for_word[0] == words[1]:
-                    #     set_trigram_count(words[0], words[1], words[2], trigrams_not_complete)      
-            word_bi_last = word
-            
-            if i_ == 500000:
+        for words in read_file(file):
+            expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
+            mv = 0
+            if not words[0]:
+                mv = 1
+            for search_for_word in search_for_words:
+                if search_for_word[0] == words[0+mv] and search_for_word[1] == words[1+mv]:
+                    set_bigram_count(words[0+mv], words[1+mv], bigrams)
+                    set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams)
+    
+            if index == 100000:
                break
+            index += 1
            
    print(len(search_for_words))
-    print(len(bigrams_complete))
-    print(len(trigrams_complete))
-    # print(len(bigrams_complete), len(bigrams_not_complete), len(bigrams_complete)+len(bigrams_not_complete))
-    # print(len(trigrams_complete), len(trigrams_not_complete), len(trigrams_complete)+len(trigrams_not_complete))
-    # Szukanie trigramu który najczęściej wystąpił dla każdych dokadnie tych samych co szukanych danych dwóch słów z tablicy serch_for_word.
-    # Dotyczy dkoładnie pasujących bigramów z szukanymi słowami
+    print(len(bigrams))
+    print(len(trigrams))
+    
    left_context_search_for_word = {}
-    for bigram_complete in bigrams_complete:
+    for bigram in bigrams:
        max_count = 0
-        for trigram in trigrams_complete:
-            if bigram_complete == '_'.join(trigram.split("_")[1:3]) and trigrams_complete[trigram] > max_count:
-                max_count = trigrams_complete[trigram]
+        for trigram in trigrams:
+            if bigram == '_'.join(trigram.split("_")[1:3]) and trigrams[trigram] > max_count:
+                max_count = trigrams[trigram]
                left_context = trigram.split("_")[0]
-                left_context_search_for_word[bigram_complete] = left_context
+                left_context_search_for_word[bigram] = left_context

-    # # Szukanie trigramu który najczęściej wystąpił dla pierwszego szukanego słowa z szukanych słów z tablicy serch_for_word.
-    # # To w przypadku gdyby szukane słowa w ogóle nie znalazły swojego dopasowania w zbiorze train to wtedy dostaną jakieś tam prawdopodobieństwo dla tego pierwszego słow z szukanych słów.
-    # left_context_search_for_word_not_complete = {}
-    # for bigram_not_complete in bigrams_not_complete:
-    #     max_count = 0
-    #     for trigram in trigrams_not_complete:
-    #         if bigram_not_complete == '_'.join(trigram.split("_")[1:3]) and trigrams_not_complete[trigram] > max_count:
-    #             max_count = trigrams_not_complete[trigram]
-    #             left_context = trigram.split("_")[0]
-    #             left_context_search_for_word_not_complete[bigram_not_complete] = left_context
-
-    for search_for_word in search_for_words:
-        if search_for_word in left_context_search_for_word:
-            left_context = left_context_search_for_word[search_for_word]
-            print(f"{left_context} {' '.join(search_for_word.split('_'))} {trigrams_complete['_'.join([left_context, search_for_word])]/bigrams_complete[search_for_word]}")
+    for index, search_for_word in enumerate(search_for_words):
+        hash_search_for_word = '_'.join(search_for_word)
+        if hash_search_for_word in left_context_search_for_word:
+            left_context = left_context_search_for_word[hash_search_for_word]
+            print(f"{index+1}: {left_context} {' '.join(search_for_word)} {trigrams['_'.join([left_context]+search_for_word)]/bigrams[hash_search_for_word]}")
+        else:
+            print(f"{index+1}: ??? {' '.join(search_for_word)}")

 def load_dev():
-    # Ładowanie zbioru testującego
-    # Luka została oznaczona jako znak tabulacji (\t)
    search_for_words = []
    with lzma.open('dev-0/in.tsv.xz', mode='rt') as file:
        index = 0
-        second_word = ""
-        third_word = ""
-        was_tab = False
-        word_index_watch = 0
-        for line in file:
-            # Wczytanie linijiki i dzielenie jej na słowa, w przypadku napotkania luki (znaku \t) dodanie spacji aby oznaczyć jako słowo. Czyszczenie słów z różnych dziwnych znaków.
-            for word in line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation)).replace("\t", " \t ").split(" "):
-                word = word.lower()
-                if not word: # omijamy pusty znak wynikający z podziału przez spacje, dokońca nie wiem dlaczego się pojawia raczej nie powienien.
-                    continue
-                # Napotkał lukę czyli kolejne dwa wyrazy będą brane jako bigramy i na tych bigramach (zmienna search_for_words) będzie dokonywana predykcja.
-                if word == '\t':
-                    was_tab = True
-                    word_index_watch = 0
-                    second_word = ""
-                    third_word = ""
-                elif was_tab: # Wystąpiła wcześniej luka (szukane pierwsze słowo) czyli zapisujemy słowo jako drugie słowo z bigramu, przeskakujemy iterację i potem zapisujemy trzecie słowo jeżeli w między czasie nie wystąpi jakaś luka inaczej zaczynamy proces od nowa.
-                    if not second_word: # Sprawdzamy czy już drugie słowo nie zostało zbuforowane, jeżeli tak to oznacza, że teraz czekamy na trzecie słowo
-                        second_word = word # Buforujemy drugie słowo
-                    elif word_index_watch == 1: # Kolejna iteracja czyli jest to trzecie słowo z bigramu to zapisujemy szukany bigram
-                        third_word = word
-                        search_for_words.append(f"{second_word}_{third_word}")
-                        was_tab = False # Oznaczamy, że dla tej luki mamy już bigram 
-                    else: # Jeżeli przekroczymy indeks słów to szkuamy kolejnej luki i resetujemy zmienne. W sumie do końca nie wiem czy to jest potrzebne
-                        was_tab = False
-                        second_word = ""
-                        third_word = ""
-
-                    word_index_watch += 1
-
-            # print(line)
-            index += 1
+        for words in read_file(file):
+            if words[0]:
+                search_for_words.append([words[0], words[1]])
+            else:
+                search_for_words.append([words[1], words[2]])
            if index == 100:
-                break 
+                break
+            index += 1
    print(search_for_words)
    return search_for_words

 if __name__ == "__main__":
    # load_train()
+    # load_dev()
    predict(load_dev())
    # with lzma.open('train/in.tsv.xz', mode='rt') as file:
    #     index = 0