challenging-america-word-ga.../run.py

from encodings import search_function
import lzma
from re import L
import regex as re
import string
import queue
# text = lzma.open('train/in.tsv.xz').read()
def read_file(file):
    """Yield the list of tokens for each line of *file*.

    For every line the literal two-character sequence backslash-n is turned
    into a space, the real newline is dropped, ASCII punctuation is removed,
    each run of spaces and each tab is replaced by a single space, and the
    result is split on single spaces.  Empty tokens can therefore appear
    (for example for leading or trailing spaces).

    Args:
        file: iterable of text lines.

    Yields:
        list[str]: the tokens of one line.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for raw_line in file:
        # The backslash must be handled before punctuation is stripped,
        # otherwise "\\n" would lose its backslash and leave a stray "n".
        flattened = raw_line.replace("\\n", " ").replace("\n", "")
        without_punctuation = flattened.translate(strip_punctuation)
        yield re.sub(' +|\t', ' ', without_punctuation).split(" ")

def get_words(file):
    """Yield every token of *file*, one at a time, in reading order.

    Flattens the per-line token lists produced by ``read_file`` into a
    single stream of words.
    """
    for line_tokens in read_file(file):
        for token in line_tokens:
            yield token

def set_bigram_count(first_word, second_word, bigrams):
    """Increment the occurrence count of a bigram in *bigrams*.

    The bigram is stored under the key ``"<first_word>_<second_word>"``;
    a key seen for the first time starts at 1.  The mapping is modified
    in place and nothing is returned.
    """
    key = f"{first_word}_{second_word}"
    bigrams[key] = bigrams.get(key, 0) + 1

def set_trigram_count(first_word, second_word, third_word, trigrams):
    """Increment the occurrence count of a trigram in *trigrams*.

    The trigram is stored under the key
    ``"<first_word>_<second_word>_<third_word>"``; a key seen for the first
    time starts at 1.  The mapping is modified in place and nothing is
    returned.
    """
    key = f"{first_word}_{second_word}_{third_word}"
    trigrams[key] = trigrams.get(key, 0) + 1

def load_train(bigrams=None, trigrams=None, path='train/in.tsv.xz'):
    """Count every bigram and trigram of the training corpus.

    Words are read with ``get_words`` and lower-cased.  A bigram is counted
    for each pair of consecutive words whose first word is non-empty; a
    trigram is counted for each window of three consecutive words, starting
    with the third word of the corpus.

    Args:
        bigrams: dict updated in place with ``"w1_w2"`` -> count.  When
            omitted, the module-level ``bigrams`` dict is used and created
            if it does not exist yet.
        trigrams: dict updated in place with ``"w1_w2_w3"`` -> count.  When
            omitted, the module-level ``trigrams`` dict is used and created
            if it does not exist yet.
        path: xz-compressed text file to read.

    Returns:
        tuple: ``(bigrams, trigrams)``, the two dicts that were updated.
    """
    # The counts used to be written to module globals that nothing in this
    # file defines, which raised NameError on the first bigram.  Keep the
    # "module-level dict" contract but make sure the dicts actually exist.
    module_scope = globals()
    if bigrams is None:
        bigrams = module_scope.setdefault("bigrams", {})
    if trigrams is None:
        trigrams = module_scope.setdefault("trigrams", {})

    # Rolling window over the two most recent words.
    before_previous = ""
    previous = ""
    with lzma.open(path, mode='rt') as file:
        for position, raw_word in enumerate(get_words(file)):
            word = raw_word.lower()
            if previous:
                set_bigram_count(previous, word, bigrams)
            # Three words are only available from the third one onwards.
            if position >= 2:
                set_trigram_count(before_previous, previous, word, trigrams)
            before_previous, previous = previous, word
    return bigrams, trigrams

def predict(search_for_words, max_words=500000, train_path='train/in.tsv.xz'):
    """Print the most frequent left-context word for each searched word pair.

    The training corpus is streamed word by word (tokens come from
    ``get_words`` and are lower-cased).  For every pair listed in
    *search_for_words* the function counts

    * how often the pair occurs as a bigram (first word non-empty), and
    * every trigram whose last two words are that pair.

    For each pair that occurred as a bigram, the first word of its most
    frequent trigram is reported as the predicted left context.  On equal
    counts the trigram seen first in the corpus wins.

    Output, in order: the number of distinct matched bigrams, the number of
    distinct matched trigrams, then one ``"<left> <first> <second>"`` line
    per pair, ordered by the pair's first occurrence in the corpus.

    Args:
        search_for_words: iterable of ``"first_second"`` strings.  Entries
            without an underscore are ignored.
        max_words: zero-based index of the last training word to process;
            ``None`` reads the whole corpus.
        train_path: xz-compressed training text file.

    Returns:
        dict: pair string -> predicted left-context word, for every pair
        that is printed.
    """
    # Parse the searched pairs once instead of re-splitting every pair for
    # every training word; membership tests on the set are O(1).
    wanted = set()
    for pair in search_for_words:
        first, separator, second = pair.partition("_")
        if separator:
            wanted.add((first, second))

    # Keys are word tuples rather than "_"-joined strings so that a pair can
    # only ever be compared with the exact last two words of a trigram.
    # Substring matching on joined keys also accepted e.g. "he_is" inside
    # "x_the_is" or "a_b" inside "a_b_c", crediting the wrong left context.
    bigram_counts = {}
    trigram_counts = {}
    before_previous = ""
    previous = ""
    with lzma.open(train_path, mode='rt') as file:
        for position, raw_word in enumerate(get_words(file)):
            word = raw_word.lower()
            if (previous, word) in wanted:
                # A bigram needs a real (non-empty) first word.
                if previous:
                    pair_key = (previous, word)
                    bigram_counts[pair_key] = bigram_counts.get(pair_key, 0) + 1
                # A trigram needs three words, i.e. the third word onwards.
                if position >= 2:
                    trigram_key = (before_previous, previous, word)
                    trigram_counts[trigram_key] = trigram_counts.get(trigram_key, 0) + 1
            before_previous, previous = previous, word

            if max_words is not None and position >= max_words:
                break

    print (len(bigram_counts))
    print (len(trigram_counts))

    # For each pair keep the left word of its most frequent trigram.  The
    # strict ">" keeps the earliest trigram when several share the maximum.
    best_by_pair = {}
    for (left, first, second), count in trigram_counts.items():
        if count > best_by_pair.get((first, second), (0, None))[0]:
            best_by_pair[(first, second)] = (count, left)

    left_context_search_for_word = {}
    for first, second in bigram_counts:
        if (first, second) not in best_by_pair:
            # The pair was only seen as the first two words of the corpus,
            # so there is no preceding word to report.
            continue
        left_context = best_by_pair[(first, second)][1]
        left_context_search_for_word[f"{first}_{second}"] = left_context
        print(f"{left_context} {first} {second}")
    return left_context_search_for_word

def load_dev(max_lines=100, path='dev-0/in.tsv.xz'):
    """Collect the leading word pair of each column after the first one.

    Every line is cleaned the same way as the training text (literal
    backslash-n becomes a space, the newline is dropped, ASCII punctuation
    is removed) and split into tab-separated columns.  For each column
    except the first, the first two non-empty, lower-cased words are joined
    as ``"first_second"``.  Columns with fewer than two words contribute
    nothing.

    Empty tokens (produced by runs of spaces or by stripped punctuation)
    are skipped, and each line is handled on its own, so a pair is never
    lost to an empty token and never spans two lines.

    Args:
        max_lines: number of lines to read from the start of the file;
            ``None`` reads all of them.
        path: xz-compressed, tab-separated text file.

    Returns:
        list[str]: the collected pairs in file order (duplicates kept).
        The list is also printed.
    """
    # Built once; it does not depend on the line being processed.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    search_for_words = []
    with lzma.open(path, mode='rt') as file:
        for line_number, line in enumerate(file):
            if max_lines is not None and line_number >= max_lines:
                break
            # Replace the literal "\\n" before stripping punctuation, since
            # the backslash itself counts as punctuation.
            cleaned = line.replace("\\n", " ").replace("\n", "").translate(strip_punctuation)
            for column in cleaned.split("\t")[1:]:
                words = [word for word in column.lower().split(" ") if word]
                if len(words) >= 2:
                    search_for_words.append(f"{words[0]}_{words[1]}")
    print(search_for_words)
    return search_for_words

if __name__ == "__main__":
    # Script entry point: read the searched word pairs from the dev set and
    # print the predicted left context for each of them.
    # Counting the full training corpus with load_train() is disabled here.
    search_pairs = load_dev()
    predict(search_pairs)
    # NOTE: per the original author's note, a one-off run that counted every
    # token yielded by get_words() over 'train/in.tsv.xz' printed 141820215.