challenging-america-word-ga.../run.py

import lzma
import regex as re
import string
# text = lzma.open('train/in.tsv.xz').read()
trigrams = {}
bigrams = {}
pos = 0
index = 0
words = []
def read_file(file):
    """Yield one list of word tokens for every line of *file*.

    Each line is normalized before splitting:

    * the literal two-character sequence ``\\n`` (backslash + ``n``) becomes
      a space,
    * real newline characters are removed,
    * every character in ``string.punctuation`` is deleted,
    * tabs are treated like spaces.

    The line is then split on spaces and empty tokens are dropped, so
    leading/trailing separators, runs of separators and blank lines never
    produce ``""`` entries.

    Args:
        file: Any iterable of text lines (for example a file object opened
            in text mode).

    Yields:
        list[str]: The non-empty tokens of one line, in order. A line with
        no tokens yields an empty list.
    """
    # Build the punctuation-stripping table once instead of once per line.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for line in file:
        cleaned = (
            line.replace("\\n", " ")
            .replace("\n", "")
            .translate(strip_punctuation)
            .replace("\t", " ")
        )
        # Splitting on a single space and discarding empties collapses any
        # run of spaces/tabs without ever emitting an empty-string token.
        yield [token for token in cleaned.split(" ") if token]


def get_words(file):
    """Yield the tokens of *file* one at a time, across all of its lines.

    The per-line token lists produced by ``read_file(file)`` are flattened
    into a single stream, preserving order.
    """
    for line_tokens in read_file(file):
        for token in line_tokens:
            yield token

def set_bigram_count(first_word, second_word, bigrams):
    """Increment the count of the bigram ``first_word_second_word``.

    The two words are joined with an underscore to form the key in
    *bigrams*; a key seen for the first time starts at 1. The mapping is
    modified in place and nothing is returned.
    """
    key = f"{first_word}_{second_word}"
    bigrams[key] = bigrams.get(key, 0) + 1

def set_trigram_count(first_word, second_word, third_word, trigrams):
    """Increment the count of the trigram built from the three words.

    The key stored in *trigrams* is the three words joined with
    underscores; a key seen for the first time starts at 1. The mapping is
    modified in place and nothing is returned.
    """
    key = f"{first_word}_{second_word}_{third_word}"
    previous_count = trigrams.get(key, 0)
    trigrams[key] = previous_count + 1

# Count bigrams and trigrams over the word stream of the training file.
# The last three words are kept in a rotating three-slot buffer
# (first_word, second_word, third_word); wordNo selects the slot that the
# next word overwrites, so the chronological order of the slots depends on
# the current value of wordNo.
with lzma.open('train/in.tsv.xz', mode='rt') as file:
    # 1-based index of the slot the next word is written to; cycles 1, 2, 3.
    wordNo = 1
    # An empty string marks a slot that has not been filled yet.
    # NOTE(review): the len(...) > 0 guards below cannot tell an unfilled slot
    # from a slot holding an empty token, so an empty word from get_words()
    # also suppresses the guarded counts.
    first_word = ""
    second_word = ""
    third_word = ""
    for i_, word in enumerate(get_words(file)):
        if wordNo == 1:
            first_word = word
            # Preceding words, oldest first: second_word, third_word.
            # Both guards are false during the very first pass of the cycle.
            if len(third_word) > 0:
                set_bigram_count(third_word, first_word, bigrams)
                if len(second_word) > 0:
                    set_trigram_count(second_word, third_word, first_word, trigrams)   
                    
        elif wordNo == 2:
            second_word = word        
            # first_word was assigned in the previous iteration, so the bigram
            # is counted unconditionally.
            set_bigram_count(first_word, second_word, bigrams)
            # Preceding words, oldest first: third_word, first_word.
            if len(third_word) > 0:
                set_trigram_count(third_word, first_word, second_word, trigrams)      
                          
        elif wordNo == 3:
            third_word = word
            # Slots one and two were assigned in the two previous iterations,
            # so both counts are unconditional here.
            set_bigram_count(second_word, third_word, bigrams)
            set_trigram_count(first_word, second_word, third_word, trigrams)
            # Reset to 0 so the increment below wraps the cycle back to slot 1.
            wordNo = 0
            
        wordNo += 1
        # Stop after the word with index 100, i.e. only the first 101 words of
        # the stream are processed (presumably a debugging limit - confirm).
        if i_ == 100:
            break
# Dump the trigram counts collected above to stdout.
print(trigrams)

# Tokenize only the first line of the training file, append its tokens to the
# module-level `words` list and print that list.
with lzma.open('train/in.tsv.xz', mode='rt') as file:
    # Reuse read_file() so this line is tokenized exactly like the n-gram pass
    # instead of duplicating the normalization expression inline.
    for tokens in read_file(file):
        words += tokens
        print(words)
        # Only the first line is wanted.
        break

# with lzma.open('train/in.tsv.xz', mode='rt') as file:
#     for line in file:
#         # print(line.replace("\\n"," ").replace("\n"," "))
#         words += re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")
#         print(words)
#         last_two_words = []
#         for i_, word in enumerate(words):
#             if i_ + 2 < len(words):
#                 if f"{words[i_+1]}_{words[i_+2]}" not in bigrams:
#                     bigrams[f"{words[i_+1]}_{words[i_+2]}"] = 1
#                 else:
#                     bigrams[f"{words[i_+1]}_{words[i_+2]}"] += 1
                    
#                 if f"{words[i_]}_{words[i_+1]}_{words[i_+2]}" not in trigrams:
#                     trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] = 1
#                 else:
#                     trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] += 1
#             else:
#                 last_two_words = [words[-2]]+[words[-1]]
#         print(last_two_words)
#         words = []
#         # print(words)
#         # print(re.sub(' +|\t', ' ', line).replace("\\n", " ").replace("\n","").split(" "))
#         # break
#         if index == 2:
#             break
#         index += 1
        
# text = "one of the"
# print(bigrams["political_thirst"])
# print(trigrams["to_political_thirst"])
# for trigram in trigrams:
#     if trigrams[trigram] > 1:
#         print(trigram, trigrams[trigram])
# for bigram in bigrams:
#     if bigrams[bigram] > 6:
#         print(bigram, bigrams[bigram])
Tworzenie bigramow i trigramow metoda 1. Funkcje yield. 2022-03-26 00:16:16 +01:00			`import lzma`
			`import regex as re`
			`import string`
			`# text = lzma.open('train/in.tsv.xz').read()`
			`trigrams = {}`
			`bigrams = {}`
			`pos = 0`
			`index = 0`
			`words = []`
			`def read_file(file):`
			`for line in file:`
			`yield re.sub(' +\|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")`


			`def get_words(file):`
			`for words in read_file(file):`
			`yield from words`

			`def set_bigram_count(first_word, second_word, bigrams):`
			`if f"{first_word}_{second_word}" not in bigrams:`
			`bigrams[f"{first_word}_{second_word}"] = 1`
			`else:`
			`bigrams[f"{first_word}_{second_word}"] += 1`

			`def set_trigram_count(first_word, second_word, third_word, trigrams):`
			`if f"{first_word}_{second_word}_{third_word}" not in trigrams:`
			`trigrams[f"{first_word}_{second_word}_{third_word}"] = 1`
			`else:`
			`trigrams[f"{first_word}_{second_word}_{third_word}"] += 1`

			`with lzma.open('train/in.tsv.xz', mode='rt') as file:`
			`wordNo = 1`
			`first_word = ""`
			`second_word = ""`
			`third_word = ""`
			`for i_, word in enumerate(get_words(file)):`
			`if wordNo == 1:`
			`first_word = word`
			`if len(third_word) > 0:`
			`set_bigram_count(third_word, first_word, bigrams)`
			`if len(second_word) > 0:`
			`set_trigram_count(second_word, third_word, first_word, trigrams)`

			`elif wordNo == 2:`
			`second_word = word`
			`set_bigram_count(first_word, second_word, bigrams)`
			`if len(third_word) > 0:`
			`set_trigram_count(third_word, first_word, second_word, trigrams)`

			`elif wordNo == 3:`
			`third_word = word`
			`set_bigram_count(second_word, third_word, bigrams)`
			`set_trigram_count(first_word, second_word, third_word, trigrams)`
			`wordNo = 0`

			`wordNo += 1`
			`if i_ == 100:`
			`break`
			`print(trigrams)`

			`with lzma.open('train/in.tsv.xz', mode='rt') as file:`
			`for line in file:`
			`words += re.sub(' +\|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")`
			`print(words)`
			`break`

			`# with lzma.open('train/in.tsv.xz', mode='rt') as file:`
			`# for line in file:`
			`# # print(line.replace("\\n"," ").replace("\n"," "))`
			`# words += re.sub(' +\|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")`
			`# print(words)`
			`# last_two_words = []`
			`# for i_, word in enumerate(words):`
			`# if i_ + 2 < len(words):`
			`# if f"{words[i_+1]}_{words[i_+2]}" not in bigrams:`
			`# bigrams[f"{words[i_+1]}_{words[i_+2]}"] = 1`
			`# else:`
			`# bigrams[f"{words[i_+1]}_{words[i_+2]}"] += 1`

			`# if f"{words[i_]}_{words[i_+1]}_{words[i_+2]}" not in trigrams:`
			`# trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] = 1`
			`# else:`
			`# trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] += 1`
			`# else:`
			`# last_two_words = [words[-2]]+[words[-1]]`
			`# print(last_two_words)`
			`# words = []`
			`# # print(words)`
			`# # print(re.sub(' +\|\t', ' ', line).replace("\\n", " ").replace("\n","").split(" "))`
			`# # break`
			`# if index == 2:`
			`# break`
			`# index += 1`

			`# text = "one of the"`
			`# print(bigrams["political_thirst"])`
			`# print(trigrams["to_political_thirst"])`
			`# for trigram in trigrams:`
			`# if trigrams[trigram] > 1:`
			`# print(trigram, trigrams[trigram])`
			`# for bigram in bigrams:`
			`# if bigrams[bigram] > 6:`
			`# print(bigram, bigrams[bigram])`