challenging-america-word-ga.../run.py

import lzma
import regex as re
import string
import queue
# text = lzma.open('train/in.tsv.xz').read()
# Module-level n-gram counters, filled in by the counting loop below.
# Keys are the words joined by "_" ("w1_w2" / "w1_w2_w3"); values are counts.
bigrams = dict()
trigrams = dict()
def read_file(file):
    """Yield the list of word tokens found on each line of ``file``.

    For every line, the literal two-character escape (a backslash followed
    by the letter ``n``) is replaced by a space, every character listed in
    ``string.punctuation`` is deleted, and what remains is split on
    whitespace.

    Args:
        file: Iterable of text lines, e.g. a file opened in text mode.

    Yields:
        list[str]: The tokens of one line, in order. Tokens are never empty
        strings; a line without any word characters yields an empty list.
    """
    # Build the punctuation-deleting table once instead of once per line.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for line in file:
        # The escaped newline must be turned into a space *before* punctuation
        # is removed; otherwise the backslash is deleted and a stray "n" is
        # glued onto the neighbouring word.
        cleaned = line.replace("\\n", " ").translate(strip_punctuation)
        # split() without a separator treats any run of whitespace (spaces,
        # tabs, the trailing newline) as one delimiter and ignores leading and
        # trailing whitespace, so no empty-string tokens are produced.
        yield cleaned.split()

def get_words(file):
    """Yield every token of ``file`` one at a time, in reading order.

    Flattens the per-line token lists produced by ``read_file`` into a
    single stream of words.
    """
    for line_tokens in read_file(file):
        for token in line_tokens:
            yield token

def set_bigram_count(first_word, second_word, bigrams):
    """Increment the count of the bigram ``first_word``/``second_word``.

    The pair is stored in ``bigrams`` under the key
    ``"<first_word>_<second_word>"``; a key seen for the first time starts
    at 1. The mapping is modified in place and nothing is returned.
    """
    key = f"{first_word}_{second_word}"
    bigrams[key] = bigrams.get(key, 0) + 1

def set_trigram_count(first_word, second_word, third_word, trigrams):
    """Increment the count of the trigram formed by the three given words.

    The triple is stored in ``trigrams`` under the key
    ``"<first_word>_<second_word>_<third_word>"``; a key seen for the first
    time starts at 1. The mapping is modified in place and nothing is
    returned.
    """
    key = f"{first_word}_{second_word}_{third_word}"
    trigrams[key] = trigrams.get(key, 0) + 1

# Index of the last token taken from the training file; tokens 0..10000 are
# processed, everything after that is ignored.
LAST_WORD_INDEX = 10000

# Fill the module-level ``bigrams`` and ``trigrams`` counters from the start
# of the training data.
with lzma.open('train/in.tsv.xz', mode='rt') as file:
    # The (at most two) most recent non-empty words, oldest first.
    history = []
    for i_, word in enumerate(get_words(file)):
        if i_ > LAST_WORD_INDEX:
            break
        # Empty tokens are not words: skip them so they can never become
        # part of a bigram or trigram key, in any position.
        if not word:
            continue
        if history:
            set_bigram_count(history[-1], word, bigrams)
        if len(history) == 2:
            set_trigram_count(history[0], history[1], word, trigrams)
        # Slide the window: keep only the two latest words for the next step.
        history = (history + [word])[-2:]

# NOTE(review): ``text`` is not used anywhere in the visible code — confirm
# whether it is still needed before removing it.
text = "one of the"

# Spot-check two hand-picked n-grams. ``.get`` prints 0 instead of aborting
# the whole report with a KeyError when the sampled text does not contain them.
print(bigrams.get("political_thirst", 0))
print(trigrams.get("to_political_thirst", 0))

# List every trigram seen more than once ...
for trigram, count in trigrams.items():
    if count > 1:
        print(trigram, count)

# ... and every bigram seen more than six times.
for bigram, count in bigrams.items():
    if count > 6:
        print(bigram, count)