"""Count word bigrams and trigrams in an xz-compressed, tab-separated corpus.

Each line is normalised (literal ``\\n`` sequences become spaces, ASCII
punctuation is removed), split on spaces/tabs, and the resulting word stream
is scanned with a sliding window.  N-grams are stored in plain dicts keyed by
the words joined with ``_`` (for example ``"one_of"`` or ``"one_of_the"``).
"""
import lzma
import string
from collections import deque
from itertools import islice

try:
    import regex as re
except ImportError:  # only a trivial pattern is used; the stdlib engine handles it
    import re

CORPUS_PATH = 'train/in.tsv.xz'

# Number of words fed to the n-gram counter when run as a script.  The
# original loop stopped after the word with index 100, i.e. after 101 words.
WORD_LIMIT = 101

# Built once instead of once per input line.
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)
_SEPARATORS = re.compile(r'[ \t]+')

trigrams = {}
bigrams = {}
pos = 0    # unused by the active code; kept so the module-level name still exists
index = 0  # unused by the active code; kept so the module-level name still exists
words = []


def read_file(file):
    """Yield one list of words per line of ``file``.

    ``file`` is any iterable of text lines.  For each line, literal
    backslash-n sequences are replaced by a space, the trailing newline is
    dropped, every character in ``string.punctuation`` is removed, and the
    remainder is split on runs of spaces and tabs.

    Empty tokens are discarded: they used to appear for leading/trailing
    whitespace, for a space next to a tab, and for blank lines, and were then
    counted as if they were words.
    """
    for line in file:
        cleaned = (
            line.replace("\\n", " ")
            .replace("\n", "")
            .translate(_STRIP_PUNCTUATION)
        )
        yield [token for token in _SEPARATORS.split(cleaned) if token]


def get_words(file):
    """Yield every word of ``file`` in order, flattening the per-line lists."""
    for line_words in read_file(file):
        yield from line_words


def set_bigram_count(first_word, second_word, bigrams):
    """Increment the count stored in ``bigrams`` under ``"first_second"``."""
    key = f"{first_word}_{second_word}"
    bigrams[key] = bigrams.get(key, 0) + 1


def set_trigram_count(first_word, second_word, third_word, trigrams):
    """Increment the count stored in ``trigrams`` under ``"first_second_third"``."""
    key = f"{first_word}_{second_word}_{third_word}"
    trigrams[key] = trigrams.get(key, 0) + 1


def count_ngrams(words, bigrams, trigrams):
    """Add every consecutive bigram and trigram of ``words`` to the dicts.

    ``words`` is any iterable of strings; it is consumed once.  The window
    keeps only the two preceding words, so the input is never materialised.
    This replaces the former three-slot rotation, which used "slot is an
    empty string" to mean "slot not filled yet" and therefore skipped
    n-grams whenever an empty token had been stored.
    """
    previous = deque(maxlen=2)
    for word in words:
        if previous:
            set_bigram_count(previous[-1], word, bigrams)
        if len(previous) == 2:
            set_trigram_count(previous[0], previous[1], word, trigrams)
        previous.append(word)


def main():
    """Count n-grams over the first ``WORD_LIMIT`` words and print samples."""
    with lzma.open(CORPUS_PATH, mode='rt') as file:
        count_ngrams(islice(get_words(file), WORD_LIMIT), bigrams, trigrams)
    print(trigrams)

    # Show the tokenisation of the first line only.
    with lzma.open(CORPUS_PATH, mode='rt') as file:
        for line_words in read_file(file):
            words.extend(line_words)
            print(words)
            break


if __name__ == "__main__":
    main()