From 993eaaa168c3673efcc269ffed8a2ba89c0687ed Mon Sep 17 00:00:00 2001 From: Jan Nowak <95jan.nowak@gmail.com> Date: Sat, 26 Mar 2022 01:21:57 +0100 Subject: [PATCH] Zrobione dodawanie bigramow i trigramow na tablicach. --- run.py | 104 +++++++++++++++++---------------------------------------- 1 file changed, 30 insertions(+), 74 deletions(-) diff --git a/run.py b/run.py index 408fc49..286bf56 100644 --- a/run.py +++ b/run.py @@ -1,17 +1,14 @@ import lzma import regex as re import string +import queue # text = lzma.open('train/in.tsv.xz').read() trigrams = {} bigrams = {} -pos = 0 -index = 0 -words = [] def read_file(file): for line in file: yield re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ") - def get_words(file): for words in read_file(file): yield from words @@ -30,74 +27,33 @@ def set_trigram_count(first_word, second_word, third_word, trigrams): with lzma.open('train/in.tsv.xz', mode='rt') as file: wordNo = 1 - first_word = "" - second_word = "" - third_word = "" - for i_, word in enumerate(get_words(file)): - if wordNo == 1: - first_word = word - if len(third_word) > 0: - set_bigram_count(third_word, first_word, bigrams) - if len(second_word) > 0: - set_trigram_count(second_word, third_word, first_word, trigrams) - - elif wordNo == 2: - second_word = word - set_bigram_count(first_word, second_word, bigrams) - if len(third_word) > 0: - set_trigram_count(third_word, first_word, second_word, trigrams) - - elif wordNo == 3: - third_word = word - set_bigram_count(second_word, third_word, bigrams) - set_trigram_count(first_word, second_word, third_word, trigrams) - wordNo = 0 - - wordNo += 1 - if i_ == 100: - break -print(trigrams) - -with lzma.open('train/in.tsv.xz', mode='rt') as file: - for line in file: - words += re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ") - print(words) - break - -# with lzma.open('train/in.tsv.xz', mode='rt') as file: -# for line in file: -# # print(line.replace("\\n"," ").replace("\n"," ")) -# words += re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ") -# print(words) -# last_two_words = [] -# for i_, word in enumerate(words): -# if i_ + 2 < len(words): -# if f"{words[i_+1]}_{words[i_+2]}" not in bigrams: -# bigrams[f"{words[i_+1]}_{words[i_+2]}"] = 1 -# else: -# bigrams[f"{words[i_+1]}_{words[i_+2]}"] += 1 - -# if f"{words[i_]}_{words[i_+1]}_{words[i_+2]}" not in trigrams: -# trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] = 1 -# else: -# trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] += 1 -# else: -# last_two_words = [words[-2]]+[words[-1]] -# print(last_two_words) -# words = [] -# # print(words) -# # print(re.sub(' +|\t', ' ', line).replace("\\n", " ").replace("\n","").split(" ")) -# # break -# if index == 2: -# break -# index += 1 + word_bi_last = "" + words = ["", "", ""] + for i_, word in enumerate(get_words(file)): + if len(word_bi_last) > 0: + set_bigram_count(word_bi_last, word, bigrams) + if i_ == 1: + words[0]=word_bi_last + words[1]=word + elif i_ == 2: + words[2]=word + set_trigram_count(words[0], words[1], words[2], trigrams) + elif i_ > 2: + words[0]=words[1] + words[1]=words[2] + words[2]=word + set_trigram_count(words[0], words[1], words[2], trigrams) + word_bi_last = word -# text = "one of the" -# print(bigrams["political_thirst"]) -# print(trigrams["to_political_thirst"]) -# for trigram in trigrams: -# if trigrams[trigram] > 1: -# print(trigram, trigrams[trigram]) -# for bigram in bigrams: -# if bigrams[bigram] > 6: -# print(bigram, bigrams[bigram]) + if i_ == 10000: + break + +text = "one of the" +print(bigrams["political_thirst"]) +print(trigrams["to_political_thirst"]) +for trigram in trigrams: + if trigrams[trigram] > 1: + print(trigram, trigrams[trigram]) +for bigram in bigrams: + if bigrams[bigram] > 6: + print(bigram, bigrams[bigram])