Zrobione dodawanie bigramów i trigramów na tablicach.

2022-03-26 01:21:57 +01:00 · 2022-03-26 01:21:57 +01:00 · 993eaaa168
commit 993eaaa168
parent d5888a3d7a
1 changed files with 30 additions and 74 deletions
--- a/run.py
+++ b/run.py
@ -1,17 +1,14 @@
 import lzma
 import regex as re
 import string
+import queue
 # text = lzma.open('train/in.tsv.xz').read()
 trigrams = {}
 bigrams = {}
-pos = 0
-index = 0
-words = []
 def read_file(file):
    for line in file:
        yield re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")

-
 def get_words(file):
    for words in read_file(file):
        yield from words
@ -30,74 +27,33 @@ def set_trigram_count(first_word, second_word, third_word, trigrams):

 with lzma.open('train/in.tsv.xz', mode='rt') as file:
    wordNo = 1
-    first_word = ""
-    second_word = ""
-    third_word = ""
-    for i_, word in enumerate(get_words(file)):
-        if wordNo == 1:
-            first_word = word
-            if len(third_word) > 0:
-                set_bigram_count(third_word, first_word, bigrams)
-                if len(second_word) > 0:
-                    set_trigram_count(second_word, third_word, first_word, trigrams)   
-                    
-        elif wordNo == 2:
-            second_word = word        
-            set_bigram_count(first_word, second_word, bigrams)
-            if len(third_word) > 0:
-                set_trigram_count(third_word, first_word, second_word, trigrams)      
-                          
-        elif wordNo == 3:
-            third_word = word
-            set_bigram_count(second_word, third_word, bigrams)
-            set_trigram_count(first_word, second_word, third_word, trigrams)
-            wordNo = 0
-            
-        wordNo += 1
-        if i_ == 100:
-            break
-print(trigrams)
-
-with lzma.open('train/in.tsv.xz', mode='rt') as file:
-    for line in file:
-        words += re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")
-        print(words)
-        break
-
-# with lzma.open('train/in.tsv.xz', mode='rt') as file:
-#     for line in file:
-#         # print(line.replace("\\n"," ").replace("\n"," "))
-#         words += re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")
-#         print(words)
-#         last_two_words = []
-#         for i_, word in enumerate(words):
-#             if i_ + 2 < len(words):
-#                 if f"{words[i_+1]}_{words[i_+2]}" not in bigrams:
-#                     bigrams[f"{words[i_+1]}_{words[i_+2]}"] = 1
-#                 else:
-#                     bigrams[f"{words[i_+1]}_{words[i_+2]}"] += 1
-                    
-#                 if f"{words[i_]}_{words[i_+1]}_{words[i_+2]}" not in trigrams:
-#                     trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] = 1
-#                 else:
-#                     trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] += 1
-#             else:
-#                 last_two_words = [words[-2]]+[words[-1]]
-#         print(last_two_words)
-#         words = []
-#         # print(words)
-#         # print(re.sub(' +|\t', ' ', line).replace("\\n", " ").replace("\n","").split(" "))
-#         # break
-#         if index == 2:
-#             break
-#         index += 1
+    word_bi_last = ""
+    words = ["", "", ""]
+    for i_, word in enumerate(get_words(file)):  
+        if len(word_bi_last) > 0:
+            set_bigram_count(word_bi_last, word, bigrams) 
+        if i_ == 1:            
+            words[0]=word_bi_last
+            words[1]=word   
+        elif i_ == 2:
+            words[2]=word                    
+            set_trigram_count(words[0], words[1], words[2], trigrams) 
+        elif i_ > 2:
+            words[0]=words[1]
+            words[1]=words[2]
+            words[2]=word
+            set_trigram_count(words[0], words[1], words[2], trigrams)              
+        word_bi_last = word
        
-# text = "one of the"
-# print(bigrams["political_thirst"])
-# print(trigrams["to_political_thirst"])
-# for trigram in trigrams:
-#     if trigrams[trigram] > 1:
-#         print(trigram, trigrams[trigram])
-# for bigram in bigrams:
-#     if bigrams[bigram] > 6:
-#         print(bigram, bigrams[bigram])
+        if i_ == 10000:
+            break
+        
+text = "one of the"
+print(bigrams["political_thirst"])
+print(trigrams["to_political_thirst"])
+for trigram in trigrams:
+    if trigrams[trigram] > 1:
+        print(trigram, trigrams[trigram])
+for bigram in bigrams:
+    if bigrams[bigram] > 6:
+        print(bigram, bigrams[bigram])