Tworzenie bigramow i trigramow metoda 1. Funkcje yield.

2022-03-26 00:16:16 +01:00 · 2022-03-26 00:16:16 +01:00 · d5888a3d7a
commit d5888a3d7a
parent 61e88a9c8c
1 changed files with 103 additions and 0 deletions
--- a/run.py
+++ b/run.py
@@ -0,0 +1,103 @@
+import lzma
+import regex as re
+import string
+# text = lzma.open('train/in.tsv.xz').read()
+trigrams = {}  # Trigram counts keyed by "w1_w2_w3"; passed to set_trigram_count below.
+bigrams = {}  # Bigram counts keyed by "w1_w2"; passed to set_bigram_count below.
+pos = 0  # Not referenced by any active code visible in this file.
+index = 0  # Only referenced by the commented-out loop further down.
+words = []  # Extended with the tokens of the first line by the preview block near the end.
+def read_file(file):  # Generator: for each line iterated from `file`, yield that line's list of tokens.
+    for line in file:
+        yield re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")  # Order matters: the literal backslash-n sequence becomes a space before punctuation (which includes the backslash) is stripped; the real newline is dropped; space runs and tabs become one space; split(" ") can still yield empty strings, e.g. for a leading space.
+
+
+def get_words(file):  # Generator: flatten the per-line token lists from read_file(file) into one stream of tokens.
+    for words in read_file(file):  # `words` is local to this function; the module-level `words` list is not modified.
+        yield from words
+
+def set_bigram_count(first_word, second_word, bigrams):  # Increment in place the count stored in the `bigrams` dict under the key "<first_word>_<second_word>"; returns None.
+    if f"{first_word}_{second_word}" not in bigrams:  # First occurrence of this pair: start the count at 1.
+        bigrams[f"{first_word}_{second_word}"] = 1
+    else:
+        bigrams[f"{first_word}_{second_word}"] += 1    
+
+def set_trigram_count(first_word, second_word, third_word, trigrams):  # Increment in place the count stored in the `trigrams` dict under the key "<first_word>_<second_word>_<third_word>"; returns None.
+    if f"{first_word}_{second_word}_{third_word}" not in trigrams:  # First occurrence of this triple: start the count at 1.
+        trigrams[f"{first_word}_{second_word}_{third_word}"] = 1
+    else:
+        trigrams[f"{first_word}_{second_word}_{third_word}"] += 1
+
+with lzma.open('train/in.tsv.xz', mode='rt') as file:  # Count bigrams and trigrams over the token stream of the xz-compressed training file, opened in text mode.
+    wordNo = 1  # Which of the three rotating slots (1, 2 or 3) receives the next word.
+    first_word = ""  # All slots start as ""; the len(...) > 0 checks below treat "" as "nothing stored".
+    second_word = ""
+    third_word = ""
+    for i_, word in enumerate(get_words(file)):
+        if wordNo == 1:  # Slot 1 is overwritten; the two preceding words are in slot 2 (older) and slot 3 (newer).
+            first_word = word
+            if len(third_word) > 0:  # False until slot 3 holds a non-empty word, so nothing is counted for the very first word.
+                set_bigram_count(third_word, first_word, bigrams)
+                if len(second_word) > 0:
+                    set_trigram_count(second_word, third_word, first_word, trigrams)   
+                    
+        elif wordNo == 2:  # The two preceding words are in slot 3 (older) and slot 1 (newer).
+            second_word = word        
+            set_bigram_count(first_word, second_word, bigrams)
+            if len(third_word) > 0:  # Skips the trigram while slot 3 is still "".
+                set_trigram_count(third_word, first_word, second_word, trigrams)      
+                          
+        elif wordNo == 3:  # No emptiness checks here: slots 1 and 2 were assigned in the two preceding iterations.
+            third_word = word
+            set_bigram_count(second_word, third_word, bigrams)
+            set_trigram_count(first_word, second_word, third_word, trigrams)
+            wordNo = 0  # Together with the increment below, this wraps the slot number back to 1.
+            
+        wordNo += 1
+        if i_ == 100:  # Stop after the first 101 words (i_ is zero-based).
+            break
+print(trigrams)  # Print the whole trigram-count dict to stdout.
+
+with lzma.open('train/in.tsv.xz', mode='rt') as file:  # Re-open the training file and print the tokens of its first line only.
+    for line in file:
+        words += re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")  # Same cleanup chain as in read_file; the tokens are appended to the module-level `words` list.
+        print(words)
+        break  # Leave the loop after the first line.
+
+# with lzma.open('train/in.tsv.xz', mode='rt') as file:
+#     for line in file:
+#         # print(line.replace("\\n"," ").replace("\n"," "))
+#         words += re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")
+#         print(words)
+#         last_two_words = []
+#         for i_, word in enumerate(words):
+#             if i_ + 2 < len(words):
+#                 if f"{words[i_+1]}_{words[i_+2]}" not in bigrams:
+#                     bigrams[f"{words[i_+1]}_{words[i_+2]}"] = 1
+#                 else:
+#                     bigrams[f"{words[i_+1]}_{words[i_+2]}"] += 1
+                    
+#                 if f"{words[i_]}_{words[i_+1]}_{words[i_+2]}" not in trigrams:
+#                     trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] = 1
+#                 else:
+#                     trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] += 1
+#             else:
+#                 last_two_words = [words[-2]]+[words[-1]]
+#         print(last_two_words)
+#         words = []
+#         # print(words)
+#         # print(re.sub(' +|\t', ' ', line).replace("\\n", " ").replace("\n","").split(" "))
+#         # break
+#         if index == 2:
+#             break
+#         index += 1
+        
+# text = "one of the"
+# print(bigrams["political_thirst"])
+# print(trigrams["to_political_thirst"])
+# for trigram in trigrams:
+#     if trigrams[trigram] > 1:
+#         print(trigram, trigrams[trigram])
+# for bigram in bigrams:
+#     if bigrams[bigram] > 6:
+#         print(bigram, bigrams[bigram])