From 730e401d240021401619d293f3b700aad9ffa266 Mon Sep 17 00:00:00 2001
From: Jan Nowak <jannow2@st.amu.edu.pl>
Date: Sun, 3 Apr 2022 23:01:42 +0200
Subject: [PATCH] Move nc n-gram handling from run.py into run_nc.py.

---
 run.py    |  23 +---------
 run_nc.py | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+), 22 deletions(-)
 create mode 100644 run_nc.py

diff --git a/run.py b/run.py
index 1671f5a..2e707f1 100644
--- a/run.py
+++ b/run.py
@@ -46,8 +46,6 @@ def load_train():
 def predict(search_for_words):    
     trigrams = {}
     bigrams = {}
-    trigrams_nc = {}
-    bigrams_nc = {}
     index = 0
     expected = open('train/expected.tsv', 'r')
     with lzma.open('train/in.tsv.xz', mode='rt') as file:
@@ -60,9 +58,6 @@ def predict(search_for_words):
                 if search_for_word[0] == words[0+mv] and search_for_word[1] == words[1+mv]:
                     set_bigram_count(words[0+mv], words[1+mv], bigrams)
                     set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams)
-                elif search_for_word[0] == words[0+mv]:
-                    set_bigram_count(words[0+mv], words[1+mv], bigrams_nc)
-                    set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams_nc)
     
             if index == 100000:
                 break
@@ -71,8 +66,6 @@ def predict(search_for_words):
     print(len(search_for_words))
     print(len(bigrams))
     print(len(trigrams))
-    print(len(bigrams_nc))
-    print(len(trigrams_nc))
     
     left_context_search_for_word = {}
     for bigram in bigrams:
@@ -82,15 +75,6 @@ def predict(search_for_words):
                 max_count = trigrams[trigram]
                 left_context = trigram.split("_")[0]
                 left_context_search_for_word[bigram] = left_context
-    
-    left_context_search_for_word_nc = {}
-    for bigram in bigrams_nc:
-        max_count = 0
-        for trigram in trigrams_nc:
-            if bigram == '_'.join(trigram.split("_")[1:3]) and trigrams_nc[trigram] > max_count:
-                max_count = trigrams_nc[trigram]
-                left_context = trigram.split("_")[0]
-                left_context_search_for_word_nc[bigram] = left_context
 
     for index, search_for_word in enumerate(search_for_words):
         hash_search_for_word = '_'.join(search_for_word)
@@ -98,12 +82,7 @@ def predict(search_for_words):
             left_context = left_context_search_for_word[hash_search_for_word]
             print(f"{index+1}: {left_context} {' '.join(search_for_word)} {trigrams['_'.join([left_context]+search_for_word)]/bigrams[hash_search_for_word]}")
         else:
-            for lfc in left_context_search_for_word_nc:
-                if search_for_word[0] == lfc.split("_")[0]:
-                    left_context = left_context_search_for_word[lfc]
-                    print(f"{index+1}: {left_context} {' '.join(search_for_word)} {trigrams_nc['_'.join([left_context]+lfc)]/bigrams_nc[lfc]}")
-                else:
-                    print(f"{index+1}: ??? {' '.join(search_for_word)}")
+            print(f"{index+1}: ??? {' '.join(search_for_word)}")
 
 def load_dev():
     search_for_words = []
diff --git a/run_nc.py b/run_nc.py
new file mode 100644
index 0000000..1671f5a
--- /dev/null
+++ b/run_nc.py
@@ -0,0 +1,132 @@
+from encodings import search_function
+import lzma
+from re import L
+import regex as re
+import string
+import queue
+# text = lzma.open('train/in.tsv.xz').read()
+def read_file(file):
+    """Yield one word list per line: 8th tab-separated field, backslash-n pairs turned into spaces, lowercased, punctuation stripped."""
+    yield from (re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', row.split("\t")[7].replace("\\n", " ").replace("\n", "").lower())).split(" ") for row in file)
+
+def get_words(file):
+    """Yield every word of `file` one at a time, flattening the per-line lists from read_file()."""
+    yield from (token for row in read_file(file) for token in row)
+
+def set_bigram_count(first_word, second_word, bigrams):
+    """Increment the count stored in `bigrams` under the key "first_second"."""
+    key = f"{first_word}_{second_word}"
+    # An unseen key starts from zero.
+    bigrams[key] = bigrams.get(key, 0) + 1
+
+def set_trigram_count(first_word, second_word, third_word, trigrams):
+    """Increment the count stored in `trigrams` under the key "first_second_third"."""
+    key = f"{first_word}_{second_word}_{third_word}"
+    # An unseen key starts from zero.
+    trigrams[key] = trigrams.get(key, 0) + 1
+
+def load_train():
+    """Print bigram and trigram counts for the whole training set.
+
+    Bigram = first two words of a train/in.tsv.xz line (past a leading empty token);
+    trigram = the matching train/expected.tsv line plus that bigram. Returns nothing.
+    """
+    trigrams, bigrams = {}, {}
+    with open('train/expected.tsv', 'r') as expected, lzma.open('train/in.tsv.xz', mode='rt') as file:
+        for words in read_file(file):
+            expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
+            mv = 0 if words[0] else 1  # skip a leading empty token
+            set_bigram_count(words[0+mv], words[1+mv], bigrams)
+            set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams)
+    print(bigrams)
+    print(trigrams)
+
+
+
+def _best_left_contexts(bigrams, trigrams):
+    """Map each bigram key to (most frequent preceding word, that trigram's count)."""
+    best = {}
+    for trigram, count in trigrams.items():
+        parts = trigram.split("_")
+        bigram = '_'.join(parts[1:3])
+        # Strict ">" keeps the first trigram seen when several share the top count.
+        if bigram in bigrams and count > best.get(bigram, ("", 0))[1]:
+            best[bigram] = (parts[0], count)
+    return best
+
+
+def predict(search_for_words):
+    """Print the most likely left-context word for every query bigram.
+
+    Counts come from the first 100001 lines of train/in.tsv.xz, read in lockstep
+    with train/expected.tsv (the left-context word). A query bigram seen in
+    training is answered from its own counts; otherwise the fallback ("nc")
+    bigram with the highest count that shares its first word is used, else "???".
+
+    Args:
+        search_for_words: list of [first_word, second_word] lists.
+    """
+    trigrams, bigrams = {}, {}
+    trigrams_nc, bigrams_nc = {}, {}
+    # "with" guarantees both files are closed, also when a line raises.
+    with open('train/expected.tsv', 'r') as expected, lzma.open('train/in.tsv.xz', mode='rt') as file:
+        for index, words in enumerate(read_file(file)):
+            expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
+            mv = 0 if words[0] else 1  # skip a leading empty token
+            for search_for_word in search_for_words:
+                if search_for_word[0] == words[0+mv] and search_for_word[1] == words[1+mv]:
+                    set_bigram_count(words[0+mv], words[1+mv], bigrams)
+                    set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams)
+                elif search_for_word[0] == words[0+mv]:
+                    set_bigram_count(words[0+mv], words[1+mv], bigrams_nc)
+                    set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams_nc)
+            if index == 100000:
+                break
+
+    print(len(search_for_words))
+    print(len(bigrams))
+    print(len(trigrams))
+    print(len(bigrams_nc))
+    print(len(trigrams_nc))
+
+    left_context_search_for_word = _best_left_contexts(bigrams, trigrams)
+    left_context_search_for_word_nc = _best_left_contexts(bigrams_nc, trigrams_nc)
+    for index, search_for_word in enumerate(search_for_words):
+        hash_search_for_word = '_'.join(search_for_word)
+        # Fallback candidates: training bigrams sharing only the query's first word.
+        candidates = [lfc for lfc in left_context_search_for_word_nc if lfc.split("_")[0] == search_for_word[0]]
+        if hash_search_for_word in left_context_search_for_word:
+            left_context, count = left_context_search_for_word[hash_search_for_word]
+            print(f"{index+1}: {left_context} {' '.join(search_for_word)} {count/bigrams[hash_search_for_word]}")
+        elif candidates:
+            lfc = max(candidates, key=bigrams_nc.get)  # fallback bigram with the highest count
+            left_context, count = left_context_search_for_word_nc[lfc]
+            print(f"{index+1}: {left_context} {' '.join(search_for_word)} {count/bigrams_nc[lfc]}")
+        else:
+            print(f"{index+1}: ??? {' '.join(search_for_word)}")
+
+def load_dev():
+    """Read, print and return the query bigrams from dev-0/in.tsv.xz.
+
+    One [first, second] list per line, for the first 101 lines at most.
+    """
+    queries = []
+    with lzma.open('dev-0/in.tsv.xz', mode='rt') as file:
+        for line_no, words in enumerate(read_file(file)):
+            start = 0 if words[0] else 1  # skip a leading empty token
+            queries.append([words[start], words[start + 1]])
+            if line_no == 100:
+                break
+    print(queries)
+    return queries
+
+if __name__ == "__main__":
+    # Load the dev-0 query bigrams and print a left-context guess for each.
+    # (Alternative entry points kept from development: load_train(), load_dev().)
+    predict(load_dev())
+    # Word-count snippet kept for reference; it was annotated with the value
+    # 141820215, presumably its output for the file below -- TODO confirm:
+    #     with lzma.open('train/in.tsv.xz', mode='rt') as file:
+    #         index = sum(1 for _ in get_words(file))
+    #     print(index)
+ 
\ No newline at end of file