452662 trigram

2024-04-24 14:20:00 +02:00 · 2024-04-24 14:20:00 +02:00 · 6bb3f18cf6
commit 6bb3f18cf6
parent 66ceb65baf
2 changed files with 467 additions and 505 deletions
--- a/run.py
+++ b/run.py
@@ -72,31 +72,13 @@ for i in range(len(expected)):
 # In[9]:


-from collections import defaultdict
-from nltk import ngrams
-from nltk.tokenize import word_tokenize
-
-model = defaultdict(lambda: defaultdict(float))
-dictionary = set()
-for line in corpus[:100000]:
-            tokens = word_tokenize(line)
-            for word1, word2, word3, word4 in ngrams(tokens, n=4, pad_right=True, pad_left=True):
-                if word1 and word2 and word3 and word4:
-                    model[(word2, word3, word4)][word1] += 1
-                    model[(word1, word2, word3)][word4] += 1
-                    dictionary.update([word1, word2, word3, word4])
-
-
-# In[15]:
-
-
 from collections import defaultdict
 from nltk import trigrams
 from nltk.tokenize import word_tokenize

 model_trigram = defaultdict(lambda: defaultdict(float))
 dictionary_trigram = set()
-for line in corpus[:100000]:
+for line in corpus[:200000]:
            tokens = word_tokenize(line)
            for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
                if word1 and word2 and word3:
@@ -105,7 +87,7 @@ for line in corpus[:100000]:
                    dictionary_trigram.update([word1, word2, word3])


-# In[18]:
+# In[10]:


 from collections import defaultdict
@@ -114,7 +96,7 @@ from nltk.tokenize import word_tokenize

 model_bigram = defaultdict(lambda: defaultdict(float))
 dictionary_bigram = set()
-for line in corpus[:100000]:
+for line in corpus[:200000]:
            tokens = word_tokenize(line)
            for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
                if word1 and word2:
@@ -126,16 +108,6 @@ for line in corpus[:100000]:
 # In[11]:


-smoothing = 0.0001
-for trio in model:
-    count_sum = sum(model[trio].values()) + smoothing * len(dictionary)
-    for token in model[trio]:
-        model[trio][token] = (model[trio][token] + smoothing) / count_sum
-
-
-# In[17]:
-
-
 smoothing = 0.0001
 for trio in model_trigram:
    count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)
@@ -143,7 +115,7 @@ for trio in model_trigram:
        model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum


-# In[19]:
+# In[12]:


 smoothing = 0.0001
@@ -153,12 +125,12 @@ for trio in model_bigram:
        model_bigram[trio][token] = (model_bigram[trio][token] + smoothing) / count_sum


-# In[21]:
+# In[19]:


 from collections import Counter

-default = "the:0.30000 of:0.20000 and:0.10000 to:0.10000 in:0.10000 a:0.10000 :0.10000"
+default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"

 data = read_xz_file("dev-0\\in.tsv.xz")
 corpus_before=[]
@@ -172,16 +144,11 @@ with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
        tokens = word_tokenize(text)
        prediction = ""

-        if len(tokens) >= 4:
-            results = dict(model[(tokens[0], tokens[1], tokens[2])])
+        if len(tokens) >= 3:
+            results = dict(model_trigram[(tokens[0], tokens[1])])
            if results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))

-        if prediction == "":
-            trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
-            if trigram_results:
-                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
-
        if prediction == "":
            bigram_results = dict(model_bigram[tokens[0]])
            if bigram_results:
@@ -199,12 +166,12 @@ with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:



-# In[23]:
+# In[22]:


 from collections import Counter

-default = "the:0.30000 of:0.20000 and:0.10000 to:0.10000 in:0.10000 a:0.10000 :0.10000"
+default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"

 data = read_xz_file("test-A\\in.tsv.xz")
 corpus_before=[]
@@ -218,16 +185,11 @@ with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
        tokens = word_tokenize(text)
        prediction = ""

-        if len(tokens) >= 4:
-            results = dict(model[(tokens[0], tokens[1], tokens[2])])
+        if len(tokens) >= 3:
+            results = dict(model_trigram[(tokens[0], tokens[1])])
            if results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))

-        if prediction == "":
-            trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
-            if trigram_results:
-                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
-
        if prediction == "":
            bigram_results = dict(model_bigram[tokens[0]])
            if bigram_results:
--- a/test-A/out.tsv
+++ b/test-A/out.tsv