452662

2024-04-23 21:23:41 +02:00 · 2024-04-23 21:23:41 +02:00 · 3283b55c35
commit 3283b55c35
parent 77806b3f25
3 changed files with 8867 additions and 8799 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@ -21,8 +21,8 @@ def read_tsv_file(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
-            line = line.strip().split('\t')  
+            line = line.strip().split('\t')  # Split the line into fields on the tab character
-            data.append(line)
+            data.append(line)  # Append the parsed fields to the data list
    return data
@ -87,16 +87,40 @@ for line in corpus[:100000]:
                    dictionary.update([word1, word2, word3, word4])
-# In[10]:
+# In[15]:
-model2 = model.copy()
+from collections import defaultdict
 from nltk import trigrams
 from nltk.tokenize import word_tokenize
 model_trigram = defaultdict(lambda: defaultdict(float))
 dictionary_trigram = set()
 for line in corpus[:100000]:
            tokens = word_tokenize(line)
            for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
                if word1 and word2 and word3:
                    model_trigram[(word2, word3)][word1] += 1
                    model_trigram[(word1, word2)][word3] += 1
                    dictionary_trigram.update([word1, word2, word3])
-# In[ ]:
+# In[18]:
-len(model)
+from collections import defaultdict
 from nltk import bigrams
 from nltk.tokenize import word_tokenize
 model_bigram = defaultdict(lambda: defaultdict(float))
 dictionary_bigram = set()
 for line in corpus[:100000]:
            tokens = word_tokenize(line)
            for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
                if word1 and word2:
                    model_bigram[word2][word1] += 1
                    model_bigram[word1][word2] += 1
                    dictionary_bigram.update([word1, word2])
 # In[11]:
@ -109,7 +133,27 @@ for trio in model:
        model[trio][token] = (model[trio][token] + smoothing) / count_sum
-# In[12]:
+# In[17]:
 smoothing = 0.0001
 for trio in model_trigram:
    count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)
    for token in model_trigram[trio]:
        model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum
 # In[19]:
 smoothing = 0.0001
 for trio in model_bigram:
    count_sum = sum(model_bigram[trio].values()) + smoothing * len(dictionary_bigram)
    for token in model_bigram[trio]:
        model_bigram[trio][token] = (model_bigram[trio][token] + smoothing) / count_sum
 # In[21]:
 from collections import Counter
@ -126,21 +170,36 @@ for i in range(len(data)):
 with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
    for text in corpus_before:
        tokens = word_tokenize(text)
-        if len(tokens) < 4:
+        prediction = ""
            prediction = default
-        results = dict(model[(tokens[0], tokens[1], tokens[2])])
+        if len(tokens) >= 4:
-        if not results:
+            results = dict(model[(tokens[0], tokens[1], tokens[2])])
-            prediction = default
+            if results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
        if prediction == "":
            trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
            if trigram_results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
        if prediction == "":
            bigram_results = dict(model_bigram[tokens[0]])
            if bigram_results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
        prediction = ' '.join(
            f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
        if prediction == "":
            prediction = default
        output.write(str(prediction.replace("\n", "").strip() + "\n"))
-# In[13]:
+# In[ ]:
 # In[23]:
 from collections import Counter
@ -157,17 +216,26 @@ for i in range(len(data)):
 with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
    for text in corpus_before:
        tokens = word_tokenize(text)
-        if len(tokens) < 4:
+        prediction = ""
            prediction = default
-        results = dict(model[(tokens[0], tokens[1], tokens[2])])
+        if len(tokens) >= 4:
-        if not results:
+            results = dict(model[(tokens[0], tokens[1], tokens[2])])
-            prediction = default
+            if results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
        if prediction == "":
            trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
            if trigram_results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
        if prediction == "":
            bigram_results = dict(model_bigram[tokens[0]])
            if bigram_results:
                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
        prediction = ' '.join(
            f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
        if prediction == "":
            prediction = default
        output.write(str(prediction.replace("\n", "").strip() + "\n"))
--- a/test-A/out.tsv
+++ b/test-A/out.tsv