452662

fourgram 100k corpus
2024-04-23 21:23:41 +02:00 · 2024-04-23 20:51:53 +02:00 · 2024-04-23 20:46:13 +02:00
3 changed files with 18179 additions and 17933 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@@ -0,0 +1,246 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[1]:
+
+
+import lzma
+def read_xz_file(file_path):
+    """Read an xz-compressed UTF-8 text file into a list of cleaned lines.
+
+    Each line is lower-cased and then run through a fixed, ordered series of
+    substring replacements (see ``substitutions`` below).
+
+    Args:
+        file_path: Path of the ``.xz`` file to read.
+
+    Returns:
+        A list with one cleaned string per line of the file.
+    """
+    # Applied strictly in this order; each rule sees the output of the
+    # previous one, so the order must not be changed.
+    substitutions = (
+        ("-\\n", ""),    # hyphen followed by a literal backslash-n
+        ("\\n", " "),    # literal backslash-n
+        ("\xad", ""),    # soft hyphen
+        ("\\\\n", " "),  # two backslashes followed by "n"
+        ("\\\\", " "),   # two backslashes
+        ("\n", " "),     # real newline character (line terminator)
+    )
+    cleaned_lines = []
+    with lzma.open(file_path, 'rt', encoding='utf-8') as handle:
+        for raw in handle:
+            text = raw.lower()
+            for old, new in substitutions:
+                text = text.replace(old, new)
+            cleaned_lines.append(text)
+    return cleaned_lines
+
+
+# In[2]:
+
+
+def read_tsv_file(file_path):
+    """Read a UTF-8 tab-separated file into a list of rows.
+
+    Args:
+        file_path: Path of the TSV file to read.
+
+    Returns:
+        A list with one entry per line; each entry is the list of fields
+        obtained by stripping surrounding whitespace from the line and then
+        splitting it on tab characters.
+    """
+    with open(file_path, 'r', encoding='utf-8') as handle:
+        # Strip first (this also drops leading/trailing tabs), then split the
+        # remaining text into its tab-separated elements.
+        return [row.strip().split('\t') for row in handle]
+
+
+# In[3]:
+
+
+# Forward slashes are accepted as path separators on Windows as well as on
+# POSIX systems, whereas a backslash is a separator on Windows only.
+file_path = "train/in.tsv.xz"
+
+
+# In[4]:
+
+
+# Cleaned, lower-cased training rows (one string per line of the input file).
+data = read_xz_file(file_path)
+
+
+# In[5]:
+
+
+# One list of tab-separated fields per line of the expected-answers file.
+expected = read_tsv_file("train/expected.tsv")
+
+
+# In[6]:
+
+
+# Split every training row on tabs and keep two of its columns:
+# index 6 goes to corpus_before, index 7 to corpus_after.
+corpus_before = []
+corpus_after = []
+for row in data:
+    columns = row.split("\t")
+    corpus_before.append(columns[6])
+    corpus_after.append(columns[7])
+
+
+# In[7]:
+
+
+# Each entry of `expected` is the list of tab-separated fields returned by
+# read_tsv_file.  Join the fields back into plain text before lower-casing:
+# calling str() on the list would put its repr (e.g. "['word']") into the
+# corpus instead of the word itself.  Entries that are already strings are
+# only lower-cased, so re-running this step is harmless.
+expected = [
+    (fields if isinstance(fields, str) else " ".join(fields)).lower()
+    for fields in expected
+]
+
+
+# In[8]:
+
+
+# Rebuild each training sentence as: column-6 text, expected entry, column-7
+# text, separated by single spaces.
+corpus = []
+for idx, target in enumerate(expected):
+    corpus.append(" ".join((corpus_before[idx], target, corpus_after[idx])))
+
+
+# In[9]:
+
+
+from collections import defaultdict
+from nltk import ngrams
+from nltk.tokenize import word_tokenize
+
+# Four-gram statistics: keys are three-token tuples, values map a neighbouring
+# token to its count.
+# NOTE(review): the token *preceding* a trio and the token *following* it are
+# counted into the same mapping - confirm that mixing both directions is
+# intended.
+model = defaultdict(lambda: defaultdict(float))
+dictionary = set()
+for sentence in corpus[:100000]:  # only the first 100000 corpus lines are used
+    words = word_tokenize(sentence)
+    for gram in ngrams(words, n=4, pad_right=True, pad_left=True):
+        # Skip n-grams containing a falsy entry (e.g. a padding symbol).
+        if not all(gram):
+            continue
+        first, second, third, fourth = gram
+        model[(second, third, fourth)][first] += 1
+        model[(first, second, third)][fourth] += 1
+        dictionary.update(gram)
+
+
+# In[15]:
+
+
+from collections import defaultdict
+from nltk import trigrams
+from nltk.tokenize import word_tokenize
+
+# Trigram statistics: keys are two-token tuples, values map a neighbouring
+# token to its count.
+# NOTE(review): the token *preceding* a pair and the token *following* it are
+# counted into the same mapping - confirm that mixing both directions is
+# intended.
+model_trigram = defaultdict(lambda: defaultdict(float))
+dictionary_trigram = set()
+for sentence in corpus[:100000]:  # only the first 100000 corpus lines are used
+    words = word_tokenize(sentence)
+    for gram in trigrams(words, pad_right=True, pad_left=True):
+        # Skip trigrams containing a falsy entry (e.g. a padding symbol).
+        if not all(gram):
+            continue
+        first, second, third = gram
+        model_trigram[(second, third)][first] += 1
+        model_trigram[(first, second)][third] += 1
+        dictionary_trigram.update(gram)
+
+
+# In[18]:
+
+
+from collections import defaultdict
+from nltk import bigrams
+from nltk.tokenize import word_tokenize
+
+# Bigram statistics: keys are single tokens, values map a neighbouring token
+# to its count.  Both neighbours of a pair are recorded, so the counts are
+# symmetric in the two tokens.
+model_bigram = defaultdict(lambda: defaultdict(float))
+dictionary_bigram = set()
+for sentence in corpus[:100000]:  # only the first 100000 corpus lines are used
+    words = word_tokenize(sentence)
+    for pair in bigrams(words, pad_right=True, pad_left=True):
+        # Skip bigrams containing a falsy entry (e.g. a padding symbol).
+        if not all(pair):
+            continue
+        left, right = pair
+        model_bigram[right][left] += 1
+        model_bigram[left][right] += 1
+        dictionary_bigram.update(pair)
+
+
+# In[11]:
+
+
+# Convert the raw four-gram counts into smoothed relative frequencies:
+# (count + smoothing) / (sum of counts + smoothing * vocabulary size).
+smoothing = 0.0001
+vocabulary_mass = smoothing * len(dictionary)
+for neighbours in model.values():
+    # The denominator is fixed before any value of this context is rewritten.
+    count_sum = sum(neighbours.values()) + vocabulary_mass
+    for token, count in neighbours.items():
+        neighbours[token] = (count + smoothing) / count_sum
+
+
+# In[17]:
+
+
+# Convert the raw trigram counts into smoothed relative frequencies:
+# (count + smoothing) / (sum of counts + smoothing * vocabulary size).
+smoothing = 0.0001
+vocabulary_mass = smoothing * len(dictionary_trigram)
+for neighbours in model_trigram.values():
+    # The denominator is fixed before any value of this context is rewritten.
+    count_sum = sum(neighbours.values()) + vocabulary_mass
+    for token, count in neighbours.items():
+        neighbours[token] = (count + smoothing) / count_sum
+
+
+# In[19]:
+
+
+# Convert the raw bigram counts into smoothed relative frequencies:
+# (count + smoothing) / (sum of counts + smoothing * vocabulary size).
+smoothing = 0.0001
+vocabulary_mass = smoothing * len(dictionary_bigram)
+for neighbours in model_bigram.values():
+    # The denominator is fixed before any value of this context is rewritten.
+    count_sum = sum(neighbours.values()) + vocabulary_mass
+    for token, count in neighbours.items():
+        neighbours[token] = (count + smoothing) / count_sum
+
+
+# In[21]:
+
+
+from collections import Counter
+
+# Fallback line written when none of the n-gram models knows the context.
+default = "the:0.30000 of:0.20000 and:0.10000 to:0.10000 in:0.10000 a:0.10000 :0.10000"
+
+# Forward slashes work on Windows and POSIX alike; backslashes only on Windows.
+data = read_xz_file("dev-0/in.tsv.xz")
+corpus_before = []
+corpus_after = []
+for row in data:
+    columns = row.split("\t")
+    corpus_before.append(columns[6])
+    corpus_after.append(columns[7])
+
+# Back-off order: (model, number of context tokens it is keyed by).
+backoff_models = ((model, 3), (model_trigram, 2), (model_bigram, 1))
+
+with open("dev-0/out.tsv", "w", encoding="utf-8") as output:
+    for text in corpus_before:
+        tokens = word_tokenize(text)
+        prediction = ""
+
+        for ngram_model, order in backoff_models:
+            # Too little context for this model (also covers an empty
+            # context, which previously raised IndexError).
+            if len(tokens) < order:
+                continue
+            # The predicted word follows the text in corpus_before, so the
+            # relevant context is its LAST `order` tokens, not its first.
+            # The bigram model is keyed by a bare token, the others by tuples.
+            context = tokens[-1] if order == 1 else tuple(tokens[-order:])
+            # .get() avoids inserting an empty entry into the defaultdict for
+            # every context that was never seen in training.
+            results = ngram_model.get(context)
+            if results:
+                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
+                break
+
+        if not prediction:
+            prediction = default
+
+        # NOTE(review): unlike `default`, model-based lines carry no trailing
+        # ":<prob>" entry - confirm whether the output format requires one.
+        output.write(prediction.replace("\n", "").strip() + "\n")
+
+
+# In[ ]:
+
+
+
+
+
+# In[23]:
+
+
+from collections import Counter
+
+# Fallback line written when none of the n-gram models knows the context.
+default = "the:0.30000 of:0.20000 and:0.10000 to:0.10000 in:0.10000 a:0.10000 :0.10000"
+
+# Forward slashes work on Windows and POSIX alike; backslashes only on Windows.
+data = read_xz_file("test-A/in.tsv.xz")
+corpus_before = []
+corpus_after = []
+for row in data:
+    columns = row.split("\t")
+    corpus_before.append(columns[6])
+    corpus_after.append(columns[7])
+
+# Back-off order: (model, number of context tokens it is keyed by).
+backoff_models = ((model, 3), (model_trigram, 2), (model_bigram, 1))
+
+with open("test-A/out.tsv", "w", encoding="utf-8") as output:
+    for text in corpus_before:
+        tokens = word_tokenize(text)
+        prediction = ""
+
+        for ngram_model, order in backoff_models:
+            # Too little context for this model (also covers an empty
+            # context, which previously raised IndexError).
+            if len(tokens) < order:
+                continue
+            # The predicted word follows the text in corpus_before, so the
+            # relevant context is its LAST `order` tokens, not its first.
+            # The bigram model is keyed by a bare token, the others by tuples.
+            context = tokens[-1] if order == 1 else tuple(tokens[-order:])
+            # .get() avoids inserting an empty entry into the defaultdict for
+            # every context that was never seen in training.
+            results = ngram_model.get(context)
+            if results:
+                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
+                break
+
+        if not prediction:
+            prediction = default
+
+        # NOTE(review): unlike `default`, model-based lines carry no trailing
+        # ":<prob>" entry - confirm whether the output format requires one.
+        output.write(prediction.replace("\n", "").strip() + "\n")
+
+
+# In[ ]:
+
+
+
+
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
Author	SHA1	Message	Date
s452662	3283b55c35	452662	2024-04-23 21:23:41 +02:00
s452662	77806b3f25	452662	2024-04-23 20:51:53 +02:00
s452662	7743dd2472	fourgram 100k corpus	2024-04-23 20:46:13 +02:00