2 changed files with 10635 additions and 10648 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@ -1,21 +1,7 @@
 #!/usr/bin/env python
 # coding: utf-8

-# In[25]:
-
-
-import tensorflow as tf
-from tensorflow.keras.preprocessing.text import Tokenizer
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.models import Model
-from tensorflow.keras.layers import Input, Embedding, Dense, Lambda
-from gensim.models import Word2Vec
-import numpy as np
-from sklearn.model_selection import train_test_split
-from tensorflow.keras.callbacks import EarlyStopping
-
-
-# In[26]:
+# In[1]:


 import lzma
@ -28,179 +14,123 @@ def read_xz_file(file_path):
    return data


-# In[27]:
+# In[2]:


 def read_tsv_file(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
-            line = line.strip().split('\t') 
-            data.append(line)  
+            line = line.strip().split('\t')  # Split the line into fields on the tab character
+            data.append(line)  # Append the list of fields to the collected rows
    return data


-# In[28]:
+# In[3]:


 file_path = "train\\in.tsv.xz"


-# In[29]:
+# In[4]:


 data = read_xz_file(file_path)


-# In[30]:
+# In[5]:


 expected = read_tsv_file("train\\expected.tsv")


-# In[31]:
+# In[6]:


 corpus_before=[]
 corpus_after=[]
-for i in range(20000):
+for i in range(len(data)):
    corpus_before.append(str(data[i].split("\t")[6]))
    corpus_after.append(str(data[i].split("\t")[7]))


-# In[32]:
+# In[7]:


-for i in range(20000):
+for i in range(len(expected)):
    expected[i] = str(expected[i]).lower()


-# In[33]:
+# In[8]:


 corpus = []
-for i in range(20000):
+for i in range(len(expected)):
    corpus.append(corpus_before[i] + " " + expected[i] + " " + corpus_after[i])


-# In[34]:
+# In[9]:


-sentences = [text.split() for text in corpus]
+from collections import defaultdict
+from nltk import trigrams
+from nltk.tokenize import word_tokenize
+# Trigram counts keyed by a two-token context; every window is counted in both directions.
+model_trigram = defaultdict(lambda: defaultdict(float))
+dictionary_trigram = set()
+for line in corpus[:200000]:
+            tokens = word_tokenize(line)
+            for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
+                if word1 and word2 and word3:  # skip windows with a falsy element (presumably the padding symbols)
+                    model_trigram[(word2, word3)][word1] += 1  # word1 counted under the pair that follows it
+                    model_trigram[(word1, word2)][word3] += 1  # word3 counted under the pair that precedes it
+                    dictionary_trigram.update([word1, word2, word3])


-# In[35]:
+# In[10]:


-word2vec_model = Word2Vec(sentences, vector_size=70, window=5, min_count=1, workers=4)
-word2vec_model.train(sentences, total_examples=word2vec_model.corpus_count, epochs=10)
+from collections import defaultdict
+from nltk import bigrams
+from nltk.tokenize import word_tokenize
+# Bigram counts keyed by a single token; every adjacent pair is counted in both directions.
+model_bigram = defaultdict(lambda: defaultdict(float))
+dictionary_bigram = set()
+for line in corpus[:200000]:
+            tokens = word_tokenize(line)
+            for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
+                if word1 and word2:  # skip pairs with a falsy element (presumably the padding symbols)
+                    model_bigram[word2][word1] += 1  # word1 counted under the token that follows it
+                    model_bigram[word1][word2] += 1  # word2 counted under the token that precedes it
+                    dictionary_bigram.update([word1, word2])


-# In[36]:
+# In[11]:


-tokenizer = Tokenizer()
-tokenizer.fit_on_texts(corpus)
-total_words = len(tokenizer.word_index) + 1
+smoothing = 0.0001  # additive smoothing constant applied to every stored count
+for trio in model_trigram:  # trio is a context key of model_trigram
+    count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)  # observed counts plus smoothing mass for the whole vocabulary
+    for token in model_trigram[trio]:
+        model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum  # raw count is overwritten in place by its smoothed probability


-# In[37]:
+# In[12]:


-input_sequences = []
-output_words = []
-
-for before, word, after in zip(corpus_before, expected, corpus_after):
-
-    before_tokens = tokenizer.texts_to_sequences([before])[0]
-    after_tokens = tokenizer.texts_to_sequences([after])[0]
-    word_token = tokenizer.texts_to_sequences([word])[0][0]
+smoothing = 0.0001  # additive smoothing constant applied to every stored count
+for trio in model_bigram:  # despite the name, trio is a single-token context key here
+    count_sum = sum(model_bigram[trio].values()) + smoothing * len(dictionary_bigram)  # observed counts plus smoothing mass for the whole vocabulary
+    for token in model_bigram[trio]:
+        model_bigram[trio][token] = (model_bigram[trio][token] + smoothing) / count_sum  # raw count is overwritten in place by its smoothed probability


-    for i in range(1, 6):
-        input_seq = before_tokens[-(5-i):] + [word_token] + after_tokens[:i]
-        input_sequences.append(pad_sequences([input_seq], maxlen=5, padding='pre')[0])
-        output_words.append(word_token)
-
-
-input_sequences = np.array(input_sequences)
-output_words = np.array(output_words)
-
-
-# In[38]:
-
-
-embedding_matrix = np.zeros((total_words, 70))
-for word, i in tokenizer.word_index.items():
-    if word in word2vec_model.wv:
-        embedding_matrix[i] = word2vec_model.wv[word]
-
-
-# In[39]:
-
-
-X_train, X_val, y_train, y_val = train_test_split(input_sequences, output_words, test_size=0.2, random_state=33)
-
-
-# In[40]:
-
-
-input_layer = Input(shape=(5,))
-
-embedding_layer = Embedding(total_words, 70, trainable=False)(input_layer)
-
-
-
-sum_layer = Lambda(lambda x: tf.reduce_sum(x, axis=1))(embedding_layer)
-
-dense_layer1 = Dense(128, activation='relu')(sum_layer)
-dense_layer2 = Dense(64, activation='relu')(dense_layer1)
-
-
-linear_layer = Dense(70, activation='relu')(dense_layer2)
-
-
-output_layer = Dense(total_words, activation='softmax')(linear_layer)
-
-
-# In[41]:
-
-
-model = Model(inputs=input_layer, outputs=output_layer)
-model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
-early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
-
-
-
-# In[42]:
-
-
-model.fit(X_train, y_train, epochs=15, verbose=2, validation_data=(X_val, y_val), callbacks=[early_stopping])
-
-
-# In[43]:
-
-
-model.save('ngram_model_20k.keras')
-
-
-# In[44]:
-
-
-def predict_next_words(model, tokenizer, text, top_n=1):
-    sequence = tokenizer.texts_to_sequences([text])[0]
-    padded_sequence = pad_sequences([sequence], maxlen=5, padding='pre')
-    predictions = model.predict(padded_sequence)[0]
-    top_indices = np.argsort(predictions)[-top_n:][::-1]
-    top_words = [tokenizer.index_word[index] for index in top_indices]
-    top_probabilities = [predictions[index] for index in top_indices]
-    return list(zip(top_words, top_probabilities))
-
-
-# In[45]:
+# In[19]:


+from collections import Counter

+default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"

 data = read_xz_file("dev-0\\in.tsv.xz")
 corpus_before=[]
@ -211,11 +141,68 @@ for i in range(len(data)):
    
 with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
    for text in corpus_before:
-        predictions = predict_next_words(model, tokenizer, text)
-        result = " ".join([f"{word}:{round(probability,5)}" for word, probability in predictions])
-        output.write(str(result.replace("\n", "").strip() + "\n"))
+        tokens = word_tokenize(text)
+        prediction = ""
+        # The gap directly follows the left context, so condition on its LAST two tokens.
+        if len(tokens) >= 2:
+            results = model_trigram.get((tokens[-2], tokens[-1]), {})  # .get() avoids inserting unseen contexts into the defaultdict
+            if results:
+                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
+        # Back off to the bigram table; skipped when the context has no tokens at all.
+        if prediction == "" and tokens:
+            bigram_results = model_bigram.get(tokens[-1], {})
+            if bigram_results:
+                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
+        # Neither table knows the context: emit the fixed fallback distribution.
+        if prediction == "":
+            prediction = default
+        # Exactly one output line per input row.
+        output.write(str(prediction.replace("\n", "").strip() + "\n"))
+
+
+# In[ ]:





+# In[22]:
+
+
+from collections import Counter
+
+default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"
+
+data = read_xz_file("test-A\\in.tsv.xz")
+corpus_before=[]
+corpus_after=[]
+for i in range(len(data)):
+    corpus_before.append(str(data[i].split("\t")[6]))
+    corpus_after.append(str(data[i].split("\t")[7]))
+    
+with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
+    for text in corpus_before:
+        tokens = word_tokenize(text)
+        prediction = ""
+        # The gap directly follows the left context, so condition on its LAST two tokens.
+        if len(tokens) >= 2:
+            results = model_trigram.get((tokens[-2], tokens[-1]), {})  # .get() avoids inserting unseen contexts into the defaultdict
+            if results:
+                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
+        # Back off to the bigram table; skipped when the context has no tokens at all.
+        if prediction == "" and tokens:
+            bigram_results = model_bigram.get(tokens[-1], {})
+            if bigram_results:
+                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
+        # Neither table knows the context: emit the fixed fallback distribution.
+        if prediction == "":
+            prediction = default
+        # Exactly one output line per input row.
+        output.write(str(prediction.replace("\n", "").strip() + "\n"))
+
+
+# In[ ]:
+
+
+
+