added run.py

20k model
2024-05-22 15:37:23 +02:00 · 2024-05-21 18:13:45 +02:00 · 2024-05-21 18:12:03 +02:00 · 2024-05-21 18:10:57 +02:00 · 2024-05-21 18:10:09 +02:00 · 2024-05-21 18:03:22 +02:00
2 changed files with 10648 additions and 10635 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@ -1,7 +1,21 @@
 #!/usr/bin/env python
 # coding: utf-8

-# In[1]:
+# In[25]:
+
+
+import tensorflow as tf
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, Embedding, Dense, Lambda
+from gensim.models import Word2Vec
+import numpy as np
+from sklearn.model_selection import train_test_split
+from tensorflow.keras.callbacks import EarlyStopping
+
+
+# In[26]:


 import lzma
@ -14,123 +28,179 @@ def read_xz_file(file_path):
    return data


-# In[2]:
+# In[27]:


 def read_tsv_file(file_path):
+    """Read a UTF-8 text file and return its lines split on tabs.
+
+    Every line is stripped of surrounding whitespace and split on tab
+    characters, so the result is a list with one inner list of string
+    fields per line of the file.
+    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
-            line = line.strip().split('\t')  # Split the line into fields on tab characters
-            data.append(line)  # Append the fields to the data list
+            line = line.strip().split('\t') 
+            data.append(line)  
    return data


-# In[3]:
+# In[28]:


 file_path = "train\\in.tsv.xz"


-# In[4]:
+# In[29]:


 data = read_xz_file(file_path)


-# In[5]:
+# In[30]:


 expected = read_tsv_file("train\\expected.tsv")


-# In[6]:
+# In[31]:


 corpus_before=[]
 corpus_after=[]
-for i in range(len(data)):
+# Only the first 20000 rows of `data` are used; the loop raises IndexError if
+# fewer rows were read. Fields 6 and 7 of each tab-separated row are collected
+# (presumably the text before / after the gap -- confirm against the dataset).
+for i in range(20000):
    corpus_before.append(str(data[i].split("\t")[6]))
    corpus_after.append(str(data[i].split("\t")[7]))


-# In[7]:
+# In[32]:


-for i in range(len(expected)):
-    expected[i] = str(expected[i]).lower()
+# read_tsv_file returns one list of fields per line, so str() on a row would
+# yield text such as "['word']" and leak list syntax (brackets, quotes) into
+# the training corpus and the labels. Take the first field instead.
+# The bound keeps the 20000-row subset without failing on a shorter file.
+for i in range(min(20000, len(expected))):
+    expected[i] = expected[i][0].lower()


-# In[8]:
+# In[33]:


 corpus = []
-for i in range(len(expected)):
+# One training text per row: before-text, expected entry and after-text,
+# joined with single spaces. Limited to the first 20000 rows.
+for i in range(20000):
    corpus.append(corpus_before[i] + " " + expected[i] + " " + corpus_after[i])


-# In[9]:
+# In[34]:


-from collections import defaultdict
-from nltk import trigrams
-from nltk.tokenize import word_tokenize
-
-model_trigram = defaultdict(lambda: defaultdict(float))
-dictionary_trigram = set()
-for line in corpus[:200000]:
-            tokens = word_tokenize(line)
-            for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
-                if word1 and word2 and word3:
-                    model_trigram[(word2, word3)][word1] += 1
-                    model_trigram[(word1, word2)][word3] += 1
-                    dictionary_trigram.update([word1, word2, word3])
+# Tokenise with the same normalisation the Keras Tokenizer applies
+# (lower-casing, punctuation filtering) so that the Word2Vec vocabulary uses
+# the same keys as tokenizer.word_index. A plain str.split() keeps case and
+# punctuation, which makes most later embedding lookups miss.
+from tensorflow.keras.preprocessing.text import text_to_word_sequence
+
+sentences = [text_to_word_sequence(text) for text in corpus]


-# In[10]:
+# In[35]:


-from collections import defaultdict
-from nltk import bigrams
-from nltk.tokenize import word_tokenize
-
-model_bigram = defaultdict(lambda: defaultdict(float))
-dictionary_bigram = set()
-for line in corpus[:200000]:
-            tokens = word_tokenize(line)
-            for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
-                if word1 and word2:
-                    model_bigram[word2][word1] += 1
-                    model_bigram[word1][word2] += 1
-                    dictionary_bigram.update([word1, word2])
+# 70-dimensional Word2Vec embeddings over the tokenised corpus.
+# NOTE(review): per the gensim documentation, passing `sentences` to the
+# constructor already builds the vocabulary and trains the model; the explicit
+# train() call below then continues for 10 more epochs -- confirm intended.
+word2vec_model = Word2Vec(sentences, vector_size=70, window=5, min_count=1, workers=4)
+word2vec_model.train(sentences, total_examples=word2vec_model.corpus_count, epochs=10)


-# In[11]:
+# In[36]:


-smoothing = 0.0001
-for trio in model_trigram:
-    count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)
-    for token in model_trigram[trio]:
-        model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum
+# Build the word -> integer index mapping from the joined training texts.
+tokenizer = Tokenizer()
+tokenizer.fit_on_texts(corpus)
+# One extra slot on top of the vocabulary size (Keras word indices start at 1,
+# leaving index 0 for padding -- see the Tokenizer documentation).
+total_words = len(tokenizer.word_index) + 1


-# In[12]:
+# In[37]:


-smoothing = 0.0001
-for trio in model_bigram:
-    count_sum = sum(model_bigram[trio].values()) + smoothing * len(dictionary_bigram)
-    for token in model_bigram[trio]:
-        model_bigram[trio][token] = (model_bigram[trio][token] + smoothing) / count_sum
+input_sequences = []
+output_words = []
+
+# Build five training windows per row; every window is labelled with the
+# index of the row's target word.
+# NOTE(review): the target token is itself part of each input window, while
+# predict_next_words only ever sees the preceding text -- confirm whether the
+# label should really be visible in the input.
+for before, word, after in zip(corpus_before, expected, corpus_after):
+    before_tokens = tokenizer.texts_to_sequences([before])[0]
+    after_tokens = tokenizer.texts_to_sequences([after])[0]
+    word_tokens = tokenizer.texts_to_sequences([word])[0]
+    if not word_tokens:
+        # The target is empty or consists only of filtered characters, so
+        # there is no label to learn from; skip instead of raising IndexError.
+        continue
+    word_token = word_tokens[0]
+
+    for i in range(1, 6):
+        n_before = 5 - i
+        # lst[-0:] returns the whole list, so a zero-length left context
+        # has to be handled explicitly.
+        left_context = before_tokens[-n_before:] if n_before else []
+        input_seq = left_context + [word_token] + after_tokens[:i]
+        # Windows longer than 5 tokens are cut down by pad_sequences.
+        input_sequences.append(pad_sequences([input_seq], maxlen=5, padding='pre')[0])
+        output_words.append(word_token)


-# In[19]:
+input_sequences = np.array(input_sequences)
+output_words = np.array(output_words)
+
+
+# In[38]:
+
+
+# Row i holds the Word2Vec vector of the word whose tokenizer index is i;
+# rows that are never assigned stay all-zero.
+embedding_matrix = np.zeros((total_words, 70))
+keyed_vectors = word2vec_model.wv
+for token, row in tokenizer.word_index.items():
+    if token not in keyed_vectors:
+        continue
+    embedding_matrix[row] = keyed_vectors[token]
+
+
+# In[39]:
+
+
+# Hold out 20% of the windows for validation; the fixed random_state keeps
+# the split reproducible between runs.
+X_train, X_val, y_train, y_val = train_test_split(input_sequences, output_words, test_size=0.2, random_state=33)
+
+
+# In[40]:
+
+
+# Each sample is a window of 5 token indices.
+input_layer = Input(shape=(5,))
+
+# Initialise the frozen embedding from the Word2Vec matrix built earlier.
+# Without an initializer the layer keeps random weights, and because it is
+# not trainable those random vectors would never improve.
+embedding_layer = Embedding(
+    total_words,
+    70,
+    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
+    trainable=False,
+)(input_layer)
+
+# Collapse the window into a single vector by summing the token embeddings.
+sum_layer = Lambda(lambda x: tf.reduce_sum(x, axis=1))(embedding_layer)
+
+dense_layer1 = Dense(128, activation='relu')(sum_layer)
+dense_layer2 = Dense(64, activation='relu')(dense_layer1)
+
+# NOTE(review): despite its name this layer uses a ReLU activation.
+linear_layer = Dense(70, activation='relu')(dense_layer2)
+
+# Probability distribution over the whole vocabulary (including index 0).
+output_layer = Dense(total_words, activation='softmax')(linear_layer)
+
+
+# In[41]:
+
+
+# Labels are integer word indices, hence the sparse categorical loss.
+model = Model(inputs=input_layer, outputs=output_layer)
+model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+# Stop once validation loss has not improved for 5 epochs and restore the
+# weights of the best epoch.
+early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
+
+
+
+# In[42]:
+
+
+# Train for at most 15 epochs, validating on the held-out split.
+model.fit(X_train, y_train, epochs=15, verbose=2, validation_data=(X_val, y_val), callbacks=[early_stopping])
+
+
+# In[43]:
+
+
+# NOTE(review): the model contains a Lambda layer wrapping a Python lambda;
+# Keras may refuse to reload such a .keras file unless safe_mode=False is
+# passed to load_model -- verify before relying on the saved artefact.
+model.save('ngram_model_20k.keras')
+
+
+# In[44]:
+
+
+def predict_next_words(model, tokenizer, text, top_n=1):
+    """Return the `top_n` most probable words for the given context text.
+
+    The text is converted to token indices with `tokenizer`, padded or cut
+    to a window of 5 indices by `pad_sequences`, and passed to `model`.
+
+    Args:
+        model: Keras model returning one probability per vocabulary index.
+        tokenizer: fitted Keras Tokenizer providing `texts_to_sequences`
+            and `index_word`.
+        text: context string to predict from.
+        top_n: number of candidates to return; values <= 0 give an empty list.
+
+    Returns:
+        A list of `(word, probability)` tuples, most probable first.
+    """
+    sequence = tokenizer.texts_to_sequences([text])[0]
+    padded_sequence = pad_sequences([sequence], maxlen=5, padding='pre')
+    # verbose=0: this runs once per input line, a progress bar each time is noise.
+    predictions = model.predict(padded_sequence, verbose=0)[0]
+    # Rank all indices by probability, dropping those without a word (index 0
+    # is the padding slot and would raise KeyError in index_word).
+    ranked = [index for index in np.argsort(predictions)[::-1] if index in tokenizer.index_word]
+    # Slicing from the front avoids the [-0:] pitfall, which would return
+    # the whole vocabulary when top_n is 0.
+    top_indices = ranked[:max(top_n, 0)]
+    top_words = [tokenizer.index_word[index] for index in top_indices]
+    top_probabilities = [predictions[index] for index in top_indices]
+    return list(zip(top_words, top_probabilities))
+
+
+# In[45]:


-from collections import Counter

-default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"

 data = read_xz_file("dev-0\\in.tsv.xz")
 corpus_before=[]
@ -141,68 +211,11 @@ for i in range(len(data)):
    
 with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
    for text in corpus_before:
-        tokens = word_tokenize(text)
-        prediction = ""
-
-        if len(tokens) >= 3:
-            results = dict(model_trigram[(tokens[0], tokens[1])])
-            if results:
-                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
-
-        if prediction == "":
-            bigram_results = dict(model_bigram[tokens[0]])
-            if bigram_results:
-                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
-
-        if prediction == "":
-            prediction = default
-
-        output.write(str(prediction.replace("\n", "").strip() + "\n"))
-
-
-# In[ ]:
+        predictions = predict_next_words(model, tokenizer, text)
+        result = " ".join([f"{word}:{round(probability,5)}" for word, probability in predictions])
+        output.write(str(result.replace("\n", "").strip() + "\n"))





-# In[22]:
-
-
-from collections import Counter
-
-default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"
-
-data = read_xz_file("test-A\\in.tsv.xz")
-corpus_before=[]
-corpus_after=[]
-for i in range(len(data)):
-    corpus_before.append(str(data[i].split("\t")[6]))
-    corpus_after.append(str(data[i].split("\t")[7]))
-    
-with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
-    for text in corpus_before:
-        tokens = word_tokenize(text)
-        prediction = ""
-
-        if len(tokens) >= 3:
-            results = dict(model_trigram[(tokens[0], tokens[1])])
-            if results:
-                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
-
-        if prediction == "":
-            bigram_results = dict(model_bigram[tokens[0]])
-            if bigram_results:
-                prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
-
-        if prediction == "":
-            prediction = default
-
-        output.write(str(prediction.replace("\n", "").strip() + "\n"))
-
-
-# In[ ]:
-
-
-
-
Author	SHA1	Message	Date
s452662	c58ffd626a	added run.py	2024-05-22 15:37:23 +02:00
s452662	3a6aac8ee8	20k model	2024-05-21 18:13:45 +02:00
s452662	dad79f5c65	20k model	2024-05-21 18:12:03 +02:00
s452662	ce767265e6	15k model	2024-05-21 18:10:57 +02:00
s452662	5e2ef57e79	20k model	2024-05-21 18:10:09 +02:00
s452662	83d3ca46fd	20k model	2024-05-21 18:03:22 +02:00
s452662	2ecf9330f7	20k model	2024-05-21 18:02:40 +02:00
s452662	97f1e60238	20k model	2024-05-21 17:59:57 +02:00
s452662	a64af691fb	20k model	2024-05-21 17:58:55 +02:00
s452662	a32cbedb08	15k model	2024-05-21 17:57:05 +02:00
s452662	2316e1b27c	20k model	2024-05-21 17:49:07 +02:00
s452662	2496330431	15k model	2024-05-21 17:47:04 +02:00
s452662	c01df98eee	fix	2024-05-20 17:11:35 +02:00
s452662	623b5be8c2	fix	2024-05-20 17:08:09 +02:00
s452662	45752bcc71	more data	2024-05-20 15:10:39 +02:00
s452662	5fa09ebed6	fix	2024-05-19 12:18:47 +02:00
s452662	e28100054e	fix	2024-05-19 12:07:20 +02:00
s452662	9e7a4719d5	fix	2024-05-19 12:01:20 +02:00
s452662	b130fe816a	fix	2024-05-19 11:50:13 +02:00
s452662	e4852ed367	fix	2024-05-19 11:49:18 +02:00
s452662	871d474805	Delete dev-0/out.tsv	2024-05-19 11:46:48 +02:00
s452662	d57b2473c8	rounded results	2024-05-19 11:45:22 +02:00
s452662	198d964a5c	Update dev-0/out.tsv	2024-05-19 11:14:14 +02:00
s452662	c9063e4542	fixed formatting	2024-05-19 11:07:39 +02:00
s452662	ac8e7b5550	first test	2024-05-18 14:25:35 +02:00