Compare commits
No commits in common. "word2vec" and "master" have entirely different histories.
21038
dev-0/out.tsv
21038
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
245
run.py
245
run.py
@ -1,21 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# In[25]:
|
||||
|
||||
|
||||
import tensorflow as tf
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.models import Model
|
||||
from tensorflow.keras.layers import Input, Embedding, Dense, Lambda
|
||||
from gensim.models import Word2Vec
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split
|
||||
from tensorflow.keras.callbacks import EarlyStopping
|
||||
|
||||
|
||||
# In[26]:
|
||||
# In[1]:
|
||||
|
||||
|
||||
import lzma
|
||||
@ -28,179 +14,123 @@ def read_xz_file(file_path):
|
||||
return data
|
||||
|
||||
|
||||
# In[27]:
|
||||
# In[2]:
|
||||
|
||||
|
||||
def read_tsv_file(file_path):
|
||||
data = []
|
||||
with open(file_path, 'r', encoding='utf-8') as file:
|
||||
for line in file:
|
||||
line = line.strip().split('\t')
|
||||
data.append(line)
|
||||
line = line.strip().split('\t') # Rozdziel linie na elementy za pomocą tabulatora
|
||||
data.append(line) # Dodaj elementy do listy danych
|
||||
return data
|
||||
|
||||
|
||||
# In[28]:
|
||||
# In[3]:
|
||||
|
||||
|
||||
file_path = "train\\in.tsv.xz"
|
||||
|
||||
|
||||
# In[29]:
|
||||
# In[4]:
|
||||
|
||||
|
||||
data = read_xz_file(file_path)
|
||||
|
||||
|
||||
# In[30]:
|
||||
# In[5]:
|
||||
|
||||
|
||||
expected = read_tsv_file("train\\expected.tsv")
|
||||
|
||||
|
||||
# In[31]:
|
||||
# In[6]:
|
||||
|
||||
|
||||
corpus_before=[]
|
||||
corpus_after=[]
|
||||
for i in range(20000):
|
||||
for i in range(len(data)):
|
||||
corpus_before.append(str(data[i].split("\t")[6]))
|
||||
corpus_after.append(str(data[i].split("\t")[7]))
|
||||
|
||||
|
||||
# In[32]:
|
||||
# In[7]:
|
||||
|
||||
|
||||
for i in range(20000):
|
||||
for i in range(len(expected)):
|
||||
expected[i] = str(expected[i]).lower()
|
||||
|
||||
|
||||
# In[33]:
|
||||
# In[8]:
|
||||
|
||||
|
||||
corpus = []
|
||||
for i in range(20000):
|
||||
for i in range(len(expected)):
|
||||
corpus.append(corpus_before[i] + " " + expected[i] + " " + corpus_after[i])
|
||||
|
||||
|
||||
# In[34]:
|
||||
# In[9]:
|
||||
|
||||
|
||||
sentences = [text.split() for text in corpus]
|
||||
from collections import defaultdict
|
||||
from nltk import trigrams
|
||||
from nltk.tokenize import word_tokenize
|
||||
|
||||
model_trigram = defaultdict(lambda: defaultdict(float))
|
||||
dictionary_trigram = set()
|
||||
for line in corpus[:200000]:
|
||||
tokens = word_tokenize(line)
|
||||
for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
|
||||
if word1 and word2 and word3:
|
||||
model_trigram[(word2, word3)][word1] += 1
|
||||
model_trigram[(word1, word2)][word3] += 1
|
||||
dictionary_trigram.update([word1, word2, word3])
|
||||
|
||||
|
||||
# In[35]:
|
||||
# In[10]:
|
||||
|
||||
|
||||
word2vec_model = Word2Vec(sentences, vector_size=70, window=5, min_count=1, workers=4)
|
||||
word2vec_model.train(sentences, total_examples=word2vec_model.corpus_count, epochs=10)
|
||||
from collections import defaultdict
|
||||
from nltk import bigrams
|
||||
from nltk.tokenize import word_tokenize
|
||||
|
||||
model_bigram = defaultdict(lambda: defaultdict(float))
|
||||
dictionary_bigram = set()
|
||||
for line in corpus[:200000]:
|
||||
tokens = word_tokenize(line)
|
||||
for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
|
||||
if word1 and word2:
|
||||
model_bigram[word2][word1] += 1
|
||||
model_bigram[word1][word2] += 1
|
||||
dictionary_bigram.update([word1, word2])
|
||||
|
||||
|
||||
# In[36]:
|
||||
# In[11]:
|
||||
|
||||
|
||||
tokenizer = Tokenizer()
|
||||
tokenizer.fit_on_texts(corpus)
|
||||
total_words = len(tokenizer.word_index) + 1
|
||||
smoothing = 0.0001
|
||||
for trio in model_trigram:
|
||||
count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)
|
||||
for token in model_trigram[trio]:
|
||||
model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum
|
||||
|
||||
|
||||
# In[37]:
|
||||
# In[12]:
|
||||
|
||||
|
||||
input_sequences = []
|
||||
output_words = []
|
||||
|
||||
for before, word, after in zip(corpus_before, expected, corpus_after):
|
||||
|
||||
before_tokens = tokenizer.texts_to_sequences([before])[0]
|
||||
after_tokens = tokenizer.texts_to_sequences([after])[0]
|
||||
word_token = tokenizer.texts_to_sequences([word])[0][0]
|
||||
smoothing = 0.0001
|
||||
for trio in model_bigram:
|
||||
count_sum = sum(model_bigram[trio].values()) + smoothing * len(dictionary_bigram)
|
||||
for token in model_bigram[trio]:
|
||||
model_bigram[trio][token] = (model_bigram[trio][token] + smoothing) / count_sum
|
||||
|
||||
|
||||
for i in range(1, 6):
|
||||
input_seq = before_tokens[-(5-i):] + [word_token] + after_tokens[:i]
|
||||
input_sequences.append(pad_sequences([input_seq], maxlen=5, padding='pre')[0])
|
||||
output_words.append(word_token)
|
||||
|
||||
|
||||
input_sequences = np.array(input_sequences)
|
||||
output_words = np.array(output_words)
|
||||
|
||||
|
||||
# In[38]:
|
||||
|
||||
|
||||
embedding_matrix = np.zeros((total_words, 70))
|
||||
for word, i in tokenizer.word_index.items():
|
||||
if word in word2vec_model.wv:
|
||||
embedding_matrix[i] = word2vec_model.wv[word]
|
||||
|
||||
|
||||
# In[39]:
|
||||
|
||||
|
||||
X_train, X_val, y_train, y_val = train_test_split(input_sequences, output_words, test_size=0.2, random_state=33)
|
||||
|
||||
|
||||
# In[40]:
|
||||
|
||||
|
||||
input_layer = Input(shape=(5,))
|
||||
|
||||
embedding_layer = Embedding(total_words, 70, trainable=False)(input_layer)
|
||||
|
||||
|
||||
|
||||
sum_layer = Lambda(lambda x: tf.reduce_sum(x, axis=1))(embedding_layer)
|
||||
|
||||
dense_layer1 = Dense(128, activation='relu')(sum_layer)
|
||||
dense_layer2 = Dense(64, activation='relu')(dense_layer1)
|
||||
|
||||
|
||||
linear_layer = Dense(70, activation='relu')(dense_layer2)
|
||||
|
||||
|
||||
output_layer = Dense(total_words, activation='softmax')(linear_layer)
|
||||
|
||||
|
||||
# In[41]:
|
||||
|
||||
|
||||
model = Model(inputs=input_layer, outputs=output_layer)
|
||||
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
|
||||
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
|
||||
|
||||
|
||||
|
||||
# In[42]:
|
||||
|
||||
|
||||
model.fit(X_train, y_train, epochs=15, verbose=2, validation_data=(X_val, y_val), callbacks=[early_stopping])
|
||||
|
||||
|
||||
# In[43]:
|
||||
|
||||
|
||||
model.save('ngram_model_20k.keras')
|
||||
|
||||
|
||||
# In[44]:
|
||||
|
||||
|
||||
def predict_next_words(model, tokenizer, text, top_n=1):
|
||||
sequence = tokenizer.texts_to_sequences([text])[0]
|
||||
padded_sequence = pad_sequences([sequence], maxlen=5, padding='pre')
|
||||
predictions = model.predict(padded_sequence)[0]
|
||||
top_indices = np.argsort(predictions)[-top_n:][::-1]
|
||||
top_words = [tokenizer.index_word[index] for index in top_indices]
|
||||
top_probabilities = [predictions[index] for index in top_indices]
|
||||
return list(zip(top_words, top_probabilities))
|
||||
|
||||
|
||||
# In[45]:
|
||||
# In[19]:
|
||||
|
||||
|
||||
from collections import Counter
|
||||
|
||||
default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"
|
||||
|
||||
data = read_xz_file("dev-0\\in.tsv.xz")
|
||||
corpus_before=[]
|
||||
@ -211,11 +141,68 @@ for i in range(len(data)):
|
||||
|
||||
with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
|
||||
for text in corpus_before:
|
||||
predictions = predict_next_words(model, tokenizer, text)
|
||||
result = " ".join([f"{word}:{round(probability,5)}" for word, probability in predictions])
|
||||
output.write(str(result.replace("\n", "").strip() + "\n"))
|
||||
tokens = word_tokenize(text)
|
||||
prediction = ""
|
||||
|
||||
if len(tokens) >= 3:
|
||||
results = dict(model_trigram[(tokens[0], tokens[1])])
|
||||
if results:
|
||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
|
||||
|
||||
if prediction == "":
|
||||
bigram_results = dict(model_bigram[tokens[0]])
|
||||
if bigram_results:
|
||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
|
||||
|
||||
if prediction == "":
|
||||
prediction = default
|
||||
|
||||
output.write(str(prediction.replace("\n", "").strip() + "\n"))
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# In[22]:
|
||||
|
||||
|
||||
from collections import Counter
|
||||
|
||||
default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"
|
||||
|
||||
data = read_xz_file("test-A\\in.tsv.xz")
|
||||
corpus_before=[]
|
||||
corpus_after=[]
|
||||
for i in range(len(data)):
|
||||
corpus_before.append(str(data[i].split("\t")[6]))
|
||||
corpus_after.append(str(data[i].split("\t")[7]))
|
||||
|
||||
with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
|
||||
for text in corpus_before:
|
||||
tokens = word_tokenize(text)
|
||||
prediction = ""
|
||||
|
||||
if len(tokens) >= 3:
|
||||
results = dict(model_trigram[(tokens[0], tokens[1])])
|
||||
if results:
|
||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
|
||||
|
||||
if prediction == "":
|
||||
bigram_results = dict(model_bigram[tokens[0]])
|
||||
if bigram_results:
|
||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
|
||||
|
||||
if prediction == "":
|
||||
prediction = default
|
||||
|
||||
output.write(str(prediction.replace("\n", "").strip() + "\n"))
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user