This commit is contained in:
Mateusz 2024-05-19 11:31:01 +02:00
parent 5ec1581437
commit 3817604049
5 changed files with 1441 additions and 1433 deletions

3
.gitignore vendored
View File

@ -1,2 +1,5 @@
word2vec_100_3_polish.bin word2vec_100_3_polish.bin
word2vec_100_3_polish.bin.syn0.npy word2vec_100_3_polish.bin.syn0.npy
word2vec_300_3_polish.bin
word2vec_300_3_polish.bin.trainables.syn1neg.npy
word2vec_300_3_polish.bin.wv.vectors.npy

File diff suppressed because it is too large Load Diff

1
repo_link.tsv Normal file
View File

@ -0,0 +1 @@
https://git.wmi.amu.edu.pl/s464913/DL_Word2Vec
1 https://git.wmi.amu.edu.pl/s464913/DL_Word2Vec

14
run.py
View File

@ -17,20 +17,24 @@ def read_data():
def text_to_vector(text, word2vec, vector_size):
    """Return the mean embedding of the in-vocabulary words of *text*.

    Args:
        text: Whitespace-separated string to embed.
        word2vec: Either an object exposing its word vectors through a
            ``wv`` attribute, or the vector mapping itself. The mapping
            must support ``word in mapping`` and ``mapping[word]``.
        vector_size: Length of the returned vector; assumed to match the
            dimensionality of the stored word vectors.

    Returns:
        A NumPy array of length ``vector_size`` holding the average of the
        vectors of the words found in the vocabulary. If no word of
        *text* is known (including empty text) the zero vector is returned.
    """
    # Accept both call styles seen for this function: a model whose vectors
    # live under ``.wv`` and a plain vector mapping indexed directly.
    vectors = getattr(word2vec, "wv", word2vec)

    text_vector = np.zeros(vector_size)
    word_count = 0
    for word in text.split():
        if word in vectors:
            text_vector += vectors[word]
            word_count += 1

    # Average over matched words only; dividing by the total word count
    # would shrink the vector for every unknown word and would divide by
    # zero for empty text.
    if word_count > 0:
        text_vector /= word_count
    return text_vector
def main(): def main():
train_dataset, dev_0_dataset, test_A_dataset = read_data() train_dataset, dev_0_dataset, test_A_dataset = read_data()
# Word2Vec parameters # Word2Vec parameters
vector_size = 100 vector_size = 300
# Training the Word2Vec model # Training the Word2Vec model
word2vec = KeyedVectors.load("word2vec_100_3_polish.bin") word2vec = KeyedVectors.load("word2vec_300_3_polish.bin")
# Convert text to vectors # Convert text to vectors
train_vectors = np.array( train_vectors = np.array(

File diff suppressed because it is too large Load Diff