Uczenie_Glebokie/2. Word2Vec/word2vec.ipynb
2024-05-20 00:43:18 +02:00

61 KiB

import os
import pandas as pd
import gensim
from gensim.models import KeyedVectors
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
from keras.regularizers import l2
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.

Declare paths

# Dataset locations. Each path component is passed to os.path.join separately
# so the platform's own separator is used instead of hard-coded Windows
# backslashes (which do not resolve on POSIX systems).
data_dir_path = 'sport-text-classification-ball-isi-public'
train_path = os.path.join(data_dir_path, 'train', 'train.tsv')
dev_texts_path = os.path.join(data_dir_path, 'dev-0', 'in.tsv')
dev_labels_path = os.path.join(data_dir_path, 'dev-0', 'expected.tsv')
dev_predicted_path = os.path.join(data_dir_path, 'dev-0', 'out.tsv')
test_texts_path = os.path.join(data_dir_path, 'test-A', 'in.tsv')
test_predicted_path = os.path.join(data_dir_path, 'test-A', 'out.tsv')
# Word-vector file handed to KeyedVectors.load.
word2vec_file_path = 'word2vec_100_3_polish.bin'

Load files

# All inputs are read as header-less, tab-separated files. The training file
# supplies the label in column 0 and the text in column 1; the remaining
# files contribute a single column each.
train_data = pd.read_csv(
    train_path,
    sep='\t',
    header=None,
    usecols=[0, 1],
    names=['label', 'text'],
)
dev_texts_data = pd.read_csv(
    dev_texts_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['text'],
)
dev_labels_data = pd.read_csv(
    dev_labels_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['label'],
)
test_texts_data = pd.read_csv(
    test_texts_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['text'],
)

Load word2vec

# Load the word vectors used to embed the texts.
# NOTE(review): KeyedVectors.load presumably expects a file written by
# gensim's own save(); confirm the .bin file is in that format.
word2vec = KeyedVectors.load(word2vec_file_path)

Preprocess data

def text_to_word2vec(text):
    """Return the mean word2vec vector of the tokens in *text*.

    Tokens that are not present in the global ``word2vec`` vocabulary are
    skipped.

    Args:
        text: Iterable of tokens (in this file, the list produced by
            ``gensim.utils.simple_preprocess``).

    Returns:
        A list with the element-wise mean of the known tokens' vectors, or a
        zero ``numpy`` array of length ``word2vec.vector_size`` when no token
        is known or the mean contains NaN.
    """
    known_vectors = [word2vec[word] for word in text if word in word2vec]
    # Guard the empty case explicitly: np.mean([]) emits
    # "RuntimeWarning: Mean of empty slice" and evaluates to NaN.
    if not known_vectors:
        return np.zeros(word2vec.vector_size)
    text_vector = np.mean(known_vectors, axis=0)
    if np.isnan(text_vector).any():
        return np.zeros(word2vec.vector_size)
    return text_vector.tolist()

def fit_data(column):
    """Convert a column into a NumPy array.

    Args:
        column: Object exposing ``tolist()`` (a pandas Series in this file).

    Returns:
        ``numpy.ndarray`` built from the column's values.
    """
    values = column.tolist()
    return np.array(values)

def fit_data_X(text_column):
    """Turn a column of raw texts into an array of averaged word vectors.

    Each text is tokenized with ``gensim.utils.simple_preprocess``, reduced
    to one vector by ``text_to_word2vec``, and the per-text vectors are then
    stacked by ``fit_data``.

    Args:
        text_column: pandas Series of raw text strings.

    Returns:
        The array produced by ``fit_data`` from the per-text vectors.
    """
    tokenized = text_column.apply(gensim.utils.simple_preprocess)
    embedded = tokenized.apply(text_to_word2vec)
    return fit_data(embedded)

# Build features (averaged word vectors) and labels for each split.
train_X, train_Y = fit_data_X(train_data['text']), fit_data(train_data['label'])
dev_X, dev_Y = fit_data_X(dev_texts_data['text']), fit_data(dev_labels_data['label'])
# Only texts are loaded for the test split, so only features are built.
test_X = fit_data_X(test_texts_data['text'])
C:\Users\Pawel\anaconda3\Lib\site-packages\numpy\core\fromnumeric.py:3464: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,

Create model

# Fully connected classifier: three ReLU hidden layers (the middle one with
# L2 kernel regularization) followed by a single sigmoid output unit.
model = Sequential([
    Dense(128, input_dim=train_X.shape[1], activation='relu'),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

Compile model

# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the monitoring metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\optimizers\__init__.py:309: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

Train model

# Train for 20 epochs in mini-batches of 32, evaluating on the dev split
# after every epoch; the returned History object is kept for plotting.
history = model.fit(
    train_X,
    train_Y,
    validation_data=(dev_X, dev_Y),
    batch_size=32,
    epochs=20,
)
Epoch 1/20
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\utils\tf_utils.py:492: The name tf.ragged.RaggedTensorValue is deprecated. Please use tf.compat.v1.ragged.RaggedTensorValue instead.

WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\engine\base_layer_utils.py:384: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead.

3067/3067 [==============================] - 6s 1ms/step - loss: 0.1954 - accuracy: 0.9327 - val_loss: 0.1472 - val_accuracy: 0.9450
Epoch 2/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.1389 - accuracy: 0.9479 - val_loss: 0.1354 - val_accuracy: 0.9477
Epoch 3/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.1261 - accuracy: 0.9516 - val_loss: 0.1287 - val_accuracy: 0.9490
Epoch 4/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.1183 - accuracy: 0.9554 - val_loss: 0.1324 - val_accuracy: 0.9463
Epoch 5/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.1125 - accuracy: 0.9572 - val_loss: 0.1196 - val_accuracy: 0.9507
Epoch 6/20
3067/3067 [==============================] - 5s 1ms/step - loss: 0.1068 - accuracy: 0.9592 - val_loss: 0.1153 - val_accuracy: 0.9525
Epoch 7/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.1030 - accuracy: 0.9607 - val_loss: 0.1234 - val_accuracy: 0.9521
Epoch 8/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.1001 - accuracy: 0.9619 - val_loss: 0.1319 - val_accuracy: 0.9459
Epoch 9/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0973 - accuracy: 0.9630 - val_loss: 0.1251 - val_accuracy: 0.9507
Epoch 10/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0943 - accuracy: 0.9646 - val_loss: 0.1276 - val_accuracy: 0.9521
Epoch 11/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0913 - accuracy: 0.9662 - val_loss: 0.1151 - val_accuracy: 0.9543
Epoch 12/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0893 - accuracy: 0.9669 - val_loss: 0.1176 - val_accuracy: 0.9536
Epoch 13/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0867 - accuracy: 0.9671 - val_loss: 0.1252 - val_accuracy: 0.9505
Epoch 14/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0852 - accuracy: 0.9685 - val_loss: 0.1309 - val_accuracy: 0.9514
Epoch 15/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0828 - accuracy: 0.9692 - val_loss: 0.1208 - val_accuracy: 0.9543
Epoch 16/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0808 - accuracy: 0.9696 - val_loss: 0.1311 - val_accuracy: 0.9503
Epoch 17/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0794 - accuracy: 0.9703 - val_loss: 0.1301 - val_accuracy: 0.9519
Epoch 18/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0778 - accuracy: 0.9710 - val_loss: 0.1237 - val_accuracy: 0.9518
Epoch 19/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0751 - accuracy: 0.9720 - val_loss: 0.1280 - val_accuracy: 0.9507
Epoch 20/20
3067/3067 [==============================] - 4s 1ms/step - loss: 0.0744 - accuracy: 0.9722 - val_loss: 0.1239 - val_accuracy: 0.9510
# Plot training and validation accuracy per epoch on a single figure.
for _metric_key, _legend_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[_metric_key], label=_legend_label)
plt.title('Model Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

Predict and save results

def predict_and_save(X, filename, threshold=None):
    """Run the global ``model`` on *X*, write the predictions, and return them.

    Args:
        X: Feature matrix accepted by ``model.predict``.
        filename: Destination path of the tab-separated output file (one
            prediction per row, no header row, no index column).
        threshold: Optional cut-off. With the default ``None`` the raw model
            outputs are written unchanged. Otherwise outputs greater than or
            equal to ``threshold`` are written as ``1`` and the rest as ``0``.

    Returns:
        The array of predictions that was written to *filename*.
    """
    Y_predicted = model.predict(X)
    if threshold is not None:
        # NOTE(review): the expected output format is not visible here; pass
        # e.g. threshold=0.5 if the evaluation needs hard 0/1 labels.
        Y_predicted = (Y_predicted >= threshold).astype(int)
    Y_predicted_df = pd.DataFrame(Y_predicted, columns=['predicted_label'])
    # header=False is the documented way to omit the header row.
    Y_predicted_df.to_csv(filename, sep='\t', index=False, header=False)
    # Callers assign the result, so hand back the predictions instead of None.
    return Y_predicted
# Write predictions for the dev and test splits to their output files.
dev_predicted = predict_and_save(X=dev_X, filename=dev_predicted_path)
test_predicted = predict_and_save(X=test_X, filename=test_predicted_path)
171/171 [==============================] - 0s 906us/step
171/171 [==============================] - 0s 900us/step