Uczenie_Glebokie/2. Word2Vec/word2vec.ipynb
2024-05-20 03:14:08 +02:00

58 KiB

import os
import pandas as pd
import gensim
from gensim.models import KeyedVectors
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
from keras.regularizers import l2

Declare paths

# Locations of the dataset splits and of the pre-trained embedding file.
# Each path component is passed to os.path.join separately so the separator
# of the running platform is used instead of hard-coded Windows backslashes.
data_dir_path = 'sport-text-classification-ball-isi-public'
train_path = os.path.join(data_dir_path, 'train', 'train.tsv')
dev_texts_path = os.path.join(data_dir_path, 'dev-0', 'in.tsv')
dev_labels_path = os.path.join(data_dir_path, 'dev-0', 'expected.tsv')
dev_predicted_path = os.path.join(data_dir_path, 'dev-0', 'out.tsv')
test_texts_path = os.path.join(data_dir_path, 'test-A', 'in.tsv')
test_predicted_path = os.path.join(data_dir_path, 'test-A', 'out.tsv')
word2vec_file_path = 'word2vec_100_3_polish.bin'

Load files

def _read_tsv(path, names):
    """Read the first ``len(names)`` columns of a headerless TSV file."""
    leading_columns = list(range(len(names)))
    return pd.read_csv(path, sep='\t', usecols=leading_columns, header=None, names=names)


# The training file holds label and text; the other files hold a single column.
train_data = _read_tsv(train_path, ['label', 'text'])
dev_texts_data = _read_tsv(dev_texts_path, ['text'])
dev_labels_data = _read_tsv(dev_labels_path, ['label'])
test_texts_data = _read_tsv(test_texts_path, ['text'])

Load word2vec

# Load the pre-trained word vectors used by text_to_word2vec below.
# NOTE(review): the file has a .bin extension but is read with
# KeyedVectors.load rather than load_word2vec_format — presumably it was
# saved in gensim's native format; confirm, and only load files from a
# trusted source.
word2vec = KeyedVectors.load(word2vec_file_path)

Preprocess data

def text_to_word2vec(text):
    """Return the mean embedding of the in-vocabulary tokens of *text*.

    Args:
        text: Iterable of tokens; tokens missing from the module-level
            ``word2vec`` vectors are ignored.

    Returns:
        A list of ``word2vec.vector_size`` floats: the element-wise mean of
        the vectors of the known tokens, or all zeros when no token is known
        or the mean contains NaN.
    """
    known_vectors = [word2vec[word] for word in text if word in word2vec]
    # Averaging an empty list makes NumPy emit "Mean of empty slice" and
    # return NaN, so the no-known-token case is handled explicitly.
    if not known_vectors:
        return [0.0] * word2vec.vector_size
    text_vector = np.mean(known_vectors, axis=0)
    if np.isnan(text_vector).any():
        return [0.0] * word2vec.vector_size
    return text_vector.tolist()

def fit_data(column):
    """Build a NumPy array from the values returned by ``column.tolist()``."""
    values = column.tolist()
    return np.array(values)

def fit_data_X(text_column):
    """Turn a column of raw texts into an array of averaged word vectors.

    Each text is tokenised with ``gensim.utils.simple_preprocess``, the
    tokens are reduced to one vector by ``text_to_word2vec``, and the
    per-text vectors are stacked by ``fit_data``.
    """
    tokenised = text_column.apply(gensim.utils.simple_preprocess)
    embedded = tokenised.apply(text_to_word2vec)
    return fit_data(embedded)

# Vectorise the text column of every split.
train_X, dev_X, test_X = (
    fit_data_X(split['text'])
    for split in (train_data, dev_texts_data, test_texts_data)
)
# Labels exist only for the train and dev splits.
train_Y, dev_Y = (
    fit_data(labelled['label'])
    for labelled in (train_data, dev_labels_data)
)
C:\Users\Pawel\anaconda3\Lib\site-packages\numpy\core\fromnumeric.py:3464: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,

Create model

# Feed-forward network: three ReLU hidden layers (128, 64, 32 units) followed
# by a single sigmoid output unit. Only the 64-unit layer carries an L2
# kernel penalty. The input width is the number of columns in train_X.
model = Sequential([
    Dense(128, input_dim=train_X.shape[1], activation='relu'),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

Compile model

# Configure training: Adam optimiser, binary cross-entropy loss, and accuracy
# as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\optimizers\__init__.py:309: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

Train model

# Train for 20 epochs with batches of 32, using the dev split as validation
# data; the returned history is plotted afterwards.
# NOTE(review): in the recorded run val_loss is lowest at epoch 9 (0.1092)
# and ends at 0.1295 while the training loss keeps falling — consider early
# stopping or fewer epochs.
history = model.fit(train_X, train_Y, epochs=20, batch_size=32, validation_data=(dev_X, dev_Y))
Epoch 1/20
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\utils\tf_utils.py:492: The name tf.ragged.RaggedTensorValue is deprecated. Please use tf.compat.v1.ragged.RaggedTensorValue instead.

WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\engine\base_layer_utils.py:384: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead.

3067/3067 [==============================] - 17s 4ms/step - loss: 0.1955 - accuracy: 0.9319 - val_loss: 0.1569 - val_accuracy: 0.9404
Epoch 2/20
3067/3067 [==============================] - 11s 4ms/step - loss: 0.1393 - accuracy: 0.9471 - val_loss: 0.1337 - val_accuracy: 0.9450
Epoch 3/20
3067/3067 [==============================] - 10s 3ms/step - loss: 0.1264 - accuracy: 0.9523 - val_loss: 0.1410 - val_accuracy: 0.9426
Epoch 4/20
3067/3067 [==============================] - 15s 5ms/step - loss: 0.1189 - accuracy: 0.9543 - val_loss: 0.1231 - val_accuracy: 0.9516
Epoch 5/20
3067/3067 [==============================] - 16s 5ms/step - loss: 0.1133 - accuracy: 0.9570 - val_loss: 0.1206 - val_accuracy: 0.9490
Epoch 6/20
3067/3067 [==============================] - 17s 6ms/step - loss: 0.1076 - accuracy: 0.9588 - val_loss: 0.1220 - val_accuracy: 0.9481
Epoch 7/20
3067/3067 [==============================] - 11s 4ms/step - loss: 0.1039 - accuracy: 0.9605 - val_loss: 0.1125 - val_accuracy: 0.9541
Epoch 8/20
3067/3067 [==============================] - 18s 6ms/step - loss: 0.0997 - accuracy: 0.9620 - val_loss: 0.1123 - val_accuracy: 0.9536
Epoch 9/20
3067/3067 [==============================] - 17s 6ms/step - loss: 0.0964 - accuracy: 0.9639 - val_loss: 0.1092 - val_accuracy: 0.9547
Epoch 10/20
3067/3067 [==============================] - 11s 4ms/step - loss: 0.0936 - accuracy: 0.9645 - val_loss: 0.1120 - val_accuracy: 0.9567
Epoch 11/20
3067/3067 [==============================] - 17s 5ms/step - loss: 0.0906 - accuracy: 0.9656 - val_loss: 0.1170 - val_accuracy: 0.9527
Epoch 12/20
3067/3067 [==============================] - 12s 4ms/step - loss: 0.0882 - accuracy: 0.9670 - val_loss: 0.1171 - val_accuracy: 0.9549
Epoch 13/20
3067/3067 [==============================] - 18s 6ms/step - loss: 0.0854 - accuracy: 0.9681 - val_loss: 0.1120 - val_accuracy: 0.9567
Epoch 14/20
3067/3067 [==============================] - 13s 4ms/step - loss: 0.0830 - accuracy: 0.9688 - val_loss: 0.1171 - val_accuracy: 0.9562
Epoch 15/20
3067/3067 [==============================] - 12s 4ms/step - loss: 0.0810 - accuracy: 0.9695 - val_loss: 0.1226 - val_accuracy: 0.9510
Epoch 16/20
3067/3067 [==============================] - 16s 5ms/step - loss: 0.0791 - accuracy: 0.9704 - val_loss: 0.1167 - val_accuracy: 0.9567
Epoch 17/20
3067/3067 [==============================] - 14s 4ms/step - loss: 0.0776 - accuracy: 0.9709 - val_loss: 0.1264 - val_accuracy: 0.9532
Epoch 18/20
3067/3067 [==============================] - 14s 4ms/step - loss: 0.0755 - accuracy: 0.9714 - val_loss: 0.1191 - val_accuracy: 0.9519
Epoch 19/20
3067/3067 [==============================] - 13s 4ms/step - loss: 0.0742 - accuracy: 0.9722 - val_loss: 0.1190 - val_accuracy: 0.9545
Epoch 20/20
3067/3067 [==============================] - 13s 4ms/step - loss: 0.0725 - accuracy: 0.9732 - val_loss: 0.1295 - val_accuracy: 0.9552
# Draw the training and validation accuracy curves on the same axes.
curves = (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
)
for metric_key, curve_label in curves:
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Model Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

Predict and save results

def predict_and_save(X, filename):
    """Predict labels for *X*, write them to *filename*, and return them.

    Args:
        X: Feature matrix passed to the module-level ``model.predict``.
        filename: Destination path; the labels are written tab-separated,
            without index and without a header row.

    Returns:
        The integer array of rounded predictions that was written to the
        file, so callers binding the result get the labels.
    """
    probabilities = model.predict(X)
    # np.round rounds halves to the nearest even value, so an output of
    # exactly 0.5 becomes label 0.
    Y_predicted = np.round(probabilities, 0).astype(int)
    Y_predicted_df = pd.DataFrame(Y_predicted, columns=['predicted_label'])
    # header=False is the documented way to suppress the header row.
    Y_predicted_df.to_csv(filename, sep='\t', index=False, header=False)
    return Y_predicted
# Write label predictions for the dev-0 and test-A inputs to their out.tsv
# files.
dev_predicted = predict_and_save(dev_X, dev_predicted_path)
test_predicted = predict_and_save(test_X, test_predicted_path)
171/171 [==============================] - 0s 3ms/step
171/171 [==============================] - 1s 3ms/step