58 KiB
58 KiB
import os
import pandas as pd
import gensim
from gensim.models import KeyedVectors
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
from keras.regularizers import l2
Declare paths
# Root directory of the data set; each split (train, dev-0, test-A) is read
# from / written to its own sub-directory below it.
data_dir_path = 'sport-text-classification-ball-isi-public'

# Every path component is handed to os.path.join separately instead of
# embedding Windows-only backslashes in the literal, so the paths resolve on
# any operating system.
train_path = os.path.join(data_dir_path, 'train', 'train.tsv')
dev_texts_path = os.path.join(data_dir_path, 'dev-0', 'in.tsv')
dev_labels_path = os.path.join(data_dir_path, 'dev-0', 'expected.tsv')
dev_predicted_path = os.path.join(data_dir_path, 'dev-0', 'out.tsv')
test_texts_path = os.path.join(data_dir_path, 'test-A', 'in.tsv')
test_predicted_path = os.path.join(data_dir_path, 'test-A', 'out.tsv')

# Word-embedding file, resolved relative to the current working directory.
word2vec_file_path = 'word2vec_100_3_polish.bin'
Load files
# All inputs are header-less, tab-separated files, so column names are
# supplied explicitly and only the needed leading columns are read.

# Training split: column 0 is read as the label, column 1 as the text.
train_data = pd.read_csv(
    train_path,
    sep='\t',
    header=None,
    usecols=[0, 1],
    names=['label', 'text'],
)

# Development split: texts and expected labels come from separate files.
dev_texts_data = pd.read_csv(
    dev_texts_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['text'],
)
dev_labels_data = pd.read_csv(
    dev_labels_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['label'],
)

# Test split: only the texts are loaded here.
test_texts_data = pd.read_csv(
    test_texts_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['text'],
)
Load word2vec
# Load the word vectors from word2vec_file_path with gensim's
# KeyedVectors.load; the resulting object is used as a token -> vector lookup
# by text_to_word2vec.
# NOTE(review): KeyedVectors.load reads files written by gensim's own save();
# the ".bin" extension suggests the file might instead be in the original
# word2vec binary format, which would need
# KeyedVectors.load_word2vec_format(path, binary=True) - confirm the format.
word2vec = KeyedVectors.load(word2vec_file_path)
Preprocess data
def text_to_word2vec(text, vectors=None):
    """Return the mean embedding of the known tokens in ``text``.

    Args:
        text: Iterable of tokens (for example the output of
            ``gensim.utils.simple_preprocess``).
        vectors: Embedding lookup supporting ``token in vectors``,
            ``vectors[token]`` and a ``vector_size`` attribute. Defaults to
            the module-level ``word2vec`` model.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``vectors.vector_size``: the
        element-wise mean of the vectors of every token found in
        ``vectors``, or all zeros when no token is known (which includes
        empty input) or when the mean contains NaN.
    """
    if vectors is None:
        vectors = word2vec

    known = [vectors[token] for token in text if token in vectors]
    if not known:
        # Averaging an empty list makes NumPy emit "Mean of empty slice" and
        # yield NaN, so texts without any known token are handled up front.
        return np.zeros(vectors.vector_size)

    mean_vector = np.mean(known, axis=0)
    if np.isnan(mean_vector).any():
        # A NaN inside a stored vector would poison the features; fall back
        # to the same neutral all-zero vector.
        return np.zeros(vectors.vector_size)
    return mean_vector
def fit_data(column):
    """Convert a pandas column into a NumPy array.

    Args:
        column: Object exposing ``tolist()`` (a pandas Series in this file).

    Returns:
        ``numpy.ndarray`` built from the column's values; a column whose
        entries are equal-length sequences becomes a 2-D array.
    """
    values = column.tolist()
    return np.array(values)
def fit_data_X(text_column):
    """Turn a column of raw texts into a feature matrix.

    Each text is tokenised with ``gensim.utils.simple_preprocess``, the
    tokens are reduced to one vector by ``text_to_word2vec``, and the
    per-text vectors are stacked by ``fit_data``.

    Args:
        text_column: pandas Series of strings.

    Returns:
        The array produced by ``fit_data`` from the per-text vectors.
    """
    tokenised = text_column.apply(gensim.utils.simple_preprocess)
    embedded = tokenised.apply(text_to_word2vec)
    return fit_data(embedded)
# Feature matrices: one embedded vector per text, for each split.
train_X, dev_X, test_X = (
    fit_data_X(frame['text'])
    for frame in (train_data, dev_texts_data, test_texts_data)
)

# Label arrays for the two splits that have labels loaded.
train_Y, dev_Y = (
    fit_data(frame['label'])
    for frame in (train_data, dev_labels_data)
)
C:\Users\Pawel\anaconda3\Lib\site-packages\numpy\core\fromnumeric.py:3464: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype,
Create model
# Fully connected network: three ReLU hidden layers (128 -> 64 -> 32 units)
# followed by a single sigmoid output unit. The input width is taken from the
# number of columns in train_X; only the 64-unit layer carries an L2 kernel
# penalty (0.001).
model = Sequential([
    Dense(128, input_dim=train_X.shape[1], activation='relu'),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.
Compile model
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# alongside the loss during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\optimizers\__init__.py:309: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.
Train model
# Train for 20 epochs in mini-batches of 32, evaluating on the dev split after
# every epoch; the returned history holds the per-epoch metrics.
history = model.fit(
    train_X,
    train_Y,
    validation_data=(dev_X, dev_Y),
    batch_size=32,
    epochs=20,
)
Epoch 1/20 WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\utils\tf_utils.py:492: The name tf.ragged.RaggedTensorValue is deprecated. Please use tf.compat.v1.ragged.RaggedTensorValue instead. WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\engine\base_layer_utils.py:384: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead. 3067/3067 [==============================] - 17s 4ms/step - loss: 0.1955 - accuracy: 0.9319 - val_loss: 0.1569 - val_accuracy: 0.9404 Epoch 2/20 3067/3067 [==============================] - 11s 4ms/step - loss: 0.1393 - accuracy: 0.9471 - val_loss: 0.1337 - val_accuracy: 0.9450 Epoch 3/20 3067/3067 [==============================] - 10s 3ms/step - loss: 0.1264 - accuracy: 0.9523 - val_loss: 0.1410 - val_accuracy: 0.9426 Epoch 4/20 3067/3067 [==============================] - 15s 5ms/step - loss: 0.1189 - accuracy: 0.9543 - val_loss: 0.1231 - val_accuracy: 0.9516 Epoch 5/20 3067/3067 [==============================] - 16s 5ms/step - loss: 0.1133 - accuracy: 0.9570 - val_loss: 0.1206 - val_accuracy: 0.9490 Epoch 6/20 3067/3067 [==============================] - 17s 6ms/step - loss: 0.1076 - accuracy: 0.9588 - val_loss: 0.1220 - val_accuracy: 0.9481 Epoch 7/20 3067/3067 [==============================] - 11s 4ms/step - loss: 0.1039 - accuracy: 0.9605 - val_loss: 0.1125 - val_accuracy: 0.9541 Epoch 8/20 3067/3067 [==============================] - 18s 6ms/step - loss: 0.0997 - accuracy: 0.9620 - val_loss: 0.1123 - val_accuracy: 0.9536 Epoch 9/20 3067/3067 [==============================] - 17s 6ms/step - loss: 0.0964 - accuracy: 0.9639 - val_loss: 0.1092 - val_accuracy: 0.9547 Epoch 10/20 3067/3067 [==============================] - 11s 4ms/step - loss: 0.0936 - accuracy: 0.9645 - val_loss: 0.1120 - val_accuracy: 0.9567 Epoch 11/20 3067/3067 [==============================] - 17s 5ms/step - loss: 0.0906 - accuracy: 
0.9656 - val_loss: 0.1170 - val_accuracy: 0.9527 Epoch 12/20 3067/3067 [==============================] - 12s 4ms/step - loss: 0.0882 - accuracy: 0.9670 - val_loss: 0.1171 - val_accuracy: 0.9549 Epoch 13/20 3067/3067 [==============================] - 18s 6ms/step - loss: 0.0854 - accuracy: 0.9681 - val_loss: 0.1120 - val_accuracy: 0.9567 Epoch 14/20 3067/3067 [==============================] - 13s 4ms/step - loss: 0.0830 - accuracy: 0.9688 - val_loss: 0.1171 - val_accuracy: 0.9562 Epoch 15/20 3067/3067 [==============================] - 12s 4ms/step - loss: 0.0810 - accuracy: 0.9695 - val_loss: 0.1226 - val_accuracy: 0.9510 Epoch 16/20 3067/3067 [==============================] - 16s 5ms/step - loss: 0.0791 - accuracy: 0.9704 - val_loss: 0.1167 - val_accuracy: 0.9567 Epoch 17/20 3067/3067 [==============================] - 14s 4ms/step - loss: 0.0776 - accuracy: 0.9709 - val_loss: 0.1264 - val_accuracy: 0.9532 Epoch 18/20 3067/3067 [==============================] - 14s 4ms/step - loss: 0.0755 - accuracy: 0.9714 - val_loss: 0.1191 - val_accuracy: 0.9519 Epoch 19/20 3067/3067 [==============================] - 13s 4ms/step - loss: 0.0742 - accuracy: 0.9722 - val_loss: 0.1190 - val_accuracy: 0.9545 Epoch 20/20 3067/3067 [==============================] - 13s 4ms/step - loss: 0.0725 - accuracy: 0.9732 - val_loss: 0.1295 - val_accuracy: 0.9552
# Draw the training and validation accuracy curves recorded in history on a
# single set of axes, one line per metric.
for curve_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[curve_key], label=curve_label)

plt.title('Model Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
Predict and save results
def predict_and_save(X, filename, classifier=None):
    """Predict binary labels for ``X``, write them to ``filename``, return them.

    Args:
        X: Feature matrix passed to ``classifier.predict``.
        filename: Destination path; the labels are written one per line as a
            tab-separated file without index or header.
        classifier: Object with a ``predict(X)`` method returning scores.
            Defaults to the module-level ``model``.

    Returns:
        Integer ``numpy.ndarray`` of the rounded predictions, in the shape
        returned by ``classifier.predict``. The function previously returned
        nothing, so callers binding its result received ``None``.
    """
    if classifier is None:
        classifier = model

    scores = classifier.predict(X)
    # Rounding to zero decimals turns the scores into hard 0/1 labels.
    labels = np.round(scores, 0).astype(int)

    labels_df = pd.DataFrame(labels, columns=['predicted_label'])
    # header=False is the documented way to omit the header row.
    labels_df.to_csv(filename, sep='\t', index=False, header=False)
    return labels
# Run the trained model on the dev-0 and test-A feature matrices;
# predict_and_save writes each set of predictions to the matching out.tsv path.
dev_predicted = predict_and_save(dev_X, dev_predicted_path)
test_predicted = predict_and_save(test_X, test_predicted_path)
171/171 [==============================] - 0s 3ms/step 171/171 [==============================] - 1s 3ms/step