61 KiB
61 KiB
import os
import pandas as pd
import gensim
from gensim.models import KeyedVectors
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
from keras.regularizers import l2
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.
Declare paths
# Locations of the challenge data and of the pre-trained embeddings.
# Every path component is passed to os.path.join separately so the resulting
# paths use the separator of the platform the script runs on, instead of
# embedding Windows-only backslashes inside a component.
data_dir_path = 'sport-text-classification-ball-isi-public'

# Training split: a single TSV holding label and text.
train_path = os.path.join(data_dir_path, 'train', 'train.tsv')

# Development split: input texts, gold labels, and where predictions are written.
dev_texts_path = os.path.join(data_dir_path, 'dev-0', 'in.tsv')
dev_labels_path = os.path.join(data_dir_path, 'dev-0', 'expected.tsv')
dev_predicted_path = os.path.join(data_dir_path, 'dev-0', 'out.tsv')

# Test split: input texts and where predictions are written.
test_texts_path = os.path.join(data_dir_path, 'test-A', 'in.tsv')
test_predicted_path = os.path.join(data_dir_path, 'test-A', 'out.tsv')

# Pre-trained word embeddings, looked up relative to the working directory.
word2vec_file_path = 'word2vec_100_3_polish.bin'
Load files
# All inputs are tab-separated files without a header row; column names are
# assigned here and only the listed column positions are read.

# Training data: column 0 is the label, column 1 the text.
train_data = pd.read_csv(
    train_path,
    sep='\t',
    header=None,
    usecols=[0, 1],
    names=['label', 'text'],
)

# Development data: texts and gold labels live in separate files.
dev_texts_data = pd.read_csv(
    dev_texts_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['text'],
)
dev_labels_data = pd.read_csv(
    dev_labels_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['label'],
)

# Test data: texts only.
test_texts_data = pd.read_csv(
    test_texts_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['text'],
)
Load word2vec
# Load the pre-trained word embeddings with gensim's native loader.
# NOTE(review): despite the .bin extension this goes through KeyedVectors.load,
# so the file was presumably written by gensim's own save(); a word2vec
# C-format binary would need KeyedVectors.load_word2vec_format(..., binary=True)
# instead — confirm against how the file was produced.
word2vec = KeyedVectors.load(word2vec_file_path)
Preprocess data
def text_to_word2vec(text):
    """Embed a tokenised text as the mean of its known word vectors.

    Args:
        text: iterable of tokens; tokens missing from the module-level
            ``word2vec`` vocabulary are ignored.

    Returns:
        A list of floats holding the element-wise mean of the vectors of all
        known tokens. When no token is known (or the mean contains NaN), a
        zero ``numpy.ndarray`` of length ``word2vec.vector_size`` is returned
        instead, so every text still maps to a fixed-size vector.
    """
    known_vectors = [word2vec[word] for word in text if word in word2vec]

    # Guard the empty case up front: np.mean([]) would yield NaN and emit a
    # "Mean of empty slice" RuntimeWarning.
    if not known_vectors:
        return np.zeros(word2vec.vector_size)

    text_vector = np.mean(known_vectors, axis=0)

    # Defensive fallback kept from the original: never hand NaNs to the model.
    if np.isnan(text_vector).any():
        return np.zeros(word2vec.vector_size)
    return text_vector.tolist()
def fit_data(column):
    """Turn a pandas column into a NumPy array.

    The column is first converted to a plain Python list with ``tolist()`` and
    that list is then passed to ``np.array``.

    Args:
        column: object exposing ``tolist()`` (here: a pandas Series).

    Returns:
        ``numpy.ndarray`` built from the column's values.
    """
    values = column.tolist()
    return np.array(values)
def fit_data_X(text_column):
    """Build the feature array for a column of raw texts.

    Each text is tokenised with ``gensim.utils.simple_preprocess``, the token
    list is embedded with ``text_to_word2vec``, and the resulting per-text
    vectors are converted to a NumPy array by ``fit_data``.

    Args:
        text_column: pandas Series of raw text strings.

    Returns:
        The array produced by ``fit_data`` from the per-text vectors.
    """
    tokenised = text_column.apply(gensim.utils.simple_preprocess)
    embedded = tokenised.apply(text_to_word2vec)
    return fit_data(embedded)
# Feature arrays: one embedded vector per text, for each of the three splits.
train_X, dev_X, test_X = (
    fit_data_X(frame['text'])
    for frame in (train_data, dev_texts_data, test_texts_data)
)

# Label arrays exist only for the train and dev splits.
train_Y = fit_data(train_data['label'])
dev_Y = fit_data(dev_labels_data['label'])
C:\Users\Pawel\anaconda3\Lib\site-packages\numpy\core\fromnumeric.py:3464: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype,
Create model
# Feed-forward binary classifier: three ReLU hidden layers (128 -> 64 -> 32)
# followed by a single sigmoid output unit. The input width is taken from the
# training feature array; only the 64-unit layer carries an L2 kernel penalty.
model = Sequential([
    Dense(128, input_dim=train_X.shape[1], activation='relu'),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.
Compile model
# Configure training: Adam optimiser, binary cross-entropy loss, and accuracy
# reported as the monitored metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\optimizers\__init__.py:309: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.
Train model
# Train for 20 epochs in mini-batches of 32, evaluating on the dev split after
# every epoch; the returned history is kept for plotting.
history = model.fit(
    x=train_X,
    y=train_Y,
    batch_size=32,
    epochs=20,
    validation_data=(dev_X, dev_Y),
)
Epoch 1/20 WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\utils\tf_utils.py:492: The name tf.ragged.RaggedTensorValue is deprecated. Please use tf.compat.v1.ragged.RaggedTensorValue instead. WARNING:tensorflow:From C:\Users\Pawel\anaconda3\Lib\site-packages\keras\src\engine\base_layer_utils.py:384: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead. 3067/3067 [==============================] - 6s 1ms/step - loss: 0.1954 - accuracy: 0.9327 - val_loss: 0.1472 - val_accuracy: 0.9450 Epoch 2/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.1389 - accuracy: 0.9479 - val_loss: 0.1354 - val_accuracy: 0.9477 Epoch 3/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.1261 - accuracy: 0.9516 - val_loss: 0.1287 - val_accuracy: 0.9490 Epoch 4/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.1183 - accuracy: 0.9554 - val_loss: 0.1324 - val_accuracy: 0.9463 Epoch 5/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.1125 - accuracy: 0.9572 - val_loss: 0.1196 - val_accuracy: 0.9507 Epoch 6/20 3067/3067 [==============================] - 5s 1ms/step - loss: 0.1068 - accuracy: 0.9592 - val_loss: 0.1153 - val_accuracy: 0.9525 Epoch 7/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.1030 - accuracy: 0.9607 - val_loss: 0.1234 - val_accuracy: 0.9521 Epoch 8/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.1001 - accuracy: 0.9619 - val_loss: 0.1319 - val_accuracy: 0.9459 Epoch 9/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0973 - accuracy: 0.9630 - val_loss: 0.1251 - val_accuracy: 0.9507 Epoch 10/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0943 - accuracy: 0.9646 - val_loss: 0.1276 - val_accuracy: 0.9521 Epoch 11/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0913 - accuracy: 0.9662 - 
val_loss: 0.1151 - val_accuracy: 0.9543 Epoch 12/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0893 - accuracy: 0.9669 - val_loss: 0.1176 - val_accuracy: 0.9536 Epoch 13/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0867 - accuracy: 0.9671 - val_loss: 0.1252 - val_accuracy: 0.9505 Epoch 14/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0852 - accuracy: 0.9685 - val_loss: 0.1309 - val_accuracy: 0.9514 Epoch 15/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0828 - accuracy: 0.9692 - val_loss: 0.1208 - val_accuracy: 0.9543 Epoch 16/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0808 - accuracy: 0.9696 - val_loss: 0.1311 - val_accuracy: 0.9503 Epoch 17/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0794 - accuracy: 0.9703 - val_loss: 0.1301 - val_accuracy: 0.9519 Epoch 18/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0778 - accuracy: 0.9710 - val_loss: 0.1237 - val_accuracy: 0.9518 Epoch 19/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0751 - accuracy: 0.9720 - val_loss: 0.1280 - val_accuracy: 0.9507 Epoch 20/20 3067/3067 [==============================] - 4s 1ms/step - loss: 0.0744 - accuracy: 0.9722 - val_loss: 0.1239 - val_accuracy: 0.9510
# Draw the per-epoch accuracy curves recorded during training, one line for
# the training data and one for the validation data.
for metric_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)

plt.title('Model Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
Predict and save results
def predict_and_save(X, filename, threshold=None):
    """Run the module-level ``model`` on ``X`` and write the result as TSV.

    Args:
        X: feature array passed to ``model.predict``.
        filename: destination path; written tab-separated, without index and
            without a header row.
        threshold: optional cut-off. When ``None`` (the default) the raw model
            outputs are written unchanged, as before. When given, outputs
            strictly above the threshold are written as ``1`` and all others
            as ``0``.
            NOTE(review): the column is named ``predicted_label`` but by
            default holds sigmoid outputs, not 0/1 labels — confirm which
            form the consumer of the output file expects.

    Returns:
        The ``pandas.DataFrame`` that was written, so callers can keep the
        predictions (the function previously returned ``None``).
    """
    Y_predicted = model.predict(X)
    if threshold is not None:
        Y_predicted = (Y_predicted > threshold).astype(int)

    Y_predicted_df = pd.DataFrame(Y_predicted, columns=['predicted_label'])
    # header=False is the documented way to suppress the header row.
    Y_predicted_df.to_csv(filename, sep='\t', index=False, header=False)
    return Y_predicted_df
# Predict on the dev split first, then on the test split, writing each result
# to that split's output file.
dev_predicted, test_predicted = (
    predict_and_save(dev_X, dev_predicted_path),
    predict_and_save(test_X, test_predicted_path),
)
171/171 [==============================] - 0s 906us/step 171/171 [==============================] - 0s 900us/step