# UMA-projekt/train_lstm.py

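"""Train and evaluate a bidirectional LSTM sentiment classifier (MODEL 2 in this project)."""
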
import os
import numpy as np
import tensorflow as tf
from keras import Sequential
from keras.layers import Dropout, Embedding, Bidirectional, LSTM, Dense
from keras.models import load_model
from keras.utils import pad_sequences
from keras_preprocessing.text import Tokenizer
from sklearn.metrics import classification_report
from stopwords_filter import filter_stopwords
from termcolor import colored


def lstm(df_train, df_test):
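    """Train (or reload) a bidirectional LSTM sentiment classifier and evaluate it on the test set.

    Both dataframes are expected to contain an 'Input' text column and an
    integer 'Sentiment' label column (6 classes). Returns the sklearn
    classification report as a dict.
    """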
print("Number of GPUs available: ", len(tf.config.list_physical_devices('GPU')))
    # Filter out stopwords
    df_train = filter_stopwords(df_train)
    df_test = filter_stopwords(df_test)
    # Separate the inputs and labels of the training and test sets
    x_train = df_train['Input']
    x_test = df_test['Input']
    y_train = df_train['Sentiment']
    y_test = df_test['Sentiment']
    # Set the model hyperparameters; the sequence length is the mean input
    # length (the commented-out line below would use the maximum instead)
    vocab_size = 10000
    embedding_dim = 128
    # max_length = max([len(text) for text in (df_train['Input'].append(df_test['Input']))])
    all_inputs = df_train['Input'].append(df_test['Input'])
    max_length = int(sum(len(text) for text in all_inputs) / len(all_inputs))
    epochs = 5
    # Tokenise the texts into integer sequences and pad/truncate them to max_length
    tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
    tokenizer.fit_on_texts(x_train)
    x_train_sequences = tokenizer.texts_to_sequences(x_train)
    x_train_padded = pad_sequences(x_train_sequences, maxlen=max_length, padding='post', truncating='post')
    x_test_sequences = tokenizer.texts_to_sequences(x_test)
    x_test_padded = pad_sequences(x_test_sequences, maxlen=max_length, padding='post', truncating='post')
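    # Both padded arrays now have shape (num_samples, max_length); the actual
    # word embeddings are learned by the Embedding layer below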
    # Create the model: reload a previously saved model if one exists,
    # otherwise define the architecture, compile, train and save it to disk
    if os.path.isdir('lstm_model'):
        model = load_model('lstm_model')
        model.summary()
    else:
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim))
        model.add(Dropout(0.5))
        model.add(Bidirectional(LSTM(embedding_dim)))
        model.add(Dense(6, activation='softmax'))  # 6 sentiment classes
        model.summary()
        model.compile(
            # sparse_categorical_crossentropy expects integer class labels
            loss='sparse_categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'],
        )
        model.fit(x_train_padded, y_train, epochs=epochs)
        model.save('lstm_model')
    # Predict on the test set and take the most probable class for each sample
    y_pred = model.predict(x_test_padded)
    y_pred_cat = np.argmax(y_pred, axis=1)
    # Evaluate the model; zero_division=1 suppresses warnings for classes
    # that receive no predictions
    results_text = classification_report(y_test, y_pred_cat, zero_division=1)
    results_dict = classification_report(y_test, y_pred_cat, zero_division=1, output_dict=True)
    print(colored('---------- MODEL 2: LSTM ----------', 'blue'))
    print(colored(results_text, 'blue'))
    return results_dict
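

# A minimal usage sketch (assumption: the CSV file names below are hypothetical;
# any dataframes with an 'Input' text column and integer 'Sentiment' labels work):
#
#   import pandas as pd
#   results = lstm(pd.read_csv('train.csv'), pd.read_csv('test.csv'))
#   print(results['accuracy'])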