import os

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Sequential
from keras.layers import Dropout, Embedding, Bidirectional, LSTM, Dense
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.metrics import classification_report
from termcolor import colored

from stopwords_filter import filter_stopwords


def lstm(df_train, df_test):
    print("Number of GPUs available: ", len(tf.config.list_physical_devices('GPU')))

    # Filter out stopwords
    df_train = filter_stopwords(df_train)
    df_test = filter_stopwords(df_test)

    # Separate the inputs and the sentiment labels of the training and test sets
    x_train = df_train['Input']
    x_test = df_test['Input']
    y_train = df_train['Sentiment']
    y_test = df_test['Sentiment']

    # Set the model hyperparameters; max_length is the mean text length
    # over the combined training and test inputs
    all_texts = pd.concat([df_train['Input'], df_test['Input']])
    vocab_size = 10000
    embedding_dim = 128
    # max_length = max(len(text) for text in all_texts)
    max_length = int(sum(len(text) for text in all_texts) / len(all_texts))
    epochs = 5

    # Convert the texts to integer sequences and pad them to a common length
    # (the Embedding layer turns these indices into word embeddings)
    tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
    tokenizer.fit_on_texts(x_train)
    x_train_sequences = tokenizer.texts_to_sequences(x_train)
    x_train_padded = pad_sequences(x_train_sequences, maxlen=max_length, padding='post', truncating='post')
    x_test_sequences = tokenizer.texts_to_sequences(x_test)
    x_test_padded = pad_sequences(x_test_sequences, maxlen=max_length, padding='post', truncating='post')

    # Build the model: define the architecture, compile, train, and save it
    # to disk; reload a previously trained model if one already exists
    if os.path.isdir('lstm_model'):
        model = load_model('lstm_model')
        model.summary()
    else:
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim))
        model.add(Dropout(0.5))
        model.add(Bidirectional(LSTM(embedding_dim)))
        model.add(Dense(6, activation='softmax'))
        model.summary()

        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'],
        )

        history = model.fit(x_train_padded, y_train, epochs=epochs)

        model.save('lstm_model')

    # Predict on the test set and take the most probable class per sample
    y_pred = model.predict(x_test_padded)
    y_pred_cat = np.argmax(y_pred, axis=1)

    # Evaluate the model with per-class precision, recall, and F1
    results_text = classification_report(y_test, y_pred_cat, zero_division=1)
    results_dict = classification_report(y_test, y_pred_cat, zero_division=1, output_dict=True)

    print(colored('---------- MODEL 2: LSTM ----------', 'blue'))
    print(colored(results_text, 'blue'))

    return results_dict
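

# Minimal usage sketch (not part of the original module): it assumes train/test
# CSV files with an 'Input' text column and a 'Sentiment' integer label in 0-5,
# matching the Dense(6, activation='softmax') output layer above. The file
# names below are hypothetical placeholders.
if __name__ == '__main__':
    df_train = pd.read_csv('train.csv')  # hypothetical path
    df_test = pd.read_csv('test.csv')    # hypothetical path
    metrics = lstm(df_train, df_test)
    # metrics is the nested dict from classification_report(output_dict=True),
    # e.g. metrics['accuracy'] or metrics['weighted avg']['f1-score']
    print(metrics['accuracy'])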