import os

import numpy as np
import tensorflow as tf
from keras import Input
from keras.layers import Dense
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy
from keras.models import load_model
from keras.optimizers import Adam
from keras.utils import to_categorical
from sklearn.metrics import classification_report
from termcolor import colored
from transformers import AutoTokenizer, TFBertModel


def bert(df_train, df_test):
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    print("Number of GPUs available: ", len(tf.config.list_physical_devices('GPU')))

    # Initialize the tokenizer and the pretrained BERT model
    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    bert = TFBertModel.from_pretrained('bert-base-cased')

    # Tokenize the input texts; max_length must match the model input length (68),
    # and padding to max_length keeps the tensor shapes fixed for both splits
    x_train = tokenizer(
        text=df_train.Input.tolist(),
        add_special_tokens=True,
        max_length=68,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True)
    x_test = tokenizer(
        text=df_test.Input.tolist(),
        add_special_tokens=True,
        max_length=68,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True)
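    # Note: with return_tensors='tf' the tokenizer returns a BatchEncoding whose
    # 'input_ids' and 'attention_mask' entries are tf.Tensors of shape
    # (num_examples, 68), matching the model inputs defined below.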

    # Convert the targets: keep the raw labels for evaluation,
    # one-hot encode them for training with categorical cross-entropy
    y_test = df_test.Sentiment
    y_train_cat = to_categorical(df_train.Sentiment)
    y_test_cat = to_categorical(df_test.Sentiment)
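    # For example, to_categorical([0, 2], num_classes=6) yields
    # [[1., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 0., 0.]].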

    # Create the model: load it from disk if a saved copy exists; otherwise
    # define the architecture, compile, train, and save it to disk
    if os.path.isdir('bert_model'):
        model = load_model('bert_model')
        model.summary()
    else:
        max_len = 68
        input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
        input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

        # Sequence of token embeddings (the last hidden state) from BERT,
        # max-pooled over the token axis and passed through a small dense head
        embeddings = bert.bert(input_ids, attention_mask=input_mask)[0]
        out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
        out = Dense(128, activation='relu')(out)
        out = tf.keras.layers.Dropout(0.1)(out)
        out = Dense(32, activation='relu')(out)
        # Six output units, one per sentiment class
        y = Dense(6, activation='sigmoid')(out)

        model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
        # layers[2] is the BERT main layer (after the two Input layers);
        # keep it trainable so BERT is fine-tuned together with the head
        model.layers[2].trainable = True
        model.summary()

        optimizer = Adam(
            learning_rate=5e-05,
            epsilon=1e-08,
            decay=0.01,
            clipnorm=1.0)

        loss = CategoricalCrossentropy()
        # 'balanced_accuracy' is only a display name here; the metric itself
        # is plain categorical accuracy
        metric = CategoricalAccuracy('balanced_accuracy')

        model.compile(
            optimizer=optimizer,
            loss=loss,
            metrics=[metric])

        train_history = model.fit(
            x={'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']},
            y=y_train_cat,
            validation_data=(
                {'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']},
                y_test_cat
            ),
            epochs=1,
            batch_size=16
        )

    # Save only a freshly trained model; a model loaded above is left untouched
    if not os.path.isdir('bert_model'):
        model.save('bert_model')

    # Predict on the test set
    predicted_raw = model.predict({'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']})
    y_pred = np.argmax(predicted_raw, axis=1)

    # Evaluate the results
    results_text = classification_report(y_test, y_pred)
    results_dict = classification_report(y_test, y_pred, output_dict=True)

    print(colored('---------- MODEL 3: BERT ----------', 'blue'))
    print(colored(results_text, 'blue'))

    return results_dict
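
# Minimal usage sketch (assumption: 'train.csv' and 'test.csv' are hypothetical
# files with an 'Input' text column and an integer 'Sentiment' label in 0..5,
# matching the 6-unit output layer above):
#
#     import pandas as pd
#
#     df_train = pd.read_csv('train.csv')
#     df_test = pd.read_csv('test.csv')
#     results = bert(df_train, df_test)
#     print(results['macro avg']['f1-score'])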