dl_projekt/lstm.ipynb

LSTM

import pandas as pd
import numpy as np

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
valid = pd.read_csv("valid.csv")

# Map the original -1 (negative) labels to 0 for binary cross-entropy
train.loc[train["review_score"] == -1, "review_score"] = 0
test.loc[test["review_score"] == -1, "review_score"] = 0
valid.loc[valid["review_score"] == -1, "review_score"] = 0
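The remap assumes the score column only ever contains -1 and 1; a quick check of the label counts (a sketch, not in the original notebook) confirms the labels are now binary:

train["review_score"].value_counts()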

Checking the length of the longest review (in theory, Steam allows a maximum of 8,000 characters)

train["seq_length"] = train["review_text"].apply(lambda x : len(x.split()))
train["seq_length"].describe()
count    43230.000000
mean        74.154962
std        127.088261
min          0.000000
25%         12.000000
50%         31.000000
75%         80.000000
max       1570.000000
Name: seq_length, dtype: float64

Some reviews are very long, but most are under 100 words. To speed up training, I will remove the longer examples from the training set.

_Note: I first tried to train the model on sequences 1600 tokens long (longer than the longest review). Training was very slow and the results were poor; an LSTM processes one token per step, so the per-example cost grows linearly with sequence length._

# Drop training examples longer than 200 words
train.drop(train[train.seq_length > 200].index, inplace=True)
train["seq_length"].describe()
count    39571.000000
mean        44.135124
std         44.780534
min          0.000000
25%         11.000000
50%         27.000000
75%         62.000000
max        200.000000
Name: seq_length, dtype: float64
import tensorflow as tf

SEQ_PADDED_LENGTH = 200
VOCABULARY_SIZE = 4000
vectorizer = tf.keras.layers.TextVectorization(output_sequence_length=SEQ_PADDED_LENGTH, max_tokens=VOCABULARY_SIZE)
vectorizer.adapt(train["review_text"])
len(vectorizer.get_vocabulary())
4000
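The first two vocabulary entries are special: index 0 is reserved for padding and index 1 for out-of-vocabulary tokens, so token ids run from 0 to VOCABULARY_SIZE-1 and the Embedding layer's input_dim of VOCABULARY_SIZE+1 below is a safe upper bound. A quick peek (a sketch, not in the original notebook):

print(vectorizer.get_vocabulary()[:5])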
train["vectorized"] = train["review_text"].apply(vectorizer)
test["vectorized"] = test["review_text"].apply(vectorizer)
valid["vectorized"] = valid["review_text"].apply(vectorizer)
from keras.optimizers import Adam
import keras.layers as layers
import keras


def create_model():
    input_layer = layers.Input(shape=(SEQ_PADDED_LENGTH,))
    embedding_layer = layers.Embedding(input_dim=VOCABULARY_SIZE+1, output_dim=128, input_length=SEQ_PADDED_LENGTH)(input_layer)
    # Two stacked bidirectional LSTMs with dropout in between
    lstm_layer = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(embedding_layer)
    dropout_layer = layers.Dropout(0.5)(lstm_layer)
    lstm_layer_2 = layers.Bidirectional(layers.LSTM(64))(dropout_layer)
    output_layer = layers.Dense(1,activation="sigmoid")(lstm_layer_2)
    model = keras.Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-3), metrics=['accuracy'])
    return model
model = create_model()
model.summary()
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding (Embedding)       (None, 200, 128)          512128    
                                                                 
 bidirectional (Bidirectiona  (None, 200, 128)         98816     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 200, 128)          0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
=================================================================
Total params: 709,889
Trainable params: 709,889
Non-trainable params: 0
_________________________________________________________________
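Note that the Embedding layer is built without masking, so both LSTMs also process the zero-padded positions. A variant using Keras' built-in masking (a hypothetical sketch, not the model trained above) would look like this:

def create_masked_model():
    # mask_zero=True makes the downstream LSTMs skip padded timesteps
    inp = layers.Input(shape=(SEQ_PADDED_LENGTH,))
    emb = layers.Embedding(input_dim=VOCABULARY_SIZE+1, output_dim=128, mask_zero=True)(inp)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(emb)
    x = layers.Dropout(0.5)(x)  # Dropout passes the mask through unchanged
    x = layers.Bidirectional(layers.LSTM(64))(x)
    out = layers.Dense(1, activation="sigmoid")(x)
    m = keras.Model(inputs=inp, outputs=out)
    m.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-3), metrics=['accuracy'])
    return m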
train.iloc[120]["vectorized"].shape
TensorShape([200])
train.iloc[120]["vectorized"].get_shape().as_list()
[200]
train.iloc[120]["vectorized"]
<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([ 225, 1120,    2,  113,    1, 1816,    3,  108,   97, 1417,   23,
         12,   52,   19,  257,   10,    3,   52,   34,    8,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0], dtype=int64)>

Some reviews contained no real text, so after punctuation and special characters were stripped they ended up empty; these texts have to be removed from the training material.

train["shapes"] = train["vectorized"].apply(lambda x : x.get_shape().as_list()[0])
train["shapes"].value_counts()
shapes
200    39452
0        119
Name: count, dtype: int64
train.drop(train[train["vectorized"].map(lambda x : x.get_shape().as_list()[0])!=SEQ_PADDED_LENGTH].index, inplace=True)
train["shapes"].value_counts()
shapes
200    39452
Name: count, dtype: int64

empty_valid = valid[valid["vectorized"].map(lambda x : x.get_shape().as_list()[0])==0]
empty_valid.head()
     Unnamed: 0 review_text  review_score vectorized
42      4552590         !!!             1         ()
124     5286261           .             1         ()
259     4934066    ........             1         ()
468     5584357           .             1         ()
717     2172088          =]             1         ()

"0" to maskowane pozycje, puste dane w zbiorze testowym można nimi uzupełnić

def vector_fix(x):
    # Replace empty (shape (0,)) vectors with an all-zero padded vector;
    # 0 is the padding index, so this is a neutral filler.
    if x.get_shape().as_list()[0] == SEQ_PADDED_LENGTH:
        return x
    return tf.zeros((SEQ_PADDED_LENGTH,), dtype=tf.dtypes.int64)

test["vectorized"] = test["vectorized"].apply(vector_fix)
valid["vectorized"] = valid["vectorized"].apply(vector_fix)
#train["vectorized"] = train["vectorized"].apply(lambda x : x.numpy())
#valid["vectorized"] = valid["vectorized"].apply(lambda x : x.numpy())
#test["vectorized"] = test["vectorized"].apply(lambda x : x.numpy())
train.iloc[0]["vectorized"]
<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([  41,   50, 1864,   20,    2,  201,    3,   90,   27,   98,   47,
          4,  243,   50,  381,  184,    7,  139,  408,   71,   10,    5,
        120,   14,    2,  688,    2,    3,    9,   48,    1,   30,   85,
         31,    7,  314,   87,   12,  577,    6,  494,   10,    3,   63,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0], dtype=int64)>
train_y = np.stack(train["review_score"].values)
valid_y = np.stack(valid["review_score"].values)
test_y = np.stack(test["review_score"].values)

train_x = np.stack(train["vectorized"].values)
test_x = np.stack(test["vectorized"].values)
valid_x = np.stack(valid["vectorized"].values)


history = model.fit(train_x, train_y, validation_data=(valid_x, valid_y), epochs=3)
Epoch 1/3
1233/1233 [==============================] - 288s 230ms/step - loss: 0.4453 - accuracy: 0.7923 - val_loss: 0.3532 - val_accuracy: 0.8514
Epoch 2/3
1233/1233 [==============================] - 289s 235ms/step - loss: 0.3145 - accuracy: 0.8669 - val_loss: 0.3272 - val_accuracy: 0.8519
Epoch 3/3
1233/1233 [==============================] - 289s 234ms/step - loss: 0.2684 - accuracy: 0.8875 - val_loss: 0.3216 - val_accuracy: 0.8635
model.save("lstm_model.keras")
from matplotlib import pyplot as plt

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss during training')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

Additional training of the model, this time with early stopping; restore_best_weights=True rolls the weights back to the epoch with the lowest validation loss.

callback = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=1, restore_best_weights=True)
history = model.fit(train_x, train_y, validation_data=(valid_x, valid_y), epochs=5, callbacks=[callback])
Epoch 1/5
1233/1233 [==============================] - 273s 222ms/step - loss: 0.2408 - accuracy: 0.9019 - val_loss: 0.3459 - val_accuracy: 0.8605
Epoch 2/5
1233/1233 [==============================] - 272s 221ms/step - loss: 0.2180 - accuracy: 0.9105 - val_loss: 0.3498 - val_accuracy: 0.8656
model.save("lstm_model_v2.keras")

Testing and evaluating the model

def test_review_text(sentence):
    # Vectorize a single review and reshape it into a batch of one
    vectorized = vectorizer(sentence)
    reshaped = tf.reshape(vectorized, shape=(1, SEQ_PADDED_LENGTH))
    score = float(model(reshaped))
    print(score)
    if round(score) == 0:
        print("Negative review")
    else:
        print("Positive review")
test_review_text("A buggy, uninspired mess")
0.02259424328804016
Negative review
test_review_text("This game is bad")
0.066298708319664
Negative review
test_review_text("This game destroyed my life")
0.9277510643005371
Positive review
test_review_text("Best game I've ever played")
0.990617036819458
Positive review
test_review_text("Fun cooperative play with scalable difficulty. Rapid path to get into a game with friends or open public games. ")
0.9053470492362976
Positive review
test_review_text("Deliriously buggy. Fun if/when it works properly. Wait and see if they actually QA the next few patches before you play.")
0.3265230357646942
Negative review
test["model_predictions"] = model(np.stack(test["vectorized"].values))
test["model_predictions"] = test["model_predictions"].apply(lambda x : round(float(x)))
def get_metrics(df=test):
    predictions = df["model_predictions"].to_numpy()
    predictions = df["model_predictions"].to_numpy()
    true_values = df["review_score"].to_numpy()
    accuracy = np.sum(np.rint(predictions) == true_values)/len(true_values)
    TN_count = len(df.query("`review_score`==0 and `model_predictions`==0").index)
    TP_count = len(df.query("`review_score`==1 and `model_predictions`==1").index)
    FP_count = len(df.query("`review_score`==0 and `model_predictions`==1").index)
    FN_count = len(df.query("`review_score`==1 and `model_predictions`==0").index)
    precision = TP_count/(TP_count+FP_count)
    recall = TP_count/(TP_count+FN_count)
    F1_score = (2*precision*recall)/(precision+recall)
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {F1_score:.2f}")
get_metrics()
Accuracy: 0.86
Precision: 0.97
Recall: 0.86
F1 Score: 0.91
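As a cross-check (a sketch assuming scikit-learn is available; it is not used elsewhere in this notebook), the same metrics can be computed with a library call:

from sklearn.metrics import classification_report
print(classification_report(test["review_score"], test["model_predictions"], digits=2))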