# LSTM

In [1]:
import pandas as pd
import numpy as np

# Load the three pre-split review datasets from the working directory.
train = pd.read_csv("train.csv")
valid = pd.read_csv("valid.csv")
test = pd.read_csv("test.csv")

# Recode the label -1 as 0 in every split so "review_score" can serve as the
# target of a sigmoid / binary cross-entropy model.
for split_frame in (train, valid, test):
    is_minus_one = split_frame["review_score"] == -1
    split_frame.loc[is_minus_one, "review_score"] = 0

### Sprawdzanie długości najdłuższej recenzji w słowach (teoretycznie Steam zezwala na max 8000 znaków)

In [2]:
# Length of each training review in whitespace-separated tokens (not characters).
train["seq_length"] = [len(review.split()) for review in train["review_text"]]

In [3]:
# Longest training review, measured in whitespace-separated tokens.
longest_review_tokens = train["seq_length"].max()
print(longest_review_tokens)

1570


In [4]:
import tensorflow as tf

# Every review is padded/truncated to this many token ids; 1600 leaves a small
# margin above the longest training review found above (1570 tokens).
SEQ_PADDED_LENGTH = 1600
# Upper bound on the number of vocabulary entries the vectorizer keeps.
VOCABULARY_SIZE = 4000

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=VOCABULARY_SIZE,
    output_sequence_length=SEQ_PADDED_LENGTH,
)
# Learn the vocabulary from the training reviews only.
vectorizer.adapt(train["review_text"])

In [5]:
# Sanity check: number of entries in the vocabulary learned by adapt().
learned_vocabulary = vectorizer.get_vocabulary()
len(learned_vocabulary)

4000

In [6]:
# Run the adapted vectorizer on each training review individually; every cell
# of the new "vectorized" column holds the tensor returned for that review.
# NOTE(review): reviews left with no tokens come back with first dimension 0
# instead of SEQ_PADDED_LENGTH (see the shape counts further down).
train["vectorized"] = train["review_text"].apply(vectorizer)

In [7]:
# Same per-review vectorization for the test and validation splits, using the
# vocabulary that was adapted on the training reviews.
test["vectorized"] = test["review_text"].apply(vectorizer)
valid["vectorized"] = valid["review_text"].apply(vectorizer)

In [42]:
from keras.optimizers import Adam
import keras.layers as layers
import keras


def create_model():
    """Build and compile the LSTM sentiment classifier.

    Architecture: a sequence of ``SEQ_PADDED_LENGTH`` integer token ids ->
    16-dimensional embedding -> LSTM with 64 units -> one sigmoid unit.

    The input sequences are right-padded with token id 0, which
    ``TextVectorization`` reserves for padding. The embedding is therefore
    created with ``mask_zero=True`` so the LSTM skips the padded timesteps;
    without the mask its final state is computed after up to ~1600 padding
    steps and carries very little information about the actual review.

    Returns:
        A compiled ``keras.Model`` using binary cross-entropy loss, the Adam
        optimizer (learning rate 1e-3) and the accuracy metric.
    """
    input_layer = layers.Input(shape=(SEQ_PADDED_LENGTH,))
    # The sequence length is already fixed by the Input layer, so the
    # (deprecated in Keras 3) ``input_length`` argument is not needed.
    embedding_layer = layers.Embedding(
        input_dim=VOCABULARY_SIZE + 1,
        output_dim=16,
        mask_zero=True,
    )(input_layer)
    lstm_layer = layers.LSTM(64)(embedding_layer)
    output_layer = layers.Dense(1, activation="sigmoid")(lstm_layer)

    model = keras.Model(inputs=input_layer, outputs=output_layer)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy'],
    )
    return model


model = create_model()
model.summary()

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 1600)]            0         
                                                                 
 embedding_14 (Embedding)    (None, 1600, 16)          64016     
                                                                 
 lstm_15 (LSTM)              (None, 64)                20736     
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
Total params: 84,817
Trainable params: 84,817
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Spot check: shape of the vectorized review stored at row position 120.
train["vectorized"].iloc[120].shape

TensorShape([1600])

In [12]:
# The same shape as a plain Python list, the form used by the filters below.
train["vectorized"].iloc[120].get_shape().as_list()

[1600]

In [13]:
# Spot check: the token-id tensor itself (trailing zeros are padding).
train["vectorized"].iloc[120]

<tf.Tensor: shape=(1600,), dtype=int64, numpy=array([423, 635, 423, ...,   0,   0,   0], dtype=int64)>

### Część recenzji nie zawierała tekstu, więc po usunięciu interpunkcji i znaków specjalnych były puste. Teksty te trzeba usunąć z materiału treningowego.

In [14]:
# First dimension of every vectorized training review, then how often each
# distinct value occurs.
train["shapes"] = [vec.get_shape().as_list()[0] for vec in train["vectorized"]]
train["shapes"].value_counts()

shapes
1600    43111
0         119
Name: count, dtype: int64

In [15]:
# Drop, in place, every training row whose vectorized review does not have the
# expected padded length, then re-display the shape counts.
has_wrong_length = (
    train["vectorized"].map(lambda vec: vec.get_shape().as_list()[0])
    != SEQ_PADDED_LENGTH
)
train.drop(index=train.index[has_wrong_length], inplace=True)
train["shapes"].value_counts()

shapes
1600    43111
Name: count, dtype: int64

In [16]:
# Validation rows are not dropped; here we only look at the ones whose
# vectorized review came back with first dimension 0.
valid_first_dims = valid["vectorized"].map(lambda vec: vec.get_shape().as_list()[0])
empty_valid = valid[valid_first_dims == 0]
empty_valid.head()

Unnamed: 0.1,Unnamed: 0,review_text,review_score,vectorized
42,4552590,!!!,1,()
124,5286261,.,1,()
259,4934066,........,1,()
468,5584357,.,1,()
717,2172088,=],1,()


"0" to maskowane pozycje, puste dane w zbiorze testowym można nimi uzupełnić

In [17]:
#test.loc[test["vectorized"].map(lambda x : x.get_shape().as_list()[0])!=SEQ_PADDED_LENGTH,"vectorized"] = tf.zeros((SEQ_PADDED_LENGTH,), dtype=tf.dtypes.int64)
#valid.loc[valid["vectorized"].map(lambda x : x.get_shape().as_list()[0])!=SEQ_PADDED_LENGTH,"vectorized"] = tf.zeros((SEQ_PADDED_LENGTH,), dtype=tf.dtypes.int64)
#empty_valid["vectorized"] = tf.zeros((len(empty_valid.index),1600), dtype=tf.dtypes.int64)
#empty_test["vectorized"] = tf.zeros((len(empty_test.index),1600), dtype=tf.dtypes.int64)

#empty_valid["vectorized"].iloc[0]

def vector_fix(x):
    """Return ``x`` unchanged, or an all-padding vector if it has the wrong length.

    Args:
        x: A vectorized review (tensor) as stored in the "vectorized" column.

    Returns:
        ``x`` itself when its first dimension equals ``SEQ_PADDED_LENGTH``;
        otherwise a new int64 tensor of ``SEQ_PADDED_LENGTH`` zeros (token id 0
        is the padding value).
    """
    if x.get_shape().as_list()[0] == SEQ_PADDED_LENGTH:
        return x
    # Build the replacement from the same constant the check uses, so the two
    # cannot drift apart if the padded length is changed.
    return tf.zeros((SEQ_PADDED_LENGTH,), dtype=tf.dtypes.int64)

# Replace wrongly-shaped (empty) vectors in the test and validation splits with
# all-zero vectors, so no evaluation rows have to be dropped.
test["vectorized"] = test["vectorized"].apply(vector_fix)
valid["vectorized"] = valid["vectorized"].apply(vector_fix)

In [18]:
#train["vectorized"] = train["vectorized"].apply(lambda x : x.numpy())
#valid["vectorized"] = valid["vectorized"].apply(lambda x : x.numpy())
#test["vectorized"] = test["vectorized"].apply(lambda x : x.numpy())

In [19]:
# Spot check: the first remaining training row is still a tensor of token ids.
train["vectorized"].iloc[0]

<tf.Tensor: shape=(1600,), dtype=int64, numpy=array([ 96,   2, 824, ...,   0,   0,   0], dtype=int64)>

### Trening nawet mniejszego modelu na pełnym zbiorze danych zajmował bardzo dużo czasu, więc skróciłem zbiór treningowy (uwaga: poniższa komórka nadal używa pełnego zbioru `train`)

In [44]:
# Targets: one label per review, stacked into a flat NumPy array per split.
train_y, valid_y, test_y = (
    np.stack(split["review_score"].values) for split in (train, valid, test)
)

# Features: the per-review token-id tensors stacked into one 2-D array per split.
train_x, valid_x, test_x = (
    np.stack(split["vectorized"].values) for split in (train, valid, test)
)

# Early stopping on val_loss (patience 3, restoring the best weights) is left
# disabled here; the model simply trains for a fixed three epochs.
history = model.fit(
    train_x,
    train_y,
    validation_data=(valid_x, valid_y),
    epochs=3,
)

Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 