dl_projekt/lstm.ipynb
2024-06-03 16:32:11 +02:00

27 KiB
Raw Blame History

LSTM

import pandas as pd
import numpy as np

# Load the three dataset splits.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
valid = pd.read_csv("valid.csv")

# Remap the negative label from -1 to 0 so that targets are in {0, 1}.
for frame in (train, test, valid):
    is_negative = frame["review_score"] == -1
    frame.loc[is_negative, "review_score"] = 0

Sprawdzanie długości najdłuższej recenzji, mierzonej liczbą słów (teoretycznie Steam zezwala na maksymalnie 8000 znaków)

# Count whitespace-separated tokens per review and report the longest review.
token_counts = train["review_text"].apply(lambda text: len(text.split()))
train["seq_length"] = token_counts
print(token_counts.max())
1570
import tensorflow as tf

# Number of token ids produced per review by the vectorizer.
SEQ_PADDED_LENGTH = 1600
# Upper bound on the size of the vocabulary learned by the vectorizer.
VOCABULARY_SIZE = 4000

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=VOCABULARY_SIZE,
    output_sequence_length=SEQ_PADDED_LENGTH,
)
# Build the vocabulary from the training reviews only.
vectorizer.adapt(train["review_text"])
len(vectorizer.get_vocabulary())
4000
# Vectorize every review, one row at a time, in each of the three splits.
for frame in (train, test, valid):
    frame["vectorized"] = frame["review_text"].apply(vectorizer)
from keras.optimizers import Adam
import keras.layers as layers
import keras


def create_model():
    """Build and compile the binary sentiment classifier.

    Architecture: integer token ids -> 16-dimensional embedding -> LSTM(64)
    -> one sigmoid unit. The model is compiled with binary cross-entropy,
    Adam (learning rate 1e-3) and accuracy as the reported metric.

    Returns:
        A compiled ``keras.Model`` whose input has shape
        ``(batch, SEQ_PADDED_LENGTH)`` and whose output has shape
        ``(batch, 1)``.
    """
    input_layer = layers.Input(shape=(SEQ_PADDED_LENGTH,))
    # Token id 0 is the padding value written by TextVectorization.
    # mask_zero=True makes the embedding emit a mask so the LSTM skips the
    # padded timesteps instead of running over every trailing zero of each
    # SEQ_PADDED_LENGTH-long sequence. The sequence length itself is already
    # fixed by the Input layer above.
    embedding_layer = layers.Embedding(
        input_dim=VOCABULARY_SIZE + 1,
        output_dim=16,
        mask_zero=True,
    )(input_layer)
    lstm_layer = layers.LSTM(64)(embedding_layer)
    output_layer = layers.Dense(1, activation="sigmoid")(lstm_layer)

    model = keras.Model(inputs=input_layer, outputs=output_layer)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy'],
    )
    return model
# Instantiate the classifier and display its layer/parameter summary.
model = create_model()
model.summary()
Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_15 (InputLayer)       [(None, 1600)]            0         
                                                                 
 embedding_14 (Embedding)    (None, 1600, 16)          64016     
                                                                 
 lstm_15 (LSTM)              (None, 64)                20736     
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
=================================================================
Total params: 84,817
Trainable params: 84,817
Non-trainable params: 0
_________________________________________________________________
# Sanity check: shape of one vectorized training review.
train.iloc[120]["vectorized"].shape
TensorShape([1600])
# Same shape as a plain Python list (the form used by the length checks below).
train.iloc[120]["vectorized"].get_shape().as_list()
[1600]
# Display the token ids of that review.
train.iloc[120]["vectorized"]
<tf.Tensor: shape=(1600,), dtype=int64, numpy=array([423, 635, 423, ...,   0,   0,   0], dtype=int64)>

Część recenzji nie zawierała tekstu, więc po usunięciu interpunkcji i znaków specjalnych były one puste; takie teksty trzeba usunąć z materiału treningowego.

# Record the length of each vectorized review and tally the distinct lengths.
vector_lengths = train["vectorized"].map(lambda vec: vec.get_shape().as_list()[0])
train["shapes"] = vector_lengths
train["shapes"].value_counts()
shapes
1600    43111
0         119
Name: count, dtype: int64
# Remove training rows whose vector does not have the expected padded length.
has_wrong_length = (
    train["vectorized"].map(lambda vec: vec.get_shape().as_list()[0])
    != SEQ_PADDED_LENGTH
)
rows_to_drop = train[has_wrong_length].index
train.drop(rows_to_drop, inplace=True)
train["shapes"].value_counts()
shapes
1600    43111
Name: count, dtype: int64
#valid.drop(valid[valid["vectorized"].map(lambda x : x.get_shape().as_list()[0])!=1600].index, inplace=True)

# Select validation rows whose vectorized review came out empty (length 0).
valid_lengths = valid["vectorized"].map(lambda vec: vec.get_shape().as_list()[0])
empty_valid = valid[valid_lengths == 0]
empty_valid.head()
Unnamed: 0 review_text review_score vectorized
42 4552590 !!! 1 ()
124 5286261 . 1 ()
259 4934066 ........ 1 ()
468 5584357 . 1 ()
717 2172088 =] 1 ()

"0" oznacza pozycje wypełnienia (maskowane), więc puste recenzje w zbiorach testowym i walidacyjnym można uzupełnić samymi zerami

#test.loc[test["vectorized"].map(lambda x : x.get_shape().as_list()[0])!=SEQ_PADDED_LENGTH,"vectorized"] = tf.zeros((SEQ_PADDED_LENGTH,), dtype=tf.dtypes.int64)
#valid.loc[valid["vectorized"].map(lambda x : x.get_shape().as_list()[0])!=SEQ_PADDED_LENGTH,"vectorized"] = tf.zeros((SEQ_PADDED_LENGTH,), dtype=tf.dtypes.int64)
#empty_valid["vectorized"] = tf.zeros((len(empty_valid.index),1600), dtype=tf.dtypes.int64)
#empty_test["vectorized"] = tf.zeros((len(empty_test.index),1600), dtype=tf.dtypes.int64)

#empty_valid["vectorized"].iloc[0]

def vector_fix(x):
    """Return ``x`` if it has the padded length, otherwise an all-zero vector.

    Args:
        x: Tensor produced by the vectorizer for a single review.

    Returns:
        ``x`` itself when its first dimension equals ``SEQ_PADDED_LENGTH``;
        otherwise a new int64 zero tensor of shape ``(SEQ_PADDED_LENGTH,)``.
    """
    if x.get_shape().as_list()[0] == SEQ_PADDED_LENGTH:
        return x
    # Use the shared constant rather than a literal so the replacement vector
    # always matches the length that the check above expects.
    return tf.zeros((SEQ_PADDED_LENGTH,), dtype=tf.dtypes.int64)

# Replace malformed vectors in the evaluation splits (rows are kept, not dropped).
for frame in (test, valid):
    frame["vectorized"] = frame["vectorized"].apply(vector_fix)
#train["vectorized"] = train["vectorized"].apply(lambda x : x.numpy())
#valid["vectorized"] = valid["vectorized"].apply(lambda x : x.numpy())
#test["vectorized"] = test["vectorized"].apply(lambda x : x.numpy())
# Display the first vectorized training review.
train.iloc[0]["vectorized"]
<tf.Tensor: shape=(1600,), dtype=int64, numpy=array([ 96,   2, 824, ...,   0,   0,   0], dtype=int64)>

Trening nawet mniejszego modelu na pełnym zbiorze danych zajmował bardzo dużo czasu, więc skróciłem zbiór treningowy.

# Assemble dense NumPy inputs (token ids) and targets (labels) for each split.
train_x = np.stack(train["vectorized"].to_numpy())
train_y = np.stack(train["review_score"].to_numpy())

valid_x = np.stack(valid["vectorized"].to_numpy())
valid_y = np.stack(valid["review_score"].to_numpy())

test_x = np.stack(test["vectorized"].to_numpy())
test_y = np.stack(test["review_score"].to_numpy())

# Early stopping is currently disabled:
#callback = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True)
history = model.fit(
    train_x,
    train_y,
    validation_data=(valid_x, valid_y),
    epochs=3,
)
Epoch 1/3
1348/1348 [==============================] - 627s 465ms/step - loss: 0.6933 - accuracy: 0.4947 - val_loss: 0.6950 - val_accuracy: 0.1744
Epoch 2/3
 918/1348 [===================>..........] - ETA: 3:00 - loss: 0.6932 - accuracy: 0.4982
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In [44], line 16
     12 valid_x =  np.stack(valid["vectorized"].values)
     15 #callback = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True)
---> 16 history = model.fit(train_x, train_y, validation_data=(valid_x, valid_y), epochs=3)

File ~\miniconda3\lib\site-packages\keras\utils\traceback_utils.py:65, in filter_traceback.<locals>.error_handler(*args, **kwargs)
     63 filtered_tb = None
     64 try:
---> 65     return fn(*args, **kwargs)
     66 except Exception as e:
     67     filtered_tb = _process_traceback_frames(e.__traceback__)

File ~\miniconda3\lib\site-packages\keras\engine\training.py:1564, in Model.fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
   1556 with tf.profiler.experimental.Trace(
   1557     "train",
   1558     epoch_num=epoch,
   (...)
   1561     _r=1,
   1562 ):
   1563     callbacks.on_train_batch_begin(step)
-> 1564     tmp_logs = self.train_function(iterator)
   1565     if data_handler.should_sync:
   1566         context.async_wait()

File ~\miniconda3\lib\site-packages\tensorflow\python\util\traceback_utils.py:150, in filter_traceback.<locals>.error_handler(*args, **kwargs)
    148 filtered_tb = None
    149 try:
--> 150   return fn(*args, **kwargs)
    151 except Exception as e:
    152   filtered_tb = _process_traceback_frames(e.__traceback__)

File ~\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py:915, in Function.__call__(self, *args, **kwds)
    912 compiler = "xla" if self._jit_compile else "nonXla"
    914 with OptionalXlaContext(self._jit_compile):
--> 915   result = self._call(*args, **kwds)
    917 new_tracing_count = self.experimental_get_tracing_count()
    918 without_tracing = (tracing_count == new_tracing_count)

File ~\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py:947, in Function._call(self, *args, **kwds)
    944   self._lock.release()
    945   # In this case we have created variables on the first call, so we run the
    946   # defunned version which is guaranteed to never create variables.
--> 947   return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
    948 elif self._stateful_fn is not None:
    949   # Release the lock early so that multiple threads can perform the call
    950   # in parallel.
    951   self._lock.release()

File ~\miniconda3\lib\site-packages\tensorflow\python\eager\function.py:2496, in Function.__call__(self, *args, **kwargs)
   2493 with self._lock:
   2494   (graph_function,
   2495    filtered_flat_args) = self._maybe_define_function(args, kwargs)
-> 2496 return graph_function._call_flat(
   2497     filtered_flat_args, captured_inputs=graph_function.captured_inputs)

File ~\miniconda3\lib\site-packages\tensorflow\python\eager\function.py:1862, in ConcreteFunction._call_flat(self, args, captured_inputs, cancellation_manager)
   1858 possible_gradient_type = gradients_util.PossibleTapeGradientTypes(args)
   1859 if (possible_gradient_type == gradients_util.POSSIBLE_GRADIENT_TYPES_NONE
   1860     and executing_eagerly):
   1861   # No tape is watching; skip to running the function.
-> 1862   return self._build_call_outputs(self._inference_function.call(
   1863       ctx, args, cancellation_manager=cancellation_manager))
   1864 forward_backward = self._select_forward_and_backward_functions(
   1865     args,
   1866     possible_gradient_type,
   1867     executing_eagerly)
   1868 forward_function, args_with_tangents = forward_backward.forward()

File ~\miniconda3\lib\site-packages\tensorflow\python\eager\function.py:499, in _EagerDefinedFunction.call(self, ctx, args, cancellation_manager)
    497 with _InterpolateFunctionError(self):
    498   if cancellation_manager is None:
--> 499     outputs = execute.execute(
    500         str(self.signature.name),
    501         num_outputs=self._num_outputs,
    502         inputs=args,
    503         attrs=attrs,
    504         ctx=ctx)
    505   else:
    506     outputs = execute.execute_with_cancellation(
    507         str(self.signature.name),
    508         num_outputs=self._num_outputs,
   (...)
    511         ctx=ctx,
    512         cancellation_manager=cancellation_manager)

File ~\miniconda3\lib\site-packages\tensorflow\python\eager\execute.py:54, in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     52 try:
     53   ctx.ensure_initialized()
---> 54   tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
     55                                       inputs, attrs, num_outputs)
     56 except core._NotOkStatusException as e:
     57   if name is not None:

KeyboardInterrupt: