dl_projekt/lstm.ipynb
2024-06-03 17:09:59 +02:00

837 lines
97 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"source": [
"# LSTM"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# Read the three pre-split review datasets from the working directory.\n",
"train, test, valid = (pd.read_csv(path) for path in (\"train.csv\", \"test.csv\", \"valid.csv\"))\n",
"\n",
"# Remap the label -1 to 0 in every split (the model below is trained with binary cross-entropy).\n",
"for frame in (train, test, valid):\n",
"    negative_rows = frame[\"review_score\"] == -1\n",
"    frame.loc[negative_rows, \"review_score\"] = 0"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Sprawdzanie długości najdłuższej recenzji (teoretycznie Steam zezwala na max 8000 znaków)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"train[\"seq_length\"] = train[\"review_text\"].apply(lambda x : len(x.split()))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": "count 43230.000000\nmean 74.154962\nstd 127.088261\nmin 0.000000\n25% 12.000000\n50% 31.000000\n75% 80.000000\nmax 1570.000000\nName: seq_length, dtype: float64"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[\"seq_length\"].describe()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Niektóre recenzje są bardzo długie, ale większość ma poniżej 100 słów. W celu przyspieszenia treningu usunę z zestawu treningowego przykłady dłuższe niż 200 słów.\n",
"\n",
"*Notka: najpierw próbowałem wytrenować model na sekwencjach długości 1600 tokenów (większych niż najdłuższa recenzja). Model się bardzo długo i bardzo źle trenował.*"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"# Discard training reviews longer than 200 whitespace-separated words.\n",
"too_long = train[\"seq_length\"] > 200\n",
"train.drop(index=train.index[too_long], inplace=True)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": "count 39571.000000\nmean 44.135124\nstd 44.780534\nmin 0.000000\n25% 11.000000\n50% 27.000000\n75% 62.000000\nmax 200.000000\nName: seq_length, dtype: float64"
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[\"seq_length\"].describe()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"import tensorflow as tf\n",
"\n",
"SEQ_PADDED_LENGTH = 200\n",
"VOCABULARY_SIZE = 4000\n",
"vectorizer = tf.keras.layers.TextVectorization(output_sequence_length=SEQ_PADDED_LENGTH, max_tokens=VOCABULARY_SIZE)\n",
"vectorizer.adapt(train[\"review_text\"])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": "4000"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(vectorizer.get_vocabulary())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"train[\"vectorized\"] = train[\"review_text\"].apply(vectorizer)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"test[\"vectorized\"] = test[\"review_text\"].apply(vectorizer)\n",
"valid[\"vectorized\"] = valid[\"review_text\"].apply(vectorizer)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: \"model\"\n",
"_________________________________________________________________\n",
" Layer (type) Output Shape Param # \n",
"=================================================================\n",
" input_1 (InputLayer) [(None, 200)] 0 \n",
" \n",
" embedding (Embedding) (None, 200, 128) 512128 \n",
" \n",
" bidirectional (Bidirectiona (None, 200, 128) 98816 \n",
" l) \n",
" \n",
" dropout (Dropout) (None, 200, 128) 0 \n",
" \n",
" bidirectional_1 (Bidirectio (None, 128) 98816 \n",
" nal) \n",
" \n",
" dense (Dense) (None, 1) 129 \n",
" \n",
"=================================================================\n",
"Total params: 709,889\n",
"Trainable params: 709,889\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
]
}
],
"source": [
"from keras.optimizers import Adam\n",
"import keras.layers as layers\n",
"import keras\n",
"\n",
"\n",
"def create_model(seq_length=None, vocabulary_size=None, embedding_dim=128,\n",
"                 lstm_units=64, dropout_rate=0.5, learning_rate=1e-3, mask_zero=False):\n",
"    \"\"\"Build and compile the stacked bidirectional LSTM classifier.\n",
"\n",
"    Called without arguments it builds the same network as before:\n",
"    Embedding -> BiLSTM (full sequence) -> Dropout -> BiLSTM -> Dense(1, sigmoid).\n",
"\n",
"    Args:\n",
"        seq_length: Number of token ids per input row. Defaults to the\n",
"            notebook-level SEQ_PADDED_LENGTH.\n",
"        vocabulary_size: Vocabulary size of the vectorizer. Defaults to the\n",
"            notebook-level VOCABULARY_SIZE. The embedding table is created\n",
"            with vocabulary_size + 1 rows.\n",
"        embedding_dim: Width of the token embeddings.\n",
"        lstm_units: Units per direction in each LSTM layer.\n",
"        dropout_rate: Dropout rate applied between the two LSTM layers.\n",
"        learning_rate: Learning rate of the Adam optimizer.\n",
"        mask_zero: Forwarded to the Embedding layer. False by default, so\n",
"            token id 0 is not masked unless explicitly requested.\n",
"\n",
"    Returns:\n",
"        A keras.Model compiled with binary cross-entropy loss and an\n",
"        accuracy metric.\n",
"    \"\"\"\n",
"    # Resolve the defaults at call time so a re-run of the constants cell is picked up.\n",
"    if seq_length is None:\n",
"        seq_length = SEQ_PADDED_LENGTH\n",
"    if vocabulary_size is None:\n",
"        vocabulary_size = VOCABULARY_SIZE\n",
"\n",
"    inputs = layers.Input(shape=(seq_length,))\n",
"    embedded = layers.Embedding(\n",
"        input_dim=vocabulary_size + 1,\n",
"        output_dim=embedding_dim,\n",
"        input_length=seq_length,\n",
"        mask_zero=mask_zero,\n",
"    )(inputs)\n",
"    # The first recurrent layer returns the whole sequence so a second one can be stacked on top.\n",
"    sequence_features = layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=True))(embedded)\n",
"    regularized = layers.Dropout(dropout_rate)(sequence_features)\n",
"    summary_features = layers.Bidirectional(layers.LSTM(lstm_units))(regularized)\n",
"    outputs = layers.Dense(1, activation=\"sigmoid\")(summary_features)\n",
"\n",
"    model = keras.Model(inputs=inputs, outputs=outputs)\n",
"    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])\n",
"    return model\n",
"model = create_model()\n",
"model.summary()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "TensorShape([200])"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.iloc[120][\"vectorized\"].shape"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"data": {
"text/plain": "[200]"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.iloc[120][\"vectorized\"].get_shape().as_list()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": "<tf.Tensor: shape=(200,), dtype=int64, numpy=\narray([ 225, 1120, 2, 113, 1, 1816, 3, 108, 97, 1417, 23,\n 12, 52, 19, 257, 10, 3, 52, 34, 8, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0], dtype=int64)>"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.iloc[120][\"vectorized\"]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Część recenzji nie zawierała tekstu, więc po usunięciu interpunkcji i znaków specjalnych były puste. Teksty te trzeba usunąć z materiału treningowego."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 17,
"outputs": [
{
"data": {
"text/plain": "shapes\n200 39452\n0 119\nName: count, dtype: int64"
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[\"shapes\"] = train[\"vectorized\"].apply(lambda x : x.get_shape().as_list()[0])\n",
"train[\"shapes\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 18,
"outputs": [
{
"data": {
"text/plain": "shapes\n200 39452\nName: count, dtype: int64"
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.drop(train[train[\"vectorized\"].map(lambda x : x.get_shape().as_list()[0])!=SEQ_PADDED_LENGTH].index, inplace=True)\n",
"train[\"shapes\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"data": {
"text/plain": " Unnamed: 0 review_text review_score vectorized\n42 4552590 !!! 1 ()\n124 5286261 . 1 ()\n259 4934066 ........ 1 ()\n468 5584357 . 1 ()\n717 2172088 =] 1 ()",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Unnamed: 0</th>\n <th>review_text</th>\n <th>review_score</th>\n <th>vectorized</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>42</th>\n <td>4552590</td>\n <td>!!!</td>\n <td>1</td>\n <td>()</td>\n </tr>\n <tr>\n <th>124</th>\n <td>5286261</td>\n <td>.</td>\n <td>1</td>\n <td>()</td>\n </tr>\n <tr>\n <th>259</th>\n <td>4934066</td>\n <td>........</td>\n <td>1</td>\n <td>()</td>\n </tr>\n <tr>\n <th>468</th>\n <td>5584357</td>\n <td>.</td>\n <td>1</td>\n <td>()</td>\n </tr>\n <tr>\n <th>717</th>\n <td>2172088</td>\n <td>=]</td>\n <td>1</td>\n <td>()</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#valid.drop(valid[valid[\"vectorized\"].map(lambda x : x.get_shape().as_list()[0])!=1600].index, inplace=True)\n",
"\n",
"empty_valid = valid[valid[\"vectorized\"].map(lambda x : x.get_shape().as_list()[0])==0]\n",
"empty_valid.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"\"0\" to indeks wypełnienia (padding) używany przez wektoryzator, więc puste recenzje w zbiorach testowym i walidacyjnym można zastąpić wektorem samych zer."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [],
"source": [
"#test.loc[test[\"vectorized\"].map(lambda x : x.get_shape().as_list()[0])!=SEQ_PADDED_LENGTH,\"vectorized\"] = tf.zeros((SEQ_PADDED_LENGTH,), dtype=tf.dtypes.int64)\n",
"#valid.loc[valid[\"vectorized\"].map(lambda x : x.get_shape().as_list()[0])!=SEQ_PADDED_LENGTH,\"vectorized\"] = tf.zeros((SEQ_PADDED_LENGTH,), dtype=tf.dtypes.int64)\n",
"#empty_valid[\"vectorized\"] = tf.zeros((len(empty_valid.index),1600), dtype=tf.dtypes.int64)\n",
"#empty_test[\"vectorized\"] = tf.zeros((len(empty_test.index),1600), dtype=tf.dtypes.int64)\n",
"\n",
"#empty_valid[\"vectorized\"].iloc[0]\n",
"\n",
"def vector_fix(x):\n",
"    \"\"\"Return `x` if it is a full-length token vector, otherwise an all-zero vector.\n",
"\n",
"    Args:\n",
"        x: Tensor produced by the vectorizer for a single review.\n",
"\n",
"    Returns:\n",
"        `x` unchanged when its shape is exactly (SEQ_PADDED_LENGTH,);\n",
"        otherwise a new int64 tensor holding SEQ_PADDED_LENGTH zeros.\n",
"    \"\"\"\n",
"    # Compare the whole shape instead of indexing dimension 0, so a scalar\n",
"    # (rank-0) tensor is replaced too rather than raising IndexError.\n",
"    if x.get_shape().as_list() == [SEQ_PADDED_LENGTH]:\n",
"        return x\n",
"    return tf.zeros((SEQ_PADDED_LENGTH,), dtype=tf.dtypes.int64)\n",
"\n",
"test[\"vectorized\"] = test[\"vectorized\"].apply(vector_fix)\n",
"valid[\"vectorized\"] = valid[\"vectorized\"].apply(vector_fix)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [],
"source": [
"#train[\"vectorized\"] = train[\"vectorized\"].apply(lambda x : x.numpy())\n",
"#valid[\"vectorized\"] = valid[\"vectorized\"].apply(lambda x : x.numpy())\n",
"#test[\"vectorized\"] = test[\"vectorized\"].apply(lambda x : x.numpy())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [
{
"data": {
"text/plain": "<tf.Tensor: shape=(200,), dtype=int64, numpy=\narray([ 41, 50, 1864, 20, 2, 201, 3, 90, 27, 98, 47,\n 4, 243, 50, 381, 184, 7, 139, 408, 71, 10, 5,\n 120, 14, 2, 688, 2, 3, 9, 48, 1, 30, 85,\n 31, 7, 314, 87, 12, 577, 6, 494, 10, 3, 63,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0], dtype=int64)>"
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.iloc[0][\"vectorized\"]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/3\n",
"1233/1233 [==============================] - 288s 230ms/step - loss: 0.4453 - accuracy: 0.7923 - val_loss: 0.3532 - val_accuracy: 0.8514\n",
"Epoch 2/3\n",
"1233/1233 [==============================] - 289s 235ms/step - loss: 0.3145 - accuracy: 0.8669 - val_loss: 0.3272 - val_accuracy: 0.8519\n",
"Epoch 3/3\n",
"1233/1233 [==============================] - 289s 234ms/step - loss: 0.2684 - accuracy: 0.8875 - val_loss: 0.3216 - val_accuracy: 0.8635\n"
]
}
],
"source": [
"# Targets: one label per review for each split.\n",
"train_y, valid_y, test_y = (frame[\"review_score\"].to_numpy() for frame in (train, valid, test))\n",
"\n",
"# Inputs: stack the per-row token tensors of each split into one 2-D array.\n",
"train_x, valid_x, test_x = (np.stack(frame[\"vectorized\"].values) for frame in (train, valid, test))\n",
"\n",
"# First training run: three epochs, validated on the validation split after each epoch.\n",
"history = model.fit(train_x, train_y, validation_data=(valid_x, valid_y), epochs=3)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [],
"source": [
"model.save(\"lstm_model.keras\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [
{
"data": {
"text/plain": "<matplotlib.legend.Legend at 0x1bf88e819a0>"
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from matplotlib import pyplot as plt\n",
"# Loss curves recorded by the most recent model.fit call.\n",
"plt.plot(history.history['loss'])\n",
"plt.plot(history.history['val_loss'])\n",
"plt.title('Wartość funkcji straty')\n",
"plt.ylabel('Strata')\n",
"plt.xlabel('Epoka')\n",
"# The second curve comes from the validation split passed to fit(), not from the test split.\n",
"plt.legend(['train', 'validation'], loc='upper left');"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [
{
"data": {
"text/plain": "<matplotlib.legend.Legend at 0x1bf8b19f490>"
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from matplotlib import pyplot as plt\n",
"# Accuracy curves recorded by the most recent model.fit call.\n",
"plt.plot(history.history['accuracy'])\n",
"plt.plot(history.history['val_accuracy'])\n",
"plt.title('model accuracy')\n",
"plt.ylabel('accuracy')\n",
"plt.xlabel('epoch')\n",
"# The second curve comes from the validation split passed to fit(), not from the test split.\n",
"plt.legend(['train', 'validation'], loc='upper left');"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Dodatkowy trening modelu"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/5\n",
"1233/1233 [==============================] - 273s 222ms/step - loss: 0.2408 - accuracy: 0.9019 - val_loss: 0.3459 - val_accuracy: 0.8605\n",
"Epoch 2/5\n",
"1233/1233 [==============================] - 272s 221ms/step - loss: 0.2180 - accuracy: 0.9105 - val_loss: 0.3498 - val_accuracy: 0.8656\n"
]
}
],
"source": [
"callback = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=1, restore_best_weights=True)\n",
"history = model.fit(train_x, train_y, validation_data=(valid_x, valid_y), epochs=5, callbacks=[callback])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 28,
"outputs": [],
"source": [
"model.save(\"lstm_model_v2.keras\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Testowanie i ewaluacja modelu"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 31,
"outputs": [],
"source": [
"import tensorflow as tf\n",
"def test_review_text(sentence):\n",
"    \"\"\"Print the model score and the predicted label for a single review.\n",
"\n",
"    Prints the sigmoid output of the model, followed by \"Negative review\"\n",
"    when the score rounds to 0 and \"Positive review\" otherwise.\n",
"\n",
"    Args:\n",
"        sentence: Raw review text.\n",
"    \"\"\"\n",
"    # vector_fix substitutes an all-zero vector when the vectorizer returns an\n",
"    # empty tensor (text without any tokens), which would otherwise break the reshape.\n",
"    vectorized = vector_fix(vectorizer(sentence))\n",
"    batch = tf.reshape(vectorized, shape=(1, SEQ_PADDED_LENGTH))\n",
"    # The model output for one row has shape (1, 1); take the single element explicitly.\n",
"    score = float(model(batch)[0, 0])\n",
"    print(score)\n",
"    if round(score) == 0:\n",
"        print(\"Negative review\")\n",
"    else:\n",
"        print(\"Positive review\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 32,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.02259424328804016\n",
"Negative review\n"
]
}
],
"source": [
"test_review_text(\"A buggy, uninspired mess\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 33,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.066298708319664\n",
"Negative review\n"
]
}
],
"source": [
"test_review_text(\"This game is bad\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 34,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9277510643005371\n",
"Positive review\n"
]
}
],
"source": [
"test_review_text(\"This game destroyed my life\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 35,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.990617036819458\n",
"Positive review\n"
]
}
],
"source": [
"test_review_text(\"Best game I've ever played\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 36,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9053470492362976\n",
"Positive review\n"
]
}
],
"source": [
"test_review_text(\"Fun cooperative play with scalable difficulty. Rapid path to get into a game with friends or open public games. \")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.3265230357646942\n",
"Negative review\n"
]
}
],
"source": [
"test_review_text(\"Deliriously buggy. Fun if/when it works properly. Wait and see if they actually QA the next few patches before you play.\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 38,
"outputs": [],
"source": [
"# Predict in batches rather than pushing the whole test set through the model in one call,\n",
"# and flatten the (n, 1) output so the column holds one plain score per review.\n",
"test[\"model_predictions\"] = model.predict(np.stack(test[\"vectorized\"].values), verbose=0).ravel()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"outputs": [],
"source": [
"test[\"model_predictions\"] = test[\"model_predictions\"].apply(lambda x : round(float(x)))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 40,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.86\n",
"Precision: 0.97\n",
"Recall: 0.86\n",
"F1 Score: 0.91\n"
]
}
],
"source": [
"def get_metrics(df=None):\n",
"    \"\"\"Print accuracy, precision, recall and F1 score of the binary predictions.\n",
"\n",
"    Predictions are rounded to the nearest integer before being compared,\n",
"    and the same rounded values are used for all four metrics. A metric\n",
"    whose denominator is zero is reported as 0.0 instead of raising\n",
"    ZeroDivisionError.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a `review_score` column (true labels) and a\n",
"            `model_predictions` column (predicted labels). Defaults to the\n",
"            notebook-level `test` frame.\n",
"    \"\"\"\n",
"    if df is None:\n",
"        df = test\n",
"    predictions = np.rint(df[\"model_predictions\"].to_numpy())\n",
"    true_values = df[\"review_score\"].to_numpy()\n",
"\n",
"    def safe_ratio(numerator, denominator):\n",
"        # Guard against an empty denominator, e.g. when no review is predicted positive.\n",
"        return numerator / denominator if denominator else 0.0\n",
"\n",
"    TP_count = int(np.sum((true_values == 1) & (predictions == 1)))\n",
"    FP_count = int(np.sum((true_values == 0) & (predictions == 1)))\n",
"    FN_count = int(np.sum((true_values == 1) & (predictions == 0)))\n",
"\n",
"    accuracy = safe_ratio(int(np.sum(predictions == true_values)), len(true_values))\n",
"    precision = safe_ratio(TP_count, TP_count + FP_count)\n",
"    recall = safe_ratio(TP_count, TP_count + FN_count)\n",
"    F1_score = safe_ratio(2 * precision * recall, precision + recall)\n",
"\n",
"    print(f\"Accuracy: {accuracy:.2f}\")\n",
"    print(f\"Precision: {precision:.2f}\")\n",
"    print(f\"Recall: {recall:.2f}\")\n",
"    print(f\"F1 Score: {F1_score:.2f}\")\n",
"get_metrics()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}