From 6f4035f7592161ff61622d456cbb10698496e37b Mon Sep 17 00:00:00 2001
From: Adrian Klessa <50918271+AdrianKlessa@users.noreply.github.com>
Date: Tue, 4 Jun 2024 15:30:43 +0200
Subject: [PATCH] Pretrained transformer

---
 transformer.ipynb | 533 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 533 insertions(+)
 create mode 100644 transformer.ipynb

diff --git a/transformer.ipynb b/transformer.ipynb
new file mode 100644
index 0000000..d4b8d49
--- /dev/null
+++ b/transformer.ipynb
@@ -0,0 +1,533 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "train = pd.read_csv(\"train.csv\")\n",
+    "test = pd.read_csv(\"test.csv\")\n",
+    "valid = pd.read_csv(\"valid.csv\")\n",
+    "\n",
+    "train.loc[train[\"review_score\"]==-1, \"review_score\"]=0\n",
+    "test.loc[test[\"review_score\"]==-1, \"review_score\"]=0\n",
+    "valid.loc[valid[\"review_score\"]==-1, \"review_score\"]=0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "True"
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "torch.cuda.is_available()"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).\n",
+      "Using a pipeline without specifying a model name and revision in production is not recommended.\n",
+      "C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import pipeline\n",
+    "sentiment_pipeline = pipeline(\"sentiment-analysis\", device=0)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "outputs": [],
+   "source": [
+    "test[\"predicted_score\"] = sentiment_pipeline(test[\"review_text\"].tolist(), truncation=True)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "{'label': 'POSITIVE', 'score': 0.9997923970222473}"
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test.iloc[0][\"predicted_score\"]"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "outputs": [],
+   "source": [
+    "str_to_int_score = {\"POSITIVE\" : 1, \"NEGATIVE\" : 0}\n",
+    "\n",
+    "test[\"model_predictions\"] = test[\"predicted_score\"].apply(lambda x: str_to_int_score[x[\"label\"]])"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "   Unnamed: 0                                        review_text  \\\n0     1265039  I love the Fact you can do what EVER you want ...   \n1     3132003  Tony Hawk's without the Pro Skater. Finding ou...   \n2      880195                                  It's pretty good.   \n3      717128  This the best dungeon game I have played since...   \n4     5221356  Totally awesome game alone or with a friend. I...   \n\n   review_score                                    predicted_score  \\\n0             1  {'label': 'POSITIVE', 'score': 0.9997923970222...   \n1             1  {'label': 'POSITIVE', 'score': 0.9989967942237...   \n2             1  {'label': 'POSITIVE', 'score': 0.9998482465744...   \n3             1  {'label': 'POSITIVE', 'score': 0.9998807907104...   \n4             1  {'label': 'POSITIVE', 'score': 0.9998763799667...   \n\n   model_predictions  \n0                  1  \n1                  1  \n2                  1  \n3                  1  \n4                  1  ",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0</th>\n      <th>review_text</th>\n      <th>review_score</th>\n      <th>predicted_score</th>\n      <th>model_predictions</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1265039</td>\n      <td>I love the Fact you can do what EVER you want ...</td>\n      <td>1</td>\n      <td>{'label': 'POSITIVE', 'score': 0.9997923970222...</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>3132003</td>\n      <td>Tony Hawk's without the Pro Skater. Finding ou...</td>\n      <td>1</td>\n      <td>{'label': 'POSITIVE', 'score': 0.9989967942237...</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>880195</td>\n      <td>It's pretty good.</td>\n      <td>1</td>\n      <td>{'label': 'POSITIVE', 'score': 0.9998482465744...</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>717128</td>\n      <td>This the best dungeon game I have played since...</td>\n      <td>1</td>\n      <td>{'label': 'POSITIVE', 'score': 0.9998807907104...</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>5221356</td>\n      <td>Totally awesome game alone or with a friend. I...</td>\n      <td>1</td>\n      <td>{'label': 'POSITIVE', 'score': 0.9998763799667...</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test.head()"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.77\n",
+      "Precision: 0.97\n",
+      "Recall: 0.75\n",
+      "F1 Score: 0.84\n"
+     ]
+    }
+   ],
+   "source": [
+    "def get_metrics():\n",
+    "    df = test\n",
+    "    predictions = df[\"model_predictions\"].to_numpy()\n",
+    "    true_values = df[\"review_score\"].to_numpy()\n",
+    "    accuracy = np.sum(np.rint(predictions) == true_values)/len(true_values)\n",
+    "    TN_count = len(df.query(\"`review_score`==0 and `model_predictions`==0\").index)\n",
+    "    TP_count = len(df.query(\"`review_score`==1 and `model_predictions`==1\").index)\n",
+    "    FP_count = len(df.query(\"`review_score`==0 and `model_predictions`==1\").index)\n",
+    "    FN_count = len(df.query(\"`review_score`==1 and `model_predictions`==0\").index)\n",
+    "    precision = TP_count/(TP_count+FP_count)\n",
+    "    recall = TP_count/(TP_count+FN_count)\n",
+    "    F1_score = (2*precision*recall)/(precision+recall)\n",
+    "    print(f\"Accuracy: {accuracy:.2f}\")\n",
+    "    print(f\"Precision: {precision:.2f}\")\n",
+    "    print(f\"Recall: {recall:.2f}\")\n",
+    "    print(f\"F1 Score: {F1_score:.2f}\")\n",
+    "get_metrics()"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Użyty domyślnie model (distilbert/distilbert-base-uncased-finetuned-sst-2-english) jest (wg. karty modelu) modelem do klasyfikacji tematów. Spróbujmy modelu, który jest dedykowany pod zadanie sentiment analysis dla recenzji."
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": "config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]",
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "5a2ae4a13fe0488999d71c40c63f5623"
+      }
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\huggingface_hub\\file_download.py:157: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\Adrian\\.cache\\huggingface\\hub\\models--nlptown--bert-base-multilingual-uncased-sentiment. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": "pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]",
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "a2c241c538e24be4b70a95e45391d716"
+      }
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": "tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]",
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "74745b6d436247ffa5341c579996fc18"
+      }
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": "vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]",
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "9b6c34bb4be441d6adc8144642103c07"
+      }
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": "special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]",
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "d3cca3ea6bf143be8389dbc8a4fcbc9a"
+      }
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "sentiment_pipeline = pipeline(model=\"nlptown/bert-base-multilingual-uncased-sentiment\", device=0)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "[{'label': '5 stars', 'score': 0.8000338673591614}]"
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sentiment_pipeline(test.iloc[0][\"review_text\"])"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "outputs": [],
+   "source": [
+    "test[\"predicted_score\"] = sentiment_pipeline(test[\"review_text\"].tolist(), truncation=True)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "predicted_score\n5 stars    6183\n4 stars    3952\n1 star     2399\n3 stars    1883\n2 stars    1299\nName: count, dtype: int64"
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test[\"predicted_score\"] = test[\"predicted_score\"].apply(lambda x : x[\"label\"])\n",
+    "test[\"predicted_score\"].value_counts()"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "outputs": [],
+   "source": [
+    "str_to_int_score = {\"5 stars\" : 1, \"4 stars\" : 1, \"3 stars\": 1, \"2 stars\": 0, \"1 star\": 0} # Arbitralnie ustalone progi\n",
+    "\n",
+    "test[\"model_predictions\"] = test[\"predicted_score\"].apply(lambda x: str_to_int_score[x])"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.86\n",
+      "Precision: 0.95\n",
+      "Recall: 0.88\n",
+      "F1 Score: 0.91\n"
+     ]
+    }
+   ],
+   "source": [
+    "get_metrics()"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Wyniki są teraz lepsze. W porównaniu z LSTM model ten ma odrobinę niższą precyzję i wyższy recall, czyli więcej recenzji (również błędnie) uznaje za pozytywne."
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "outputs": [],
+   "source": [
+    "def test_review_text(sentence):\n",
+    "    model_output = sentiment_pipeline([sentence])\n",
+    "    score = str_to_int_score[model_output[0][\"label\"]]\n",
+    "    print(score)\n",
+    "    if score==0:\n",
+    "        print(\"Negative review\")\n",
+    "    else:\n",
+    "        print(\"Positive review\")"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n",
+      "Negative review\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_review_text(\"A buggy, uninspired mess\")"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n",
+      "Negative review\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_review_text(\"This game is bad\")"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n",
+      "Negative review\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_review_text(\"This game destroyed my life\")"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1\n",
+      "Positive review\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_review_text(\"Best game I've ever played\")"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1\n",
+      "Positive review\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_review_text(\"Fun cooperative play with scalable difficulty. Rapid path to get into a game with friends or open public games. \")"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n",
+      "Negative review\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_review_text(\"Deliriously buggy. Fun if/when it works properly. Wait and see if they actually QA the next few patches before you play.\")"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}