From 6f4035f7592161ff61622d456cbb10698496e37b Mon Sep 17 00:00:00 2001 From: Adrian Klessa <50918271+AdrianKlessa@users.noreply.github.com> Date: Tue, 4 Jun 2024 15:30:43 +0200 Subject: [PATCH] Pretrained transformer --- transformer.ipynb | 533 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 533 insertions(+) create mode 100644 transformer.ipynb diff --git a/transformer.ipynb b/transformer.ipynb new file mode 100644 index 0000000..d4b8d49 --- /dev/null +++ b/transformer.ipynb @@ -0,0 +1,533 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "train = pd.read_csv(\"train.csv\")\n", + "test = pd.read_csv(\"test.csv\")\n", + "valid = pd.read_csv(\"valid.csv\")\n", + "\n", + "train.loc[train[\"review_score\"]==-1, \"review_score\"]=0\n", + "test.loc[test[\"review_score\"]==-1, \"review_score\"]=0\n", + "valid.loc[valid[\"review_score\"]==-1, \"review_score\"]=0" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [ + { + "data": { + "text/plain": "True" + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "torch.cuda.is_available()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 16, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).\n", + "Using a pipeline without specifying a model name and revision in production is not recommended.\n", + "C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from transformers import pipeline\n", + "sentiment_pipeline = pipeline(\"sentiment-analysis\", device=0)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 17, + "outputs": [], + "source": [ + "test[\"predicted_score\"] = sentiment_pipeline(test[\"review_text\"].tolist(), truncation=True)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 18, + "outputs": [ + { + "data": { + "text/plain": "{'label': 'POSITIVE', 'score': 0.9997923970222473}" + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.iloc[0][\"predicted_score\"]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 20, + "outputs": [], + "source": [ + "str_to_int_score = {\"POSITIVE\" : 1, \"NEGATIVE\" : 0}\n", + "\n", + "test[\"model_predictions\"] = test[\"predicted_score\"].apply(lambda x: str_to_int_score[x[\"label\"]])" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 21, + "outputs": [ + { + "data": { + "text/plain": " Unnamed: 0 review_text \\\n0 1265039 I love the Fact you can do what EVER you want ... \n1 3132003 Tony Hawk's without the Pro Skater. Finding ou... \n2 880195 It's pretty good. \n3 717128 This the best dungeon game I have played since... \n4 5221356 Totally awesome game alone or with a friend. I... \n\n review_score predicted_score \\\n0 1 {'label': 'POSITIVE', 'score': 0.9997923970222... \n1 1 {'label': 'POSITIVE', 'score': 0.9989967942237... \n2 1 {'label': 'POSITIVE', 'score': 0.9998482465744... \n3 1 {'label': 'POSITIVE', 'score': 0.9998807907104... \n4 1 {'label': 'POSITIVE', 'score': 0.9998763799667... \n\n model_predictions \n0 1 \n1 1 \n2 1 \n3 1 \n4 1 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Unnamed: 0review_textreview_scorepredicted_scoremodel_predictions
01265039I love the Fact you can do what EVER you want ...1{'label': 'POSITIVE', 'score': 0.9997923970222...1
13132003Tony Hawk's without the Pro Skater. Finding ou...1{'label': 'POSITIVE', 'score': 0.9989967942237...1
2880195It's pretty good.1{'label': 'POSITIVE', 'score': 0.9998482465744...1
3717128This the best dungeon game I have played since...1{'label': 'POSITIVE', 'score': 0.9998807907104...1
45221356Totally awesome game alone or with a friend. I...1{'label': 'POSITIVE', 'score': 0.9998763799667...1
\n
" + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 22, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.77\n", + "Precision: 0.97\n", + "Recall: 0.75\n", + "F1 Score: 0.84\n" + ] + } + ], + "source": [ + "def get_metrics():\n", + " df = test\n", + " predictions = df[\"model_predictions\"].to_numpy()\n", + " true_values = df[\"review_score\"].to_numpy()\n", + " accuracy = np.sum(np.rint(predictions) == true_values)/len(true_values)\n", + " TN_count = len(df.query(\"`review_score`==0 and `model_predictions`==0\").index)\n", + " TP_count = len(df.query(\"`review_score`==1 and `model_predictions`==1\").index)\n", + " FP_count = len(df.query(\"`review_score`==0 and `model_predictions`==1\").index)\n", + " FN_count = len(df.query(\"`review_score`==1 and `model_predictions`==0\").index)\n", + " precision = TP_count/(TP_count+FP_count)\n", + " recall = TP_count/(TP_count+FN_count)\n", + " F1_score = (2*precision*recall)/(precision+recall)\n", + " print(f\"Accuracy: {accuracy:.2f}\")\n", + " print(f\"Precision: {precision:.2f}\")\n", + " print(f\"Recall: {recall:.2f}\")\n", + " print(f\"F1 Score: {F1_score:.2f}\")\n", + "get_metrics()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Użyty domyślnie model (distilbert/distilbert-base-uncased-finetuned-sst-2-english) jest (wg karty modelu) modelem do klasyfikacji tematów. Spróbujmy modelu, który jest przeznaczony do analizy sentymentu (sentiment analysis) recenzji." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 26, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": "config.json: 0%| | 0.00/953 [00:00