Pretrained transformer

This commit is contained in:
Adrian Klessa 2024-06-04 15:30:43 +02:00
parent 486385a5c6
commit 6f4035f759
1 changed files with 533 additions and 0 deletions

533
transformer.ipynb Normal file
View File

@ -0,0 +1,533 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"train = pd.read_csv(\"train.csv\")\n",
"test = pd.read_csv(\"test.csv\")\n",
"valid = pd.read_csv(\"valid.csv\")\n",
"\n",
"train.loc[train[\"review_score\"]==-1, \"review_score\"]=0\n",
"test.loc[test[\"review_score\"]==-1, \"review_score\"]=0\n",
"valid.loc[valid[\"review_score\"]==-1, \"review_score\"]=0"
]
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": "True"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torch\n",
"torch.cuda.is_available()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).\n",
"Using a pipeline without specifying a model name and revision in production is not recommended.\n",
"C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n"
]
}
],
"source": [
"from transformers import pipeline\n",
"sentiment_pipeline = pipeline(\"sentiment-analysis\", device=0)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 17,
"outputs": [],
"source": [
"test[\"predicted_score\"] = sentiment_pipeline(test[\"review_text\"].tolist(), truncation=True)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 18,
"outputs": [
{
"data": {
"text/plain": "{'label': 'POSITIVE', 'score': 0.9997923970222473}"
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.iloc[0][\"predicted_score\"]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [],
"source": [
"str_to_int_score = {\"POSITIVE\" : 1, \"NEGATIVE\" : 0}\n",
"\n",
"test[\"model_predictions\"] = test[\"predicted_score\"].apply(lambda x: str_to_int_score[x[\"label\"]])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [
{
"data": {
"text/plain": " Unnamed: 0 review_text \\\n0 1265039 I love the Fact you can do what EVER you want ... \n1 3132003 Tony Hawk's without the Pro Skater. Finding ou... \n2 880195 It's pretty good. \n3 717128 This the best dungeon game I have played since... \n4 5221356 Totally awesome game alone or with a friend. I... \n\n review_score predicted_score \\\n0 1 {'label': 'POSITIVE', 'score': 0.9997923970222... \n1 1 {'label': 'POSITIVE', 'score': 0.9989967942237... \n2 1 {'label': 'POSITIVE', 'score': 0.9998482465744... \n3 1 {'label': 'POSITIVE', 'score': 0.9998807907104... \n4 1 {'label': 'POSITIVE', 'score': 0.9998763799667... \n\n model_predictions \n0 1 \n1 1 \n2 1 \n3 1 \n4 1 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Unnamed: 0</th>\n <th>review_text</th>\n <th>review_score</th>\n <th>predicted_score</th>\n <th>model_predictions</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1265039</td>\n <td>I love the Fact you can do what EVER you want ...</td>\n <td>1</td>\n <td>{'label': 'POSITIVE', 'score': 0.9997923970222...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>3132003</td>\n <td>Tony Hawk's without the Pro Skater. Finding ou...</td>\n <td>1</td>\n <td>{'label': 'POSITIVE', 'score': 0.9989967942237...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>880195</td>\n <td>It's pretty good.</td>\n <td>1</td>\n <td>{'label': 'POSITIVE', 'score': 0.9998482465744...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>717128</td>\n <td>This the best dungeon game I have played since...</td>\n <td>1</td>\n <td>{'label': 'POSITIVE', 'score': 0.9998807907104...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5221356</td>\n <td>Totally awesome game alone or with a friend. I...</td>\n <td>1</td>\n <td>{'label': 'POSITIVE', 'score': 0.9998763799667...</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.77\n",
"Precision: 0.97\n",
"Recall: 0.75\n",
"F1 Score: 0.84\n"
]
}
],
"source": [
"def get_metrics():\n",
" df = test\n",
" predictions = df[\"model_predictions\"].to_numpy()\n",
" true_values = df[\"review_score\"].to_numpy()\n",
" accuracy = np.sum(np.rint(predictions) == true_values)/len(true_values)\n",
" TN_count = len(df.query(\"`review_score`==0 and `model_predictions`==0\").index)\n",
" TP_count = len(df.query(\"`review_score`==1 and `model_predictions`==1\").index)\n",
" FP_count = len(df.query(\"`review_score`==0 and `model_predictions`==1\").index)\n",
" FN_count = len(df.query(\"`review_score`==1 and `model_predictions`==0\").index)\n",
" precision = TP_count/(TP_count+FP_count)\n",
" recall = TP_count/(TP_count+FN_count)\n",
" F1_score = (2*precision*recall)/(precision+recall)\n",
" print(f\"Accuracy: {accuracy:.2f}\")\n",
" print(f\"Precision: {precision:.2f}\")\n",
" print(f\"Recall: {recall:.2f}\")\n",
" print(f\"F1 Score: {F1_score:.2f}\")\n",
"get_metrics()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Użyty domyślnie model (distilbert/distilbert-base-uncased-finetuned-sst-2-english) jest (wg. karty modelu) modelem do klasyfikacji tematów. Spróbujmy modelu, który jest dedykowany pod zadanie sentiment analysis dla recenzji."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": "config.json: 0%| | 0.00/953 [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "5a2ae4a13fe0488999d71c40c63f5623"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\huggingface_hub\\file_download.py:157: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\Adrian\\.cache\\huggingface\\hub\\models--nlptown--bert-base-multilingual-uncased-sentiment. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
" warnings.warn(message)\n"
]
},
{
"data": {
"text/plain": "pytorch_model.bin: 0%| | 0.00/669M [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "a2c241c538e24be4b70a95e45391d716"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": "tokenizer_config.json: 0%| | 0.00/39.0 [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "74745b6d436247ffa5341c579996fc18"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": "vocab.txt: 0%| | 0.00/872k [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "9b6c34bb4be441d6adc8144642103c07"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": "special_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "d3cca3ea6bf143be8389dbc8a4fcbc9a"
}
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sentiment_pipeline = pipeline(model=\"nlptown/bert-base-multilingual-uncased-sentiment\", device=0)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"data": {
"text/plain": "[{'label': '5 stars', 'score': 0.8000338673591614}]"
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sentiment_pipeline(test.iloc[0][\"review_text\"])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 28,
"outputs": [],
"source": [
"test[\"predicted_score\"] = sentiment_pipeline(test[\"review_text\"].tolist(), truncation=True)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 29,
"outputs": [
{
"data": {
"text/plain": "predicted_score\n5 stars 6183\n4 stars 3952\n1 star 2399\n3 stars 1883\n2 stars 1299\nName: count, dtype: int64"
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test[\"predicted_score\"] = test[\"predicted_score\"].apply(lambda x : x[\"label\"])\n",
"test[\"predicted_score\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 31,
"outputs": [],
"source": [
"str_to_int_score = {\"5 stars\" : 1, \"4 stars\" : 1, \"3 stars\": 1, \"2 stars\": 0, \"1 star\": 0} # Arbitralnie ustalone progi\n",
"\n",
"test[\"model_predictions\"] = test[\"predicted_score\"].apply(lambda x: str_to_int_score[x])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 32,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.86\n",
"Precision: 0.95\n",
"Recall: 0.88\n",
"F1 Score: 0.91\n"
]
}
],
"source": [
"get_metrics()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Wyniki są teraz lepsze. W porównaniu z LSTM model ten ma odrobinę niższą precyzję i wyższy recall, czyli więcej recenzji (również błędnie) uznaje za pozytywne."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 38,
"outputs": [],
"source": [
"def test_review_text(sentence):\n",
" model_output = sentiment_pipeline([sentence])\n",
" score = str_to_int_score[model_output[0][\"label\"]]\n",
" print(score)\n",
" if score==0:\n",
" print(\"Negative review\")\n",
" else:\n",
" print(\"Positive review\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"Negative review\n"
]
}
],
"source": [
"test_review_text(\"A buggy, uninspired mess\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 40,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"Negative review\n"
]
}
],
"source": [
"test_review_text(\"This game is bad\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 41,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"Negative review\n"
]
}
],
"source": [
"test_review_text(\"This game destroyed my life\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 42,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n",
"Positive review\n"
]
}
],
"source": [
"test_review_text(\"Best game I've ever played\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 43,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n",
"Positive review\n"
]
}
],
"source": [
"test_review_text(\"Fun cooperative play with scalable difficulty. Rapid path to get into a game with friends or open public games. \")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 44,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"Negative review\n"
]
}
],
"source": [
"test_review_text(\"Deliriously buggy. Fun if/when it works properly. Wait and see if they actually QA the next few patches before you play.\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}