{ "cells": [ { "cell_type": "markdown", "source": [ "### Importy" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true, "ExecuteTime": { "start_time": "2024-06-09T12:46:01.122296Z", "end_time": "2024-06-09T12:46:01.143307Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "import tensorflow as tf\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score, confusion_matrix\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from keras.preprocessing.text import Tokenizer\n", "from keras.models import Sequential\n", "from keras.layers import Embedding, LSTM, Dense, Dropout\n", "from keras.optimizers import Adam\n", "from transformers import pipeline\n", "from tqdm import tqdm\n", "from keras_preprocessing.sequence import pad_sequences\n", "from sklearn.metrics import classification_report" ] }, { "cell_type": "markdown", "source": [ "### Pobieranie danych" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 2, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning: Looks like you're using an outdated API Version, please consider updating (server 1.6.14 / client 1.6.11)\n", "Dataset URL: https://www.kaggle.com/datasets/shivamkushwaha/bbc-full-text-document-classification\n", "License(s): DbCL-1.0\n", "Downloading bbc-full-text-document-classification.zip to C:\\Users\\adamw\\PycharmProjects\\pythonProject\\dl_projekt\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", " 0%| | 0.00/5.59M [00:00, ?B/s]\n", " 18%|#7 | 1.00M/5.59M [00:00<00:03, 1.58MB/s]\n", " 36%|###5 | 2.00M/5.59M [00:00<00:01, 3.16MB/s]\n", " 72%|#######1 | 4.00M/5.59M [00:00<00:00, 6.34MB/s]\n",
"100%|##########| 5.59M/5.59M [00:00<00:00, 6.05MB/s]\n" ] } ], "source": [ "!kaggle datasets download -d shivamkushwaha/bbc-full-text-document-classification --unzip" ], "metadata": { "collapsed": false, "ExecuteTime": { "start_time": "2024-06-08T22:27:32.388353Z", "end_time": "2024-06-08T22:27:39.576352Z" } } }, { "cell_type": "markdown", "source": [ "## Sprawdzenie dostępności GPU" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 2, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Num GPUs Available: 1\n", "[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]\n" ] } ], "source": [ "# Check GPU availability\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n", "physical_devices = tf.config.experimental.list_physical_devices('GPU')\n", "print(\"Num GPUs Available: \", len(physical_devices))\n", "print(tf.config.list_physical_devices('GPU'))\n" ], "metadata": { "collapsed": false, "ExecuteTime": { "start_time": "2024-06-09T12:44:59.718075Z", "end_time": "2024-06-09T12:44:59.731077Z" } } }, { "cell_type": "markdown", "source": [ "## Ładowanie danych" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 40, "outputs": [], "source": [ "datapath = 'bbc/'\n", "directory, file, title, text, label = [], [], [], [], []\n", "for dirname, _, filenames in os.walk(datapath):\n", " for filename in filenames:\n", " if filename == 'README.TXT':\n", " continue\n", " directory.append(dirname)\n", " file.append(filename)\n", " label.append(dirname.split('/')[-1])\n", " fullpathfile = os.path.join(dirname, filename)\n", " with open(fullpathfile, 'r', encoding=\"utf8\", errors='ignore') as infile:\n", " intext = ''\n", " firstline = True\n", " for line in infile:\n", " if firstline:\n", " title.append(line.replace('\\n', ''))\n", " firstline = False\n", " else:\n", " intext += ' ' + line.replace('\\n', '')\n", " text.append(intext)\n" ], 
"metadata": { "collapsed": false, "ExecuteTime": { "start_time": "2024-06-09T12:50:53.380989Z", "end_time": "2024-06-09T12:50:53.698989Z" } } }, { "cell_type": "markdown", "source": [ "## Konwersja na DataFrame" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 41, "outputs": [], "source": [ "df = pd.DataFrame(list(zip(directory, file, title, text, label)), columns=['directory', 'file', 'title', 'text', 'label'])\n", "df = df.filter(['title', 'text', 'label'], axis=1)" ], "metadata": { "collapsed": false, "ExecuteTime": { "start_time": "2024-06-09T12:50:53.930080Z", "end_time": "2024-06-09T12:50:53.986076Z" } } }, { "cell_type": "code", "execution_count": 42, "outputs": [ { "data": { "text/plain": " title \\\n0 Ad sales boost Time Warner profit \n1 Dollar gains on Greenspan speech \n2 Yukos unit buyer faces loan claim \n3 High fuel prices hit BA's profits \n4 Pernod takeover talk lifts Domecq \n\n text label \n0 Quarterly profits at US media giant TimeWarn... business \n1 The dollar has hit its highest level against... business \n2 The owners of embattled Russian oil giant Yu... business \n3 British Airways has blamed high fuel prices ... business \n4 Shares in UK drinks and food firm Allied Dom... business ", "text/html": "
\n | title | \ntext | \nlabel | \n
---|---|---|---|
0 | \nAd sales boost Time Warner profit | \nQuarterly profits at US media giant TimeWarn... | \nbusiness | \n
1 | \nDollar gains on Greenspan speech | \nThe dollar has hit its highest level against... | \nbusiness | \n
2 | \nYukos unit buyer faces loan claim | \nThe owners of embattled Russian oil giant Yu... | \nbusiness | \n
3 | \nHigh fuel prices hit BA's profits | \nBritish Airways has blamed high fuel prices ... | \nbusiness | \n
4 | \nPernod takeover talk lifts Domecq | \nShares in UK drinks and food firm Allied Dom... | \nbusiness | \n