{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "39adcbc2-16ea-43b5-9724-a3cd67a8ae73",
   "metadata": {},
   "source": [
    "# Pobieranie datasetu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "187d593d-4cb2-43e6-a640-da81916ba547",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (not !pip) installs into the environment of the running kernel.\n",
    "# The version is pinned to the one this notebook was developed against.\n",
    "%pip install -q kaggle==1.6.6"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee00edf9-1014-4f8d-88d2-272619fd5983",
   "metadata": {},
   "source": [
    "## Importy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d156f51c-859e-4ec9-9867-9fc429bcef6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live here, except `kaggle`, which is imported in the download\n",
    "# cell because it needs the credentials to be configured first.\n",
    "import getpass\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5f0c2d1e-8a43-4b7e-9c15-2d6e7a91b3f4",
   "metadata": {},
   "source": [
    "## Konfiguracja"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3d9e6b2-71c4-4f08-b5a9-0c8e4d27f61b",
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR = Path('.')  # directory the Kaggle archive is unpacked into\n",
    "DATASET_SLUG = 'iabhishekofficial/mobile-price-classification'\n",
    "TARGET = 'price_range'  # label column of train.csv\n",
    "TEST_SIZE = 0.2  # fraction of all rows held out for testing\n",
    "VAL_SIZE = 0.15  # fraction of the remaining rows held out for validation\n",
    "SEED = 1  # random_state shared by both splits"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7db8b2ca-5142-464f-8d9f-81e6d5f4c9bd",
   "metadata": {},
   "source": [
    "## Ustawianie danych do kaggle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c9a2caa-c19c-450f-8b52-bf8124a9e7b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SECURITY: never hard-code the API key in the notebook. A key that was ever\n",
    "# committed must be treated as leaked and regenerated in the Kaggle account.\n",
    "# Credentials come from the KAGGLE_USERNAME / KAGGLE_KEY environment variables\n",
    "# or from ~/.kaggle/kaggle.json; prompt only when neither source is available.\n",
    "kaggle_json = Path.home() / '.kaggle' / 'kaggle.json'\n",
    "if not kaggle_json.exists():\n",
    "    if not os.environ.get('KAGGLE_USERNAME'):\n",
    "        os.environ['KAGGLE_USERNAME'] = input('Kaggle username: ')\n",
    "    if not os.environ.get('KAGGLE_KEY'):\n",
    "        # getpass keeps the key out of the cell output\n",
    "        os.environ['KAGGLE_KEY'] = getpass.getpass('Kaggle API key: ')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad9902bd-65e8-420f-bf52-52125cc73409",
   "metadata": {},
   "source": [
    "## Pobranie datasetu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fdc8a3c4-e7b2-4076-9e31-b70120b5580d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imported here rather than in the imports cell: the kaggle package\n",
    "# authenticates on import, so the credentials must already be in place.\n",
    "import kaggle\n",
    "\n",
    "kaggle.api.authenticate()\n",
    "kaggle.api.dataset_download_files(DATASET_SLUG, path=str(DATA_DIR), unzip=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cdb73db0-169d-4818-9601-2810c530cbb4",
   "metadata": {},
   "source": [
    "# Wczytanie pliku train.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ae7d6df-ba81-4349-9a93-a10ba5d153a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = pd.read_csv(DATA_DIR / 'train.csv')\n",
    "assert TARGET in dataset.columns, f'{TARGET!r} column missing from train.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a2aed33-c25c-4b01-a261-968d9dfc7be7",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cbc9aa88-4854-486f-9402-f9e52d626d44",
   "metadata": {},
   "source": [
    "# Podział na predyktory i zmienne zależne"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e912383-b4ce-4d35-ba75-e850660abbbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select by column name instead of position so a reordered CSV cannot\n",
    "# silently swap the label with a feature. Features are forced to float so\n",
    "# that standardised values are never truncated to integers.\n",
    "X = dataset.drop(columns=TARGET).to_numpy(dtype=float)\n",
    "y = dataset[TARGET].to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa453618-4483-4f77-ac42-b44448ccde2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "X[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf06b8e2-b079-4891-8daf-d217b54dbe4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "y[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c42dc6b1-8a68-471a-a308-d5e2cb6bd6de",
   "metadata": {},
   "source": [
    "# Podział na dane testowe i treningowe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6da626e8-33bc-4ec1-bebf-c902b6a6586c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The non-test part gets its own name so that the validation split below\n",
    "# does not overwrite its own input and both cells can be re-run safely.\n",
    "X_trainval, X_test, y_trainval, y_test = train_test_split(\n",
    "    X, y, test_size=TEST_SIZE, random_state=SEED\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a44b61fc-af30-4692-a035-1f5f0f7cccb3",
   "metadata": {},
   "source": [
    "# Wyodrębnienie danych walidacyjnych z setu treningowego"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6b15724-ec5a-4b3c-88dc-5e95ac8c447f",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_val, y_train, y_val = train_test_split(\n",
    "    X_trainval, y_trainval, test_size=VAL_SIZE, random_state=SEED\n",
    ")\n",
    "# Every row must land in exactly one of the three splits\n",
    "assert len(X_train) + len(X_val) + len(X_test) == len(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bd7bc9a2-1020-4c11-91aa-a63bfe7dfd9d",
   "metadata": {},
   "source": [
    "# Feature scaling -> standardyzacja"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "effe421e-c05b-4fe0-a535-985e4e1b47ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit on the training split only and reuse its statistics for the other\n",
    "# splits. Results go to new names, so the unscaled arrays stay available\n",
    "# and re-running this cell gives the same result.\n",
    "sc = StandardScaler()\n",
    "X_train_scaled = sc.fit_transform(X_train)\n",
    "X_test_scaled = sc.transform(X_test)\n",
    "X_val_scaled = sc.transform(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2458a09-769d-46eb-bbfc-f626b854f54c",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_scaled[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a65d9f5b-6ea0-444a-a3ab-eccec041e932",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test_scaled[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9099df76-0c07-448d-ac20-597c642a3d15",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_val_scaled[:5]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}