{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "39adcbc2-16ea-43b5-9724-a3cd67a8ae73",
   "metadata": {},
   "source": [
    "# Pobieranie datasetu"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7db8b2ca-5142-464f-8d9f-81e6d5f4c9bd",
   "metadata": {},
   "source": [
    "## Ustawianie danych uwierzytelniających do Kaggle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4c9a2caa-c19c-450f-8b52-bf8124a9e7b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from getpass import getpass\n",
    "from pathlib import Path\n",
    "\n",
    "# Kaggle credentials must never be hard-coded here: everything saved in the\n",
    "# notebook is shared together with the file. The API key that used to be\n",
    "# embedded in this cell has to be treated as leaked and revoked in the Kaggle\n",
    "# account settings.\n",
    "#\n",
    "# Values already present in the environment are reused as they are. If a\n",
    "# ~/.kaggle/kaggle.json token file exists nothing is asked for; otherwise every\n",
    "# missing value is requested interactively without echoing it. For\n",
    "# non-interactive runs export KAGGLE_USERNAME / KAGGLE_KEY beforehand.\n",
    "# Run this cell before the one that imports `kaggle`, so the credentials are in\n",
    "# place when the client authenticates.\n",
    "kaggle_token_file = Path.home() / '.kaggle' / 'kaggle.json'\n",
    "if not kaggle_token_file.exists():\n",
    "    for credential_var in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):\n",
    "        if not os.environ.get(credential_var):\n",
    "            os.environ[credential_var] = getpass(f'{credential_var}: ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "187d593d-4cb2-43e6-a640-da81916ba547",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Defaulting to user installation because normal site-packages is not writeable\n",
      "Requirement already satisfied: kaggle in /home/students/s495716/.local/lib/python3.9/site-packages (1.6.6)\n",
      "Requirement already satisfied: bleach in /usr/local/lib/python3.9/dist-packages (from kaggle) (5.0.1)\n",
      "Requirement already satisfied: certifi in /usr/local/lib/python3.9/dist-packages (from kaggle) (2022.9.14)\n",
      "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.9/dist-packages (from kaggle) (2.8.2)\n",
      "Requirement already satisfied: python-slugify in /home/students/s495716/.local/lib/python3.9/site-packages (from kaggle) (8.0.4)\n",
      "Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from kaggle) (2.28.1)\n",
      "Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.16.0)\n",
      "Requirement already satisfied: tqdm in /usr/local/lib/python3.9/dist-packages (from kaggle) (4.64.1)\n",
      "Requirement already satisfied: urllib3 in /usr/local/lib/python3.9/dist-packages (from kaggle) (1.26.12)\n",
      "Requirement already satisfied: webencodings in /usr/local/lib/python3.9/dist-packages (from bleach->kaggle) (0.5.1)\n",
      "Requirement already satisfied: text-unidecode>=1.3 in /home/students/s495716/.local/lib/python3.9/site-packages (from python-slugify->kaggle) (1.3)\n",
      "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.9/dist-packages (from requests->kaggle) (2.1.1)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->kaggle) (3.4)\n"
     ]
    }
   ],
   "source": [
    "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
    "# The version is pinned to the one recorded in this cell's last run so that a\n",
    "# fresh environment reproduces the same client.\n",
    "%pip install kaggle==1.6.6"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee00edf9-1014-4f8d-88d2-272619fd5983",
   "metadata": {},
   "source": [
    "## Importy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d156f51c-859e-4ec9-9867-9fc429bcef6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import kaggle"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad9902bd-65e8-420f-bf52-52125cc73409",
   "metadata": {},
   "source": [
    "## Pobranie datasetu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fdc8a3c4-e7b2-4076-9e31-b70120b5580d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Authenticate the Kaggle client, then download the dataset into the working\n",
    "# directory and unpack it there.\n",
    "kaggle.api.authenticate()\n",
    "kaggle.api.dataset_download_files(\n",
    "    'iabhishekofficial/mobile-price-classification',\n",
    "    path='./',\n",
    "    unzip=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cdb73db0-169d-4818-9601-2810c530cbb4",
   "metadata": {},
   "source": [
    "# Wczytanie danych z pliku train.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7ae7d6df-ba81-4349-9a93-a10ba5d153a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the training CSV (looked up in the working directory) into a DataFrame.\n",
    "dataset = pd.read_csv(filepath_or_buffer=\"train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5a2aed33-c25c-4b01-a261-968d9dfc7be7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  \\\n",
      "0               842     0          2.2         0   1       0           7   \n",
      "1              1021     1          0.5         1   0       1          53   \n",
      "2               563     1          0.5         1   2       1          41   \n",
      "3               615     1          2.5         0   0       0          10   \n",
      "4              1821     1          1.2         0  13       1          44   \n",
      "...             ...   ...          ...       ...  ..     ...         ...   \n",
      "1995            794     1          0.5         1   0       1           2   \n",
      "1996           1965     1          2.6         1   0       0          39   \n",
      "1997           1911     0          0.9         1   1       1          36   \n",
      "1998           1512     0          0.9         0   4       1          46   \n",
      "1999            510     1          2.0         1   5       1          45   \n",
      "\n",
      "      m_dep  mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  \\\n",
      "0       0.6        188        2  ...         20       756  2549     9     7   \n",
      "1       0.7        136        3  ...        905      1988  2631    17     3   \n",
      "2       0.9        145        5  ...       1263      1716  2603    11     2   \n",
      "3       0.8        131        6  ...       1216      1786  2769    16     8   \n",
      "4       0.6        141        2  ...       1208      1212  1411     8     2   \n",
      "...     ...        ...      ...  ...        ...       ...   ...   ...   ...   \n",
      "1995    0.8        106        6  ...       1222      1890   668    13     4   \n",
      "1996    0.2        187        4  ...        915      1965  2032    11    10   \n",
      "1997    0.7        108        8  ...        868      1632  3057     9     1   \n",
      "1998    0.1        145        5  ...        336       670   869    18    10   \n",
      "1999    0.9        168        6  ...        483       754  3919    19     4   \n",
      "\n",
      "      talk_time  three_g  touch_screen  wifi  price_range  \n",
      "0            19        0             0     1            1  \n",
      "1             7        1             1     0            2  \n",
      "2             9        1             1     0            2  \n",
      "3            11        1             0     0            2  \n",
      "4            15        1             1     0            1  \n",
      "...         ...      ...           ...   ...          ...  \n",
      "1995         19        1             1     0            0  \n",
      "1996         16        1             1     1            2  \n",
      "1997          5        1             1     0            3  \n",
      "1998         19        1             1     1            0  \n",
      "1999          2        1             1     1            3  \n",
      "\n",
      "[2000 rows x 21 columns]\n"
     ]
    }
   ],
   "source": [
    "# Preview the first rows as the cell's last expression: Jupyter then renders a\n",
    "# rich table instead of the wrapped plain-text dump produced by print().\n",
    "dataset.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cbc9aa88-4854-486f-9402-f9e52d626d44",
   "metadata": {},
   "source": [
    "# Podział na predyktory i zmienne zależne"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1e912383-b4ce-4d35-ba75-e850660abbbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predictors: every column except the last one. Target: the last column.\n",
    "# Both are handed on as NumPy arrays.\n",
    "X = dataset.iloc[:, :-1].to_numpy()\n",
    "y = dataset.iloc[:, -1].to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fa453618-4483-4f77-ac42-b44448ccde2b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[8.420e+02 0.000e+00 2.200e+00 ... 0.000e+00 0.000e+00 1.000e+00]\n",
      " [1.021e+03 1.000e+00 5.000e-01 ... 1.000e+00 1.000e+00 0.000e+00]\n",
      " [5.630e+02 1.000e+00 5.000e-01 ... 1.000e+00 1.000e+00 0.000e+00]\n",
      " ...\n",
      " [1.911e+03 0.000e+00 9.000e-01 ... 1.000e+00 1.000e+00 0.000e+00]\n",
      " [1.512e+03 0.000e+00 9.000e-01 ... 1.000e+00 1.000e+00 1.000e+00]\n",
      " [5.100e+02 1.000e+00 2.000e+00 ... 1.000e+00 1.000e+00 1.000e+00]]\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: show the feature matrix (long arrays are abbreviated with '...').\n",
    "print(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "cf06b8e2-b079-4891-8daf-d217b54dbe4f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1 2 2 ... 3 0 3]\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: show the target vector.\n",
    "print(y)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c42dc6b1-8a68-471a-a308-d5e2cb6bd6de",
   "metadata": {},
   "source": [
    "# Podział na dane testowe i treningowe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6da626e8-33bc-4ec1-bebf-c902b6a6586c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Hold out 20 % of the rows as the test set; the fixed seed keeps the split\n",
    "# reproducible between runs.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X,\n",
    "    y,\n",
    "    test_size=0.2,\n",
    "    random_state=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a44b61fc-af30-4692-a035-1f5f0f7cccb3",
   "metadata": {},
   "source": [
    "# Wyodrębnienie danych walidacyjnych z setu treningowego"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c6b15724-ec5a-4b3c-88dc-5e95ac8c447f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Carve the validation set out of the training portion: 15 % of the rows left\n",
    "# after the test split, i.e. roughly 12 % of the whole dataset.\n",
    "# `train_test_split` is already imported by the previous split cell, so it is\n",
    "# not imported a second time here.\n",
    "# NOTE: this cell rebinds X_train / y_train. Re-running it on its own would split\n",
    "# the already reduced training set again, so re-run the previous cell first.\n",
    "X_train, X_val, y_train, y_val = train_test_split(\n",
    "    X_train, y_train, test_size=0.15, random_state=1\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bd7bc9a2-1020-4c11-91aa-a63bfe7dfd9d",
   "metadata": {},
   "source": [
    "# Feature scaling -> standardyzacja"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "effe421e-c05b-4fe0-a535-985e4e1b47ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Standardise every feature. The scaler is fitted on the training split only;\n",
    "# the test and validation splits are transformed with the statistics learned\n",
    "# from the training data, so nothing from them leaks into the preprocessing.\n",
    "#\n",
    "# The results are bound to the names directly instead of being written back\n",
    "# with `X_train[:, :] = ...`: slice assignment casts the scaled values to the\n",
    "# dtype of the existing array and would silently truncate them if the arrays\n",
    "# were integer-typed.\n",
    "sc = StandardScaler()\n",
    "X_train = sc.fit_transform(X_train)\n",
    "X_test = sc.transform(X_test)\n",
    "X_val = sc.transform(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d2458a09-769d-46eb-bbfc-f626b854f54c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.61048768 -1.02080461  1.20382281 ...  0.57168984 -1.0344154\n",
      "  -1.0268304 ]\n",
      " [ 0.0354526  -1.02080461 -1.25001265 ...  0.57168984 -1.0344154\n",
      "   0.97387066]\n",
      " [ 0.38507393 -1.02080461 -1.25001265 ...  0.57168984 -1.0344154\n",
      "  -1.0268304 ]\n",
      " ...\n",
      " [-0.44297658  0.97961941 -0.88193733 ...  0.57168984  0.96672961\n",
      "   0.97387066]\n",
      " [ 1.14182009 -1.02080461  0.95843926 ...  0.57168984 -1.0344154\n",
      "   0.97387066]\n",
      " [ 0.09755639 -1.02080461 -1.25001265 ...  0.57168984 -1.0344154\n",
      "   0.97387066]]\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: show the training features after scaling.\n",
    "print(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "a65d9f5b-6ea0-444a-a3ab-eccec041e932",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-0.48667924 -1.02080461  1.32651458 ...  0.57168984 -1.0344154\n",
      "  -1.0268304 ]\n",
      " [-0.56028373  0.97961941  0.3449804  ...  0.57168984  0.96672961\n",
      "  -1.0268304 ]\n",
      " [-1.46423887  0.97961941 -1.25001265 ...  0.57168984  0.96672961\n",
      "   0.97387066]\n",
      " ...\n",
      " [ 1.45923945  0.97961941  0.3449804  ...  0.57168984 -1.0344154\n",
      "  -1.0268304 ]\n",
      " [ 1.65935165 -1.02080461 -1.25001265 ...  0.57168984  0.96672961\n",
      "  -1.0268304 ]\n",
      " [-1.33543101  0.97961941 -1.0046291  ...  0.57168984 -1.0344154\n",
      "  -1.0268304 ]]\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: show the test features after scaling.\n",
    "print(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "9099df76-0c07-448d-ac20-597c642a3d15",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-0.36017153 -1.02080461 -0.14578669 ...  0.57168984 -1.0344154\n",
      "  -1.0268304 ]\n",
      " [-0.87770309 -1.02080461  1.20382281 ...  0.57168984  0.96672961\n",
      "  -1.0268304 ]\n",
      " [-1.47343943 -1.02080461 -0.02309492 ...  0.57168984 -1.0344154\n",
      "  -1.0268304 ]\n",
      " ...\n",
      " [ 0.46557884  0.97961941 -1.25001265 ...  0.57168984  0.96672961\n",
      "   0.97387066]\n",
      " [-1.5516442  -1.02080461 -1.12732088 ...  0.57168984 -1.0344154\n",
      "  -1.0268304 ]\n",
      " [ 1.4109365  -1.02080461 -1.25001265 ...  0.57168984 -1.0344154\n",
      "  -1.0268304 ]]\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: show the validation features after scaling.\n",
    "print(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3bc95b69-17e2-44fc-bd03-61a436129ef0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}