ium_495716/Data_download.ipynb
2024-03-19 20:59:23 +01:00

418 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "39adcbc2-16ea-43b5-9724-a3cd67a8ae73",
"metadata": {},
"source": [
"# Pobieranie datasetu"
]
},
{
"cell_type": "markdown",
"id": "7db8b2ca-5142-464f-8d9f-81e6d5f4c9bd",
"metadata": {},
"source": [
"## Ustawianie danych do kaggle"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "4c9a2caa-c19c-450f-8b52-bf8124a9e7b2",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"# Never hardcode Kaggle credentials in the notebook: anything committed here is\n",
"# readable by everyone with access to the repository. Values already exported\n",
"# in the environment are reused; otherwise the user is prompted and the input\n",
"# is neither echoed nor stored in the notebook.\n",
"# NOTE(review): the API key that was previously committed in this cell is\n",
"# compromised and must be revoked in the Kaggle account settings.\n",
"for credential in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):\n",
"    if not os.environ.get(credential):\n",
"        os.environ[credential] = getpass(f'{credential}: ')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "187d593d-4cb2-43e6-a640-da81916ba547",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: kaggle in /home/students/s495716/.local/lib/python3.9/site-packages (1.6.6)\n",
"Requirement already satisfied: bleach in /usr/local/lib/python3.9/dist-packages (from kaggle) (5.0.1)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.9/dist-packages (from kaggle) (2022.9.14)\n",
"Requirement already satisfied: python-dateutil in /usr/local/lib/python3.9/dist-packages (from kaggle) (2.8.2)\n",
"Requirement already satisfied: python-slugify in /home/students/s495716/.local/lib/python3.9/site-packages (from kaggle) (8.0.4)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from kaggle) (2.28.1)\n",
"Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.16.0)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.9/dist-packages (from kaggle) (4.64.1)\n",
"Requirement already satisfied: urllib3 in /usr/local/lib/python3.9/dist-packages (from kaggle) (1.26.12)\n",
"Requirement already satisfied: webencodings in /usr/local/lib/python3.9/dist-packages (from bleach->kaggle) (0.5.1)\n",
"Requirement already satisfied: text-unidecode>=1.3 in /home/students/s495716/.local/lib/python3.9/site-packages (from python-slugify->kaggle) (1.3)\n",
"Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.9/dist-packages (from requests->kaggle) (2.1.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->kaggle) (3.4)\n"
]
}
],
"source": [
"%pip install kaggle==1.6.6"
]
},
{
"cell_type": "markdown",
"id": "ee00edf9-1014-4f8d-88d2-272619fd5983",
"metadata": {},
"source": [
"## Importy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d156f51c-859e-4ec9-9867-9fc429bcef6c",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import kaggle"
]
},
{
"cell_type": "markdown",
"id": "ad9902bd-65e8-420f-bf52-52125cc73409",
"metadata": {},
"source": [
"## Pobranie datasetu"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fdc8a3c4-e7b2-4076-9e31-b70120b5580d",
"metadata": {},
"outputs": [],
"source": [
"# Authenticate against Kaggle, then download the dataset archive into the\n",
"# current directory and unzip it there.\n",
"kaggle.api.authenticate()\n",
"kaggle.api.dataset_download_files(\n",
"    'iabhishekofficial/mobile-price-classification',\n",
"    path='./',\n",
"    unzip=True,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "cdb73db0-169d-4818-9601-2810c530cbb4",
"metadata": {},
"source": [
"# Wczytanie pliku train.csv"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7ae7d6df-ba81-4349-9a93-a10ba5d153a7",
"metadata": {},
"outputs": [],
"source": [
"# Load the training CSV from the working directory into a DataFrame.\n",
"dataset = pd.read_csv(filepath_or_buffer=\"train.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5a2aed33-c25c-4b01-a261-968d9dfc7be7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" battery_power blue clock_speed dual_sim fc four_g int_memory \\\n",
"0 842 0 2.2 0 1 0 7 \n",
"1 1021 1 0.5 1 0 1 53 \n",
"2 563 1 0.5 1 2 1 41 \n",
"3 615 1 2.5 0 0 0 10 \n",
"4 1821 1 1.2 0 13 1 44 \n",
"... ... ... ... ... .. ... ... \n",
"1995 794 1 0.5 1 0 1 2 \n",
"1996 1965 1 2.6 1 0 0 39 \n",
"1997 1911 0 0.9 1 1 1 36 \n",
"1998 1512 0 0.9 0 4 1 46 \n",
"1999 510 1 2.0 1 5 1 45 \n",
"\n",
" m_dep mobile_wt n_cores ... px_height px_width ram sc_h sc_w \\\n",
"0 0.6 188 2 ... 20 756 2549 9 7 \n",
"1 0.7 136 3 ... 905 1988 2631 17 3 \n",
"2 0.9 145 5 ... 1263 1716 2603 11 2 \n",
"3 0.8 131 6 ... 1216 1786 2769 16 8 \n",
"4 0.6 141 2 ... 1208 1212 1411 8 2 \n",
"... ... ... ... ... ... ... ... ... ... \n",
"1995 0.8 106 6 ... 1222 1890 668 13 4 \n",
"1996 0.2 187 4 ... 915 1965 2032 11 10 \n",
"1997 0.7 108 8 ... 868 1632 3057 9 1 \n",
"1998 0.1 145 5 ... 336 670 869 18 10 \n",
"1999 0.9 168 6 ... 483 754 3919 19 4 \n",
"\n",
" talk_time three_g touch_screen wifi price_range \n",
"0 19 0 0 1 1 \n",
"1 7 1 1 0 2 \n",
"2 9 1 1 0 2 \n",
"3 11 1 0 0 2 \n",
"4 15 1 1 0 1 \n",
"... ... ... ... ... ... \n",
"1995 19 1 1 0 0 \n",
"1996 16 1 1 1 2 \n",
"1997 5 1 1 0 3 \n",
"1998 19 1 1 1 0 \n",
"1999 2 1 1 1 3 \n",
"\n",
"[2000 rows x 21 columns]\n"
]
}
],
"source": [
"# Plain-text preview of the loaded frame (pandas abbreviates long output).\n",
"print(str(dataset))"
]
},
{
"cell_type": "markdown",
"id": "cbc9aa88-4854-486f-9402-f9e52d626d44",
"metadata": {},
"source": [
"# Podział na predyktory i zmienne zależne"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1e912383-b4ce-4d35-ba75-e850660abbbe",
"metadata": {},
"outputs": [],
"source": [
"# Every column except the last one is a predictor; the last column is the\n",
"# dependent variable. Both are converted to NumPy arrays.\n",
"X = dataset.iloc[:, :-1].to_numpy()\n",
"y = dataset.iloc[:, -1].to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "fa453618-4483-4f77-ac42-b44448ccde2b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[8.420e+02 0.000e+00 2.200e+00 ... 0.000e+00 0.000e+00 1.000e+00]\n",
" [1.021e+03 1.000e+00 5.000e-01 ... 1.000e+00 1.000e+00 0.000e+00]\n",
" [5.630e+02 1.000e+00 5.000e-01 ... 1.000e+00 1.000e+00 0.000e+00]\n",
" ...\n",
" [1.911e+03 0.000e+00 9.000e-01 ... 1.000e+00 1.000e+00 0.000e+00]\n",
" [1.512e+03 0.000e+00 9.000e-01 ... 1.000e+00 1.000e+00 1.000e+00]\n",
" [5.100e+02 1.000e+00 2.000e+00 ... 1.000e+00 1.000e+00 1.000e+00]]\n"
]
}
],
"source": [
"# Show the predictor matrix as text.\n",
"print(str(X))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "cf06b8e2-b079-4891-8daf-d217b54dbe4f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 2 2 ... 3 0 3]\n"
]
}
],
"source": [
"# Show the target vector as text.\n",
"print(str(y))"
]
},
{
"cell_type": "markdown",
"id": "c42dc6b1-8a68-471a-a308-d5e2cb6bd6de",
"metadata": {},
"source": [
"# Podział na dane testowe i treningowe"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6da626e8-33bc-4ec1-bebf-c902b6a6586c",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the rows as the test set. The remaining rows are kept under\n",
"# *_full names so that the validation split can be re-run without shrinking\n",
"# the training data a second time.\n",
"X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=1)"
]
},
{
"cell_type": "markdown",
"id": "a44b61fc-af30-4692-a035-1f5f0f7cccb3",
"metadata": {},
"source": [
"# Wyodrębnienie danych walidacyjnych z setu treningowego"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c6b15724-ec5a-4b3c-88dc-5e95ac8c447f",
"metadata": {},
"outputs": [],
"source": [
"# Carve 15% of the non-test rows out as the validation set. The inputs are the\n",
"# untouched *_full arrays, so this cell gives the same result on every run.\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.15, random_state=1)"
]
},
{
"cell_type": "markdown",
"id": "bd7bc9a2-1020-4c11-91aa-a63bfe7dfd9d",
"metadata": {},
"source": [
"# Feature scaling -> standardyzacja"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "effe421e-c05b-4fe0-a535-985e4e1b47ea",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# The scaler is fitted on the training split only; the test and validation\n",
"# splits are transformed with that same fitted scaler.\n",
"# The results are rebound instead of being written back with X[:, :] = ...:\n",
"# slice assignment casts into the existing array's dtype, which would silently\n",
"# truncate the standardized values if the features were loaded as integers.\n",
"sc = StandardScaler()\n",
"X_train = sc.fit_transform(X_train)\n",
"X_test = sc.transform(X_test)\n",
"X_val = sc.transform(X_val)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d2458a09-769d-46eb-bbfc-f626b854f54c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.61048768 -1.02080461 1.20382281 ... 0.57168984 -1.0344154\n",
" -1.0268304 ]\n",
" [ 0.0354526 -1.02080461 -1.25001265 ... 0.57168984 -1.0344154\n",
" 0.97387066]\n",
" [ 0.38507393 -1.02080461 -1.25001265 ... 0.57168984 -1.0344154\n",
" -1.0268304 ]\n",
" ...\n",
" [-0.44297658 0.97961941 -0.88193733 ... 0.57168984 0.96672961\n",
" 0.97387066]\n",
" [ 1.14182009 -1.02080461 0.95843926 ... 0.57168984 -1.0344154\n",
" 0.97387066]\n",
" [ 0.09755639 -1.02080461 -1.25001265 ... 0.57168984 -1.0344154\n",
" 0.97387066]]\n"
]
}
],
"source": [
"# Show the standardized training features as text.\n",
"print(str(X_train))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a65d9f5b-6ea0-444a-a3ab-eccec041e932",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[-0.48667924 -1.02080461 1.32651458 ... 0.57168984 -1.0344154\n",
" -1.0268304 ]\n",
" [-0.56028373 0.97961941 0.3449804 ... 0.57168984 0.96672961\n",
" -1.0268304 ]\n",
" [-1.46423887 0.97961941 -1.25001265 ... 0.57168984 0.96672961\n",
" 0.97387066]\n",
" ...\n",
" [ 1.45923945 0.97961941 0.3449804 ... 0.57168984 -1.0344154\n",
" -1.0268304 ]\n",
" [ 1.65935165 -1.02080461 -1.25001265 ... 0.57168984 0.96672961\n",
" -1.0268304 ]\n",
" [-1.33543101 0.97961941 -1.0046291 ... 0.57168984 -1.0344154\n",
" -1.0268304 ]]\n"
]
}
],
"source": [
"# Show the standardized test features as text.\n",
"print(str(X_test))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "9099df76-0c07-448d-ac20-597c642a3d15",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[-0.36017153 -1.02080461 -0.14578669 ... 0.57168984 -1.0344154\n",
" -1.0268304 ]\n",
" [-0.87770309 -1.02080461 1.20382281 ... 0.57168984 0.96672961\n",
" -1.0268304 ]\n",
" [-1.47343943 -1.02080461 -0.02309492 ... 0.57168984 -1.0344154\n",
" -1.0268304 ]\n",
" ...\n",
" [ 0.46557884 0.97961941 -1.25001265 ... 0.57168984 0.96672961\n",
" 0.97387066]\n",
" [-1.5516442 -1.02080461 -1.12732088 ... 0.57168984 -1.0344154\n",
" -1.0268304 ]\n",
" [ 1.4109365 -1.02080461 -1.25001265 ... 0.57168984 -1.0344154\n",
" -1.0268304 ]]\n"
]
}
],
"source": [
"# Show the standardized validation features as text.\n",
"print(str(X_val))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3bc95b69-17e2-44fc-bd03-61a436129ef0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}