Downloading the dataset
Setting up Kaggle credentials
import os
os.environ['KAGGLE_USERNAME'] = 'worldwidepaniel'
os.environ['KAGGLE_KEY'] = '0a2ff35462d562da0e8b76b3bf22d7b7'
!pip install kaggle
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: kaggle in /home/students/s495716/.local/lib/python3.9/site-packages (1.6.6)
Requirement already satisfied: bleach in /usr/local/lib/python3.9/dist-packages (from kaggle) (5.0.1)
Requirement already satisfied: certifi in /usr/local/lib/python3.9/dist-packages (from kaggle) (2022.9.14)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.9/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: python-slugify in /home/students/s495716/.local/lib/python3.9/site-packages (from kaggle) (8.0.4)
Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from kaggle) (2.28.1)
Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.16.0)
Requirement already satisfied: tqdm in /usr/local/lib/python3.9/dist-packages (from kaggle) (4.64.1)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.9/dist-packages (from kaggle) (1.26.12)
Requirement already satisfied: webencodings in /usr/local/lib/python3.9/dist-packages (from bleach->kaggle) (0.5.1)
Requirement already satisfied: text-unidecode>=1.3 in /home/students/s495716/.local/lib/python3.9/site-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.9/dist-packages (from requests->kaggle) (2.1.1)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->kaggle) (3.4)
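Instead of hardcoding the API key in the notebook, the Kaggle client can also read credentials from ~/.kaggle/kaggle.json. A minimal sketch of that alternative; write_kaggle_token is a hypothetical helper, the file path and JSON format follow Kaggle's documented convention:
# Alternative to environment variables: store credentials in ~/.kaggle/kaggle.json
# (hypothetical helper; fill in your own username/key)
import json, os
from pathlib import Path

def write_kaggle_token(username: str, key: str) -> None:
    config_dir = Path.home() / ".kaggle"
    config_dir.mkdir(exist_ok=True)
    token_path = config_dir / "kaggle.json"
    token_path.write_text(json.dumps({"username": username, "key": key}))
    os.chmod(token_path, 0o600)  # Kaggle warns about world-readable credential files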
Imports
import numpy as np
import pandas as pd
import kaggle
Downloading the dataset files
kaggle.api.authenticate()
kaggle.api.dataset_download_files('iabhishekofficial/mobile-price-classification', path='./', unzip=True)
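With unzip=True the archive is extracted into the current directory. A quick sanity check that the CSV files actually arrived (a sketch, assuming the default file names of this dataset):
# List the extracted CSV files in the working directory (sketch)
import os
print(sorted(f for f in os.listdir(".") if f.endswith(".csv")))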
Loading the data (only train.csv is used)
dataset = pd.read_csv("train.csv")
print(dataset)
      battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  \
0               842     0          2.2         0   1       0           7
1              1021     1          0.5         1   0       1          53
2               563     1          0.5         1   2       1          41
3               615     1          2.5         0   0       0          10
4              1821     1          1.2         0  13       1          44
...             ...   ...          ...       ...  ..     ...         ...
1995            794     1          0.5         1   0       1           2
1996           1965     1          2.6         1   0       0          39
1997           1911     0          0.9         1   1       1          36
1998           1512     0          0.9         0   4       1          46
1999            510     1          2.0         1   5       1          45

      m_dep  mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  \
0       0.6        188        2  ...         20       756  2549     9     7
1       0.7        136        3  ...        905      1988  2631    17     3
2       0.9        145        5  ...       1263      1716  2603    11     2
3       0.8        131        6  ...       1216      1786  2769    16     8
4       0.6        141        2  ...       1208      1212  1411     8     2
...     ...        ...      ...  ...        ...       ...   ...   ...   ...
1995    0.8        106        6  ...       1222      1890   668    13     4
1996    0.2        187        4  ...        915      1965  2032    11    10
1997    0.7        108        8  ...        868      1632  3057     9     1
1998    0.1        145        5  ...        336       670   869    18    10
1999    0.9        168        6  ...        483       754  3919    19     4

      talk_time  three_g  touch_screen  wifi  price_range
0            19        0             0     1            1
1             7        1             1     0            2
2             9        1             1     0            2
3            11        1             0     0            2
4            15        1             1     0            1
...         ...      ...           ...   ...          ...
1995         19        1             1     0            0
1996         16        1             1     1            2
1997          5        1             1     0            3
1998         19        1             1     1            0
1999          2        1             1     1            3

[2000 rows x 21 columns]
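As far as I can tell, the test.csv shipped with this Kaggle dataset is unlabeled (it has no price_range column), which is why only train.csv feeds the supervised steps below. If a combined frame were still wanted for inspection, a hedged sketch; the combined variable is not used further:
# Sketch only (not used below): inspect test.csv and concatenate it with train.csv;
# the test rows end up with NaN in price_range because they carry no labels.
test_df = pd.read_csv("test.csv")
print(test_df.columns)
combined = pd.concat([dataset, test_df], ignore_index=True, sort=False)
print(combined.shape)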
Splitting into predictors and the dependent variable
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
print(X)
[[8.420e+02 0.000e+00 2.200e+00 ... 0.000e+00 0.000e+00 1.000e+00]
 [1.021e+03 1.000e+00 5.000e-01 ... 1.000e+00 1.000e+00 0.000e+00]
 [5.630e+02 1.000e+00 5.000e-01 ... 1.000e+00 1.000e+00 0.000e+00]
 ...
 [1.911e+03 0.000e+00 9.000e-01 ... 1.000e+00 1.000e+00 0.000e+00]
 [1.512e+03 0.000e+00 9.000e-01 ... 1.000e+00 1.000e+00 1.000e+00]
 [5.100e+02 1.000e+00 2.000e+00 ... 1.000e+00 1.000e+00 1.000e+00]]
print(y)
[1 2 2 ... 3 0 3]
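The positional iloc split works because price_range is the last column. An equivalent, name-based variant that yields the same arrays (a sketch, assuming the column names shown in the preview above):
# Equivalent selection by column name instead of position (same result)
X = dataset.drop(columns=["price_range"]).values
y = dataset["price_range"].values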
Splitting into training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
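The four price_range classes in this dataset appear to be balanced, so a plain random split is normally fine; if the class proportions should be preserved exactly in both splits, train_test_split also accepts a stratify argument. An optional variant, not what the notebook uses (the _s names are placeholders and are not used below):
# Optional variant (not used below): stratify on y to keep class proportions
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)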
Extracting a validation set from the training set
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=1)
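The two splits compound: 20% of the 2000 rows become the test set, and 15% of the remaining 1600 become the validation set, leaving roughly 68% / 12% / 20% for train / validation / test (1360 / 240 / 400 rows). A quick sanity check:
# Expected shapes: (1360, 20) (240, 20) (400, 20)
print(X_train.shape, X_val.shape, X_test.shape)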
Feature scaling -> standardization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_val = sc.transform(X_val)
print(X_train)
[[ 0.61048768 -1.02080461  1.20382281 ...  0.57168984 -1.0344154  -1.0268304 ]
 [ 0.0354526  -1.02080461 -1.25001265 ...  0.57168984 -1.0344154   0.97387066]
 [ 0.38507393 -1.02080461 -1.25001265 ...  0.57168984 -1.0344154  -1.0268304 ]
 ...
 [-0.44297658  0.97961941 -0.88193733 ...  0.57168984  0.96672961  0.97387066]
 [ 1.14182009 -1.02080461  0.95843926 ...  0.57168984 -1.0344154   0.97387066]
 [ 0.09755639 -1.02080461 -1.25001265 ...  0.57168984 -1.0344154   0.97387066]]
print(X_test)
[[-0.48667924 -1.02080461  1.32651458 ...  0.57168984 -1.0344154  -1.0268304 ]
 [-0.56028373  0.97961941  0.3449804  ...  0.57168984  0.96672961 -1.0268304 ]
 [-1.46423887  0.97961941 -1.25001265 ...  0.57168984  0.96672961  0.97387066]
 ...
 [ 1.45923945  0.97961941  0.3449804  ...  0.57168984 -1.0344154  -1.0268304 ]
 [ 1.65935165 -1.02080461 -1.25001265 ...  0.57168984  0.96672961 -1.0268304 ]
 [-1.33543101  0.97961941 -1.0046291  ...  0.57168984 -1.0344154  -1.0268304 ]]
print(X_val)
[[-0.36017153 -1.02080461 -0.14578669 ...  0.57168984 -1.0344154  -1.0268304 ]
 [-0.87770309 -1.02080461  1.20382281 ...  0.57168984  0.96672961 -1.0268304 ]
 [-1.47343943 -1.02080461 -0.02309492 ...  0.57168984 -1.0344154  -1.0268304 ]
 ...
 [ 0.46557884  0.97961941 -1.25001265 ...  0.57168984  0.96672961  0.97387066]
 [-1.5516442  -1.02080461 -1.12732088 ...  0.57168984 -1.0344154  -1.0268304 ]
 [ 1.4109365  -1.02080461 -1.25001265 ...  0.57168984 -1.0344154  -1.0268304 ]]
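StandardScaler applies z = (x - mean) / std to each feature, with the mean and standard deviation estimated on the training split only and reused for validation and test (hence fit_transform above versus transform). A quick check that the standardized training features have mean close to 0 and standard deviation close to 1:
# Per-feature mean and std of the standardized training data (sketch);
# validation/test means stay only approximately 0 because they reuse the training statistics.
print(np.round(X_train.mean(axis=0), 3))
print(np.round(X_train.std(axis=0), 3))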