ium_495719/IUM_02_Dane.ipynb
2024-03-19 23:33:31 +01:00

184 KiB
Raw Blame History

Import bibliotek

import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from google.colab import files
import pandas as pd

Pobranie danych

# Install the required libraries
!pip install --user kaggle # Kaggle API, used to download the dataset
!pip install --user pandas
Requirement already satisfied: kaggle in /usr/local/lib/python3.10/dist-packages (1.5.16)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.10/dist-packages (from kaggle) (1.16.0)
Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from kaggle) (2024.2.2)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.31.0)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from kaggle) (4.66.2)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.10/dist-packages (from kaggle) (8.0.4)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.0.7)
Requirement already satisfied: bleach in /usr/local/lib/python3.10/dist-packages (from kaggle) (6.1.0)
Requirement already satisfied: webencodings in /usr/local/lib/python3.10/dist-packages (from bleach->kaggle) (0.5.1)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.10/dist-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle) (3.6)
Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (1.5.3)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.4)
Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.25.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving kaggle.json to kaggle (4).json
mkdir: cannot create directory /root/.kaggle: File exists
# Download the dataset archive with the Kaggle CLI.
!kaggle datasets download -d muhammadbinimran/housing-price-prediction-data
housing-price-prediction-data.zip: Skipping, found more recently modified local copy (use --force to force download)
# Extract the archive; -o overwrites existing files without prompting.
!unzip -o housing-price-prediction-data.zip
Archive:  housing-price-prediction-data.zip
  inflating: housing_price_dataset.csv  

Wczytanie zbioru

# NOTE(review): pandas was already installed and imported earlier in this
# notebook, so this install looks redundant - confirm before removing.
!pip install --user pandas
Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (1.5.3)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.4)
Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.25.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)
# Load the extracted CSV into a DataFrame.
housing_price_dataset = pd.read_csv('housing_price_dataset.csv')
# Bare expression as the last line of the cell: the notebook displays the frame.
housing_price_dataset
SquareFeet Bedrooms Bathrooms Neighborhood YearBuilt Price
0 2126 4 1 Rural 1969 215355.283618
1 2459 3 2 Rural 1980 195014.221626
2 1860 2 1 Suburb 1970 306891.012076
3 2294 2 1 Urban 1996 206786.787153
4 2130 5 2 Suburb 2001 272436.239065
... ... ... ... ... ... ...
49995 1282 5 3 Rural 1975 100080.865895
49996 2854 2 2 Suburb 1988 374507.656727
49997 2979 5 3 Suburb 1962 384110.555590
49998 2596 5 2 Rural 1984 380512.685957
49999 1572 5 3 Rural 2011 221618.583218

50000 rows × 6 columns

Podział zbioru

# Fixed seed so the train/dev/test membership is identical on every run;
# without it each execution of this cell produces a different split.
RANDOM_STATE = 42

# First hold out 10% of all rows as the dev set ...
hp_train_test, hp_dev = train_test_split(
    housing_price_dataset, test_size=0.1, random_state=RANDOM_STATE
)
# ... then take exactly 1000 of the remaining rows as the test set;
# everything left over is the training set.
hp_train, hp_test = train_test_split(
    hp_train_test, test_size=1000, random_state=RANDOM_STATE
)

Kodowanie zmiennej kategorycznej (one-hot encoding)

housing_price_dataset["Neighborhood"].unique()
array(['Rural', 'Suburb', 'Urban'], dtype=object)
# One-hot encode Neighborhood using one category list taken from the full
# dataset. Encoding each split on its own would silently drop a dummy column
# from any split in which a category happens not to occur, leaving the splits
# with different column sets.
neighborhood_dtype = pd.CategoricalDtype(
    sorted(housing_price_dataset["Neighborhood"].dropna().unique())
)
hp_train = pd.get_dummies(
    hp_train.astype({"Neighborhood": neighborhood_dtype}), columns=["Neighborhood"]
)
hp_dev = pd.get_dummies(
    hp_dev.astype({"Neighborhood": neighborhood_dtype}), columns=["Neighborhood"]
)
hp_test = pd.get_dummies(
    hp_test.astype({"Neighborhood": neighborhood_dtype}), columns=["Neighborhood"]
)
hp_train
SquareFeet Bedrooms Bathrooms YearBuilt Price Neighborhood_Rural Neighborhood_Suburb Neighborhood_Urban
7616 2027 3 3 2013 237960.032012 0 0 1
47787 1292 5 1 2021 86121.435887 0 1 0
35285 1964 2 3 1970 208054.904277 0 0 1
8718 2581 4 2 1990 230475.439055 1 0 0
36680 2020 5 2 2011 278860.337033 0 0 1
... ... ... ... ... ... ... ... ...
22830 1245 5 1 1975 167679.728402 1 0 0
43699 2065 4 2 2021 257521.317661 0 1 0
21160 1967 3 1 1951 262332.423882 0 1 0
30915 2867 2 3 1990 311233.596471 0 0 1
19117 1631 3 1 1967 200594.974438 1 0 0

44000 rows × 8 columns

hp_dev
SquareFeet Bedrooms Bathrooms YearBuilt Price Neighborhood_Rural Neighborhood_Suburb Neighborhood_Urban
46301 2845 4 3 1954 354875.353057 0 0 1
10023 2362 4 3 2010 292371.871755 1 0 0
37044 1058 3 2 2007 155277.040755 0 1 0
17462 2891 5 1 2005 239120.147027 1 0 0
13804 2244 5 2 1966 254005.280471 1 0 0
... ... ... ... ... ... ... ... ...
35925 1684 4 1 1950 212224.505489 1 0 0
21799 1021 5 3 1995 139005.940982 1 0 0
4318 2741 4 2 1962 339074.548520 1 0 0
31492 2053 3 3 2014 239382.414641 0 0 1
26727 2963 3 1 2004 321585.613385 0 1 0

5000 rows × 8 columns

hp_test
SquareFeet Bedrooms Bathrooms YearBuilt Price Neighborhood_Rural Neighborhood_Suburb Neighborhood_Urban
49356 1174 5 3 1996 143866.306649 0 1 0
18656 1776 3 1 1964 125553.381347 0 0 1
27368 2524 2 2 2010 327261.077660 1 0 0
27243 1633 2 1 1953 241231.423110 1 0 0
24653 2811 4 2 1982 315724.479288 1 0 0
... ... ... ... ... ... ... ... ...
20015 2106 2 2 2014 216406.701646 0 0 1
40921 1704 3 3 1986 153770.810572 1 0 0
30027 1150 5 3 1973 138938.157678 0 0 1
16008 2822 2 2 1982 296193.916437 0 1 0
23919 1348 2 2 1983 133497.577808 1 0 0

1000 rows × 8 columns

Statystyki

Wielkość podzbiorów

housing_price_dataset.describe()
SquareFeet Bedrooms Bathrooms YearBuilt Price
count 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000
mean 2006.374680 3.498700 1.995420 1985.404420 224827.325151
std 575.513241 1.116326 0.815851 20.719377 76141.842966
min 1000.000000 2.000000 1.000000 1950.000000 -36588.165397
25% 1513.000000 3.000000 1.000000 1967.000000 169955.860225
50% 2007.000000 3.000000 2.000000 1985.000000 225052.141166
75% 2506.000000 4.000000 3.000000 2003.000000 279373.630052
max 2999.000000 5.000000 3.000000 2021.000000 492195.259972
hp_train.shape
(44000, 8)
hp_dev.shape
(5000, 8)
hp_test.shape
(1000, 8)

Statystyki kolumn

hp_train.describe()
SquareFeet Bedrooms Bathrooms YearBuilt Price Neighborhood_Rural Neighborhood_Suburb Neighborhood_Urban
count 44000.000000 44000.000000 44000.000000 44000.000000 44000.000000 44000.000000 44000.000000 44000.000000
mean 2006.261182 3.499636 1.997864 1985.416750 224928.983383 0.332841 0.333636 0.333523
std 575.306280 1.117315 0.815760 20.700559 76107.652516 0.471235 0.471517 0.471477
min 1000.000000 2.000000 1.000000 1950.000000 -36588.165397 0.000000 0.000000 0.000000
25% 1513.000000 3.000000 1.000000 1967.000000 170088.571867 0.000000 0.000000 0.000000
50% 2007.000000 3.000000 2.000000 1985.000000 225246.904135 0.000000 0.000000 0.000000
75% 2505.000000 5.000000 3.000000 2003.000000 279365.119289 1.000000 1.000000 1.000000
max 2999.000000 5.000000 3.000000 2021.000000 492195.259972 1.000000 1.000000 1.000000
hp_dev.describe()
SquareFeet Bedrooms Bathrooms YearBuilt Price Neighborhood_Rural Neighborhood_Suburb Neighborhood_Urban
count 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000
mean 2008.190800 3.487200 1.972600 1985.485400 224290.794530 0.337800 0.341600 0.320600
std 576.206366 1.104753 0.816077 20.960049 76778.005658 0.473007 0.474294 0.466754
min 1000.000000 2.000000 1.000000 1950.000000 -18159.685676 0.000000 0.000000 0.000000
25% 1510.750000 3.000000 1.000000 1967.000000 169103.151768 0.000000 0.000000 0.000000
50% 2007.000000 3.000000 2.000000 1985.000000 223614.924625 0.000000 0.000000 0.000000
75% 2503.000000 4.000000 3.000000 2004.000000 279651.548644 1.000000 1.000000 1.000000
max 2999.000000 5.000000 3.000000 2021.000000 467492.827823 1.000000 1.000000 1.000000
hp_test.describe()
SquareFeet Bedrooms Bathrooms YearBuilt Price Neighborhood_Rural Neighborhood_Suburb Neighborhood_Urban
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 2002.288000 3.515000 2.002000 1984.457000 223037.016061 0.342000 0.333000 0.325000
std 581.670136 1.130953 0.817719 20.330949 74475.155327 0.474617 0.471522 0.468609
min 1000.000000 2.000000 1.000000 1950.000000 -7550.504574 0.000000 0.000000 0.000000
25% 1507.250000 2.000000 1.000000 1967.000000 168905.529102 0.000000 0.000000 0.000000
50% 2021.500000 4.000000 2.000000 1983.000000 220416.485632 0.000000 0.000000 0.000000
75% 2524.000000 5.000000 3.000000 2002.000000 279628.697596 1.000000 1.000000 1.000000
max 2999.000000 5.000000 3.000000 2021.000000 437047.713441 1.000000 1.000000 1.000000
def print_sum(df_name, df, columns=None):
    """Print per-column totals of *df* under a *df_name* heading.

    Output is the heading line, then one ``<column> <total>`` line per
    column, then an empty line.

    Args:
        df_name: Label printed on the first line.
        df: Object indexable by column name whose columns provide ``.sum()``
            (a pandas DataFrame in this notebook).
        columns: Names of the columns to total. When omitted, the three
            ``Neighborhood_*`` dummy columns are used.
    """
    if columns is None:
        columns = (
            'Neighborhood_Rural',
            'Neighborhood_Suburb',
            'Neighborhood_Urban',
        )
    print(df_name)
    for col in columns:
        print(col, df[col].sum())
    # Blank line separates consecutive reports.
    print()

# Report the Neighborhood category counts for each split, in train/dev/test order.
for split_name, split_frame in (
    ("hp_train", hp_train),
    ("hp_dev", hp_dev),
    ("hp_test", hp_test),
):
    print_sum(split_name, split_frame)
hp_train
Neighborhood_Rural 14645
Neighborhood_Suburb 14680
Neighborhood_Urban 14675

hp_dev
Neighborhood_Rural 1689
Neighborhood_Suburb 1708
Neighborhood_Urban 1603

hp_test
Neighborhood_Rural 342
Neighborhood_Suburb 333
Neighborhood_Urban 325