ium_452662/lab02.ipynb
2023-03-20 13:35:46 +01:00

39 KiB

!kaggle datasets download -d aleksandrglotov/car-prices-poland
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'
Downloading car-prices-poland.zip to /content
  0% 0.00/1.64M [00:00<?, ?B/s]
100% 1.64M/1.64M [00:00<00:00, 120MB/s]
!unzip -o car-prices-poland.zip
Archive:  car-prices-poland.zip
  inflating: Car_Prices_Poland_Kaggle.csv  
!wc -l Car_Prices_Poland_Kaggle.csv
117928 Car_Prices_Poland_Kaggle.csv
!head -n 5 Car_Prices_Poland_Kaggle.csv
,mark,model,generation_name,year,mileage,vol_engine,fuel,city,province,price
0,opel,combo,gen-d-2011,2015,139568,1248,Diesel,Janki,Mazowieckie,35900
1,opel,combo,gen-d-2011,2018,31991,1499,Diesel,Katowice,Śląskie,78501
2,opel,combo,gen-d-2011,2015,278437,1598,Diesel,Brzeg,Opolskie,27000
3,opel,combo,gen-d-2011,2016,47600,1248,Diesel,Korfantów,Opolskie,30800
!pip install --user pandas
!pip install --user seaborn
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: pandas in /usr/local/lib/python3.9/dist-packages (1.4.4)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas) (2022.7.1)
Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.9/dist-packages (from pandas) (1.22.4)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas) (1.15.0)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: seaborn in /usr/local/lib/python3.9/dist-packages (0.12.2)
Requirement already satisfied: pandas>=0.25 in /usr/local/lib/python3.9/dist-packages (from seaborn) (1.4.4)
Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in /usr/local/lib/python3.9/dist-packages (from seaborn) (3.7.1)
Requirement already satisfied: numpy!=1.24.0,>=1.17 in /usr/local/lib/python3.9/dist-packages (from seaborn) (1.22.4)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (23.0)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.4)
Requirement already satisfied: importlib-resources>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (5.12.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.39.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.0.7)
Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (8.4.0)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=0.25->seaborn) (2022.7.1)
Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.9/dist-packages (from importlib-resources>=3.2.0->matplotlib!=3.6.1,>=3.1->seaborn) (3.15.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.15.0)
import pandas as pd
# Load the unzipped Kaggle CSV into a DataFrame.
cars = pd.read_csv('Car_Prices_Poland_Kaggle.csv')
# Summary statistics for every column, numeric and non-numeric alike.
cars.describe(include='all')
Unnamed: 0 mark model generation_name year mileage vol_engine fuel city province price
count 117927.000000 117927 117927 87842 117927.000000 1.179270e+05 117927.000000 117927 117927 117927 1.179270e+05
unique NaN 23 328 364 NaN NaN NaN 6 4427 23 NaN
top NaN audi astra gen-8p-2003-2012 NaN NaN NaN Gasoline Warszawa Mazowieckie NaN
freq NaN 12031 3331 1567 NaN NaN NaN 61597 7972 22219 NaN
mean 58963.000000 NaN NaN NaN 2012.925259 1.409768e+05 1812.057782 NaN NaN NaN 7.029988e+04
std 34042.736935 NaN NaN NaN 5.690135 9.236936e+04 643.613438 NaN NaN NaN 8.482458e+04
min 0.000000 NaN NaN NaN 1945.000000 0.000000e+00 0.000000 NaN NaN NaN 5.000000e+02
25% 29481.500000 NaN NaN NaN 2009.000000 6.700000e+04 1461.000000 NaN NaN NaN 2.100000e+04
50% 58963.000000 NaN NaN NaN 2013.000000 1.462690e+05 1796.000000 NaN NaN NaN 4.190000e+04
75% 88444.500000 NaN NaN NaN 2018.000000 2.030000e+05 1995.000000 NaN NaN NaN 8.360000e+04
max 117926.000000 NaN NaN NaN 2022.000000 2.800000e+06 7600.000000 NaN NaN NaN 2.399900e+06
cars = cars.drop(73436)  # row with erroneous data
def normalize(df, feature_name):
    """Return a copy of ``df`` with ``feature_name`` min-max scaled.

    Each value ``x`` in the selected column is mapped to
    ``(x - min) / (max - min)``, so the smallest value becomes 0.0 and the
    largest becomes 1.0.  ``df`` itself is not modified.

    Args:
        df: pandas DataFrame holding the column to rescale.
        feature_name: Label of the numeric column to rescale.

    Returns:
        A new DataFrame equal to ``df`` except for the rescaled column.
        A column whose values are all equal has a zero range; its entries
        are set to 0.0 instead of the NaN that 0 / 0 would produce.
        Missing values in the column stay missing.
    """
    result = df.copy()
    column = df[feature_name]
    min_value = column.min()
    value_range = column.max() - min_value
    scaled = (column - min_value) / value_range
    # With a zero range the entries equal to the minimum come out as
    # 0 / 0 = NaN; pin those to 0.0.  Genuinely missing values never compare
    # equal to the minimum, so they are left as NaN.
    degenerate = (column == min_value) & scaled.isna()
    result[feature_name] = scaled.mask(degenerate, 0.0)
    return result
# Rescale the 'vol_engine' column; all other columns are left as they are.
cars_normalized = normalize(cars,'vol_engine')
print(cars_normalized)
        Unnamed: 0   mark  model    generation_name  year  mileage  \
0                0   opel  combo         gen-d-2011  2015   139568   
1                1   opel  combo         gen-d-2011  2018    31991   
2                2   opel  combo         gen-d-2011  2015   278437   
3                3   opel  combo         gen-d-2011  2016    47600   
4                4   opel  combo         gen-d-2011  2014   103000   
...            ...    ...    ...                ...   ...      ...   
117922      117922  volvo  xc-90  gen-ii-2014-xc-90  2020    40000   
117923      117923  volvo  xc-90  gen-ii-2014-xc-90  2017    51000   
117924      117924  volvo  xc-90  gen-ii-2014-xc-90  2016    83500   
117925      117925  volvo  xc-90  gen-ii-2014-xc-90  2017   174000   
117926      117926  volvo  xc-90  gen-ii-2014-xc-90  2016   189020   

        vol_engine      fuel              city       province   price  
0         0.164211    Diesel             Janki    Mazowieckie   35900  
1         0.197237    Diesel          Katowice        Śląskie   78501  
2         0.210263    Diesel             Brzeg       Opolskie   27000  
3         0.164211    Diesel         Korfantów       Opolskie   30800  
4         0.184211       CNG   Tarnowskie Góry        Śląskie   35900  
...            ...       ...               ...            ...     ...  
117922    0.259079    Hybrid          Katowice        Śląskie  222790  
117923    0.259079    Diesel  Chechło Pierwsze        Łódzkie  229900  
117924    0.259079  Gasoline   Pruszcz Gdański      Pomorskie  135000  
117925    0.259079    Diesel            Kalisz  Wielkopolskie  154500  
117926    0.259079  Gasoline            Sionna    Mazowieckie  130000  

[117926 rows x 11 columns]
import sklearn
import sklearn.model_selection
# Hold out 23586 rows from the normalized data; the remaining rows form the
# training set. random_state is fixed so the split is reproducible.
cars_train, cars_test = sklearn.model_selection.train_test_split(cars_normalized, test_size=23586, random_state=1)
# Number of training rows per province.
cars_train["province"].value_counts()
Mazowieckie                 17750
Śląskie                     13441
Wielkopolskie               11162
Małopolskie                  7796
Dolnośląskie                 7092
Łódzkie                      6303
Pomorskie                    6094
Kujawsko-pomorskie           4256
Lubelskie                    3775
Zachodniopomorskie           3165
Podkarpackie                 2826
Świętokrzyskie               2657
Warmińsko-mazurskie          2375
Lubuskie                     2220
Podlaskie                    1716
Opolskie                     1679
Moravian-Silesian Region       27
Wiedeń                          2
Berlin                          2
Trenczyn                        1
Niedersachsen                   1
Name: province, dtype: int64
# Split the held-out rows again: 11793 rows become the final test set and
# the rest become the dev set. Note that cars_test is rebound here.
cars_dev, cars_test = sklearn.model_selection.train_test_split(cars_test, test_size=11793, random_state=1)
# Number of dev rows per province.
cars_dev["province"].value_counts()
Mazowieckie                 2261
Śląskie                     1666
Wielkopolskie               1418
Małopolskie                  948
Dolnośląskie                 867
Łódzkie                      775
Pomorskie                    766
Kujawsko-pomorskie           532
Lubelskie                    504
Zachodniopomorskie           396
Podkarpackie                 365
Świętokrzyskie               353
Warmińsko-mazurskie          282
Lubuskie                     263
Opolskie                     199
Podlaskie                    192
Moravian-Silesian Region       4
Nordrhein-Westfalen            1
Berlin                         1
Name: province, dtype: int64

# Number of test rows per province.
cars_test["province"].value_counts()
Mazowieckie                 2208
Śląskie                     1599
Wielkopolskie               1436
Małopolskie                 1012
Dolnośląskie                 879
Łódzkie                      806
Pomorskie                    745
Kujawsko-pomorskie           583
Lubelskie                    461
Zachodniopomorskie           402
Podkarpackie                 362
Świętokrzyskie               327
Warmińsko-mazurskie          299
Lubuskie                     260
Podlaskie                    215
Opolskie                     195
Moravian-Silesian Region       4
Name: province, dtype: int64
# Number of values in each set (DataFrame.size counts elements, i.e.
# rows x columns, not rows).
print(cars_normalized.size)
print(cars_train.size)
print(cars_dev.size)
print(cars_test.size)
1297186
1037740
129723
129723
# Mean price in the full data set and in the train / dev / test splits.
print(cars_normalized['price'].mean())
print(cars_train['price'].mean())
print(cars_dev['price'].mean())
print(cars_test['price'].mean())
70299.94754337466
70432.62519609921
69244.09963537692
70294.41923174764
# Lowest vehicle price in the full data set and in each split.
print(cars_normalized['price'].min())
print(cars_train['price'].min())
print(cars_dev['price'].min())
print(cars_test['price'].min())
500
500
1250
900
# Highest vehicle price in the full data set and in each split.
print(cars_normalized['price'].max())
print(cars_train['price'].max())
print(cars_dev['price'].max())
print(cars_test['price'].max())
2399900
2399900
1368341
1000000
# Standard deviation of price in the full data set and in each split.
print(cars_normalized['price'].std())
print(cars_train['price'].std())
print(cars_dev['price'].std())
print(cars_test['price'].std())
84824.93470827927
85120.16823252657
82128.74927832028
85111.52408658911
# Median vehicle price in the full data set and in each split.
print(cars_normalized['price'].median())
print(cars_train['price'].median())
print(cars_dev['price'].median())
print(cars_test['price'].median())
41900.0
41900.0
41901.0
40900.0
# Breakdown by region: number of rows per province in the full data set.
cars_normalized["province"].value_counts()
Mazowieckie                 22219
Śląskie                     16706
Wielkopolskie               14016
Małopolskie                  9756
Dolnośląskie                 8838
Łódzkie                      7884
Pomorskie                    7605
Kujawsko-pomorskie           5371
Lubelskie                    4740
Zachodniopomorskie           3963
Podkarpackie                 3553
Świętokrzyskie               3337
Warmińsko-mazurskie          2956
Lubuskie                     2743
Podlaskie                    2123
Opolskie                     2073
Moravian-Silesian Region       35
Berlin                          3
Wiedeń                          2
Niedersachsen                   1
Trenczyn                        1
Nordrhein-Westfalen             1
Name: province, dtype: int64
# Breakdown by make: number of rows per 'mark' value in the full data set.
cars_normalized["mark"].value_counts()
audi             12031
opel             11914
bmw              11070
volkswagen       10848
ford              9664
mercedes-benz     7136
renault           6976
skoda             5888
toyota            5119
peugeot           5056
volvo             4384
hyundai           4032
kia               3744
nissan            3072
fiat              2880
mazda             2848
seat              2848
citroen           2720
honda             2176
mitsubishi        1120
mini              1088
alfa-romeo         704
chevrolet          608
Name: mark, dtype: int64