39 KiB
39 KiB
!kaggle datasets download -d aleksandrglotov/car-prices-poland
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json' Downloading car-prices-poland.zip to /content 0% 0.00/1.64M [00:00<?, ?B/s] 100% 1.64M/1.64M [00:00<00:00, 120MB/s]
!unzip -o car-prices-poland.zip
Archive: car-prices-poland.zip inflating: Car_Prices_Poland_Kaggle.csv
!wc -l Car_Prices_Poland_Kaggle.csv
117928 Car_Prices_Poland_Kaggle.csv
!head -n 5 Car_Prices_Poland_Kaggle.csv
,mark,model,generation_name,year,mileage,vol_engine,fuel,city,province,price 0,opel,combo,gen-d-2011,2015,139568,1248,Diesel,Janki,Mazowieckie,35900 1,opel,combo,gen-d-2011,2018,31991,1499,Diesel,Katowice,Śląskie,78501 2,opel,combo,gen-d-2011,2015,278437,1598,Diesel,Brzeg,Opolskie,27000 3,opel,combo,gen-d-2011,2016,47600,1248,Diesel,Korfantów,Opolskie,30800
!pip install --user pandas
!pip install --user seaborn
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: pandas in /usr/local/lib/python3.9/dist-packages (1.4.4) Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas) (2022.7.1) Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.9/dist-packages (from pandas) (1.22.4) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas) (1.15.0) Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: seaborn in /usr/local/lib/python3.9/dist-packages (0.12.2) Requirement already satisfied: pandas>=0.25 in /usr/local/lib/python3.9/dist-packages (from seaborn) (1.4.4) Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in /usr/local/lib/python3.9/dist-packages (from seaborn) (3.7.1) Requirement already satisfied: numpy!=1.24.0,>=1.17 in /usr/local/lib/python3.9/dist-packages (from seaborn) (1.22.4) Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (23.0) Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.4) Requirement already satisfied: importlib-resources>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (5.12.0) Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.9/dist-packages (from 
matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.39.0) Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.0.7) Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (8.4.0) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=0.25->seaborn) (2022.7.1) Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.9/dist-packages (from importlib-resources>=3.2.0->matplotlib!=3.6.1,>=3.1->seaborn) (3.15.0) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.15.0)
import pandas as pd
# Load the car listings. The CSV header starts with an empty field, so the
# file's first column is read as an ordinary column named "Unnamed: 0".
cars = pd.read_csv('Car_Prices_Poland_Kaggle.csv')
# Summary statistics for every column, numeric and non-numeric alike.
cars.describe(include='all')
Unnamed: 0 | mark | model | generation_name | year | mileage | vol_engine | fuel | city | province | price | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 117927.000000 | 117927 | 117927 | 87842 | 117927.000000 | 1.179270e+05 | 117927.000000 | 117927 | 117927 | 117927 | 1.179270e+05 |
unique | NaN | 23 | 328 | 364 | NaN | NaN | NaN | 6 | 4427 | 23 | NaN |
top | NaN | audi | astra | gen-8p-2003-2012 | NaN | NaN | NaN | Gasoline | Warszawa | Mazowieckie | NaN |
freq | NaN | 12031 | 3331 | 1567 | NaN | NaN | NaN | 61597 | 7972 | 22219 | NaN |
mean | 58963.000000 | NaN | NaN | NaN | 2012.925259 | 1.409768e+05 | 1812.057782 | NaN | NaN | NaN | 7.029988e+04 |
std | 34042.736935 | NaN | NaN | NaN | 5.690135 | 9.236936e+04 | 643.613438 | NaN | NaN | NaN | 8.482458e+04 |
min | 0.000000 | NaN | NaN | NaN | 1945.000000 | 0.000000e+00 | 0.000000 | NaN | NaN | NaN | 5.000000e+02 |
25% | 29481.500000 | NaN | NaN | NaN | 2009.000000 | 6.700000e+04 | 1461.000000 | NaN | NaN | NaN | 2.100000e+04 |
50% | 58963.000000 | NaN | NaN | NaN | 2013.000000 | 1.462690e+05 | 1796.000000 | NaN | NaN | NaN | 4.190000e+04 |
75% | 88444.500000 | NaN | NaN | NaN | 2018.000000 | 2.030000e+05 | 1995.000000 | NaN | NaN | NaN | 8.360000e+04 |
max | 117926.000000 | NaN | NaN | NaN | 2022.000000 | 2.800000e+06 | 7600.000000 | NaN | NaN | NaN | 2.399900e+06 |
cars = cars.drop(73436) # row with erroneous data (dropped by index label)
def normalize(df, feature_name):
    """Return a copy of ``df`` with one column min-max scaled to [0, 1].

    The column is transformed as ``(x - min) / (max - min)`` using the
    minimum and maximum of that column in ``df``. All other columns are
    copied unchanged and ``df`` itself is not modified.

    Args:
        df: Source ``pandas.DataFrame``.
        feature_name: Label of the numeric column to scale.

    Returns:
        A new DataFrame in which ``feature_name`` holds the scaled values.
        If every value in the column is identical (zero range), the column
        is filled with ``0.0`` instead of the ``NaN`` that ``0 / 0`` would
        otherwise produce.

    Raises:
        KeyError: If ``feature_name`` is not a column of ``df``.
    """
    result = df.copy()
    column = df[feature_name]
    min_value = column.min()
    value_range = column.max() - min_value
    if value_range == 0:
        # Constant column: dividing by the zero range would turn every
        # entry into NaN, so map the whole column to 0.0 instead.
        result[feature_name] = (column - min_value).astype(float)
    else:
        result[feature_name] = (column - min_value) / value_range
    return result
# Min-max scale the 'vol_engine' column; all other columns are kept as-is.
# NOTE(review): the min/max used here come from the full dataset, i.e. before
# the train/dev/test split performed later in this file, so the held-out rows
# influence the scaling - confirm this is acceptable.
cars_normalized = normalize(cars,'vol_engine')
print(cars_normalized)
Unnamed: 0 mark model generation_name year mileage \ 0 0 opel combo gen-d-2011 2015 139568 1 1 opel combo gen-d-2011 2018 31991 2 2 opel combo gen-d-2011 2015 278437 3 3 opel combo gen-d-2011 2016 47600 4 4 opel combo gen-d-2011 2014 103000 ... ... ... ... ... ... ... 117922 117922 volvo xc-90 gen-ii-2014-xc-90 2020 40000 117923 117923 volvo xc-90 gen-ii-2014-xc-90 2017 51000 117924 117924 volvo xc-90 gen-ii-2014-xc-90 2016 83500 117925 117925 volvo xc-90 gen-ii-2014-xc-90 2017 174000 117926 117926 volvo xc-90 gen-ii-2014-xc-90 2016 189020 vol_engine fuel city province price 0 0.164211 Diesel Janki Mazowieckie 35900 1 0.197237 Diesel Katowice Śląskie 78501 2 0.210263 Diesel Brzeg Opolskie 27000 3 0.164211 Diesel Korfantów Opolskie 30800 4 0.184211 CNG Tarnowskie Góry Śląskie 35900 ... ... ... ... ... ... 117922 0.259079 Hybrid Katowice Śląskie 222790 117923 0.259079 Diesel Chechło Pierwsze Łódzkie 229900 117924 0.259079 Gasoline Pruszcz Gdański Pomorskie 135000 117925 0.259079 Diesel Kalisz Wielkopolskie 154500 117926 0.259079 Gasoline Sionna Mazowieckie 130000 [117926 rows x 11 columns]
import sklearn
import sklearn.model_selection
# Hold out 23586 rows from the normalized data; the rest becomes the training
# set. random_state is fixed so the same split is produced on every run.
cars_train, cars_test = sklearn.model_selection.train_test_split(cars_normalized, test_size=23586, random_state=1)
# Number of training rows per province.
cars_train["province"].value_counts()
Mazowieckie 17750 Śląskie 13441 Wielkopolskie 11162 Małopolskie 7796 Dolnośląskie 7092 Łódzkie 6303 Pomorskie 6094 Kujawsko-pomorskie 4256 Lubelskie 3775 Zachodniopomorskie 3165 Podkarpackie 2826 Świętokrzyskie 2657 Warmińsko-mazurskie 2375 Lubuskie 2220 Podlaskie 1716 Opolskie 1679 Moravian-Silesian Region 27 Wiedeń 2 Berlin 2 Trenczyn 1 Niedersachsen 1 Name: province, dtype: int64
# Split the held-out rows again: 11793 of them become the new cars_test and
# the remainder becomes cars_dev. Note that cars_test is rebound here.
cars_dev, cars_test = sklearn.model_selection.train_test_split(cars_test, test_size=11793, random_state=1)
# Number of dev rows per province.
cars_dev["province"].value_counts()
Mazowieckie 2261 Śląskie 1666 Wielkopolskie 1418 Małopolskie 948 Dolnośląskie 867 Łódzkie 775 Pomorskie 766 Kujawsko-pomorskie 532 Lubelskie 504 Zachodniopomorskie 396 Podkarpackie 365 Świętokrzyskie 353 Warmińsko-mazurskie 282 Lubuskie 263 Opolskie 199 Podlaskie 192 Moravian-Silesian Region 4 Nordrhein-Westfalen 1 Berlin 1 Name: province, dtype: int64
# Number of test rows per province.
cars_test["province"].value_counts()
Mazowieckie 2208 Śląskie 1599 Wielkopolskie 1436 Małopolskie 1012 Dolnośląskie 879 Łódzkie 806 Pomorskie 745 Kujawsko-pomorskie 583 Lubelskie 461 Zachodniopomorskie 402 Podkarpackie 362 Świętokrzyskie 327 Warmińsko-mazurskie 299 Lubuskie 260 Podlaskie 215 Opolskie 195 Moravian-Silesian Region 4 Name: province, dtype: int64
# Number of values (rows x columns) in the full dataset and in each split
for subset in (cars_normalized, cars_train, cars_dev, cars_test):
    print(subset.size)
1297186 1037740 129723 129723
# Mean price in the full dataset and in each split
for subset in (cars_normalized, cars_train, cars_dev, cars_test):
    print(subset['price'].mean())
70299.94754337466 70432.62519609921 69244.09963537692 70294.41923174764
# Lowest vehicle price in the full dataset and in each split
for subset in (cars_normalized, cars_train, cars_dev, cars_test):
    print(subset['price'].min())
500 500 1250 900
# Highest vehicle price in the full dataset and in each split
for subset in (cars_normalized, cars_train, cars_dev, cars_test):
    print(subset['price'].max())
2399900 2399900 1368341 1000000
# Standard deviation of price in the full dataset and in each split
for subset in (cars_normalized, cars_train, cars_dev, cars_test):
    print(subset['price'].std())
84824.93470827927 85120.16823252657 82128.74927832028 85111.52408658911
# Median vehicle price in the full dataset and in each split
for subset in (cars_normalized, cars_train, cars_dev, cars_test):
    print(subset['price'].median())
41900.0 41900.0 41901.0 40900.0
# Breakdown by region: number of rows per province in the full dataset
cars_normalized["province"].value_counts()
Mazowieckie 22219 Śląskie 16706 Wielkopolskie 14016 Małopolskie 9756 Dolnośląskie 8838 Łódzkie 7884 Pomorskie 7605 Kujawsko-pomorskie 5371 Lubelskie 4740 Zachodniopomorskie 3963 Podkarpackie 3553 Świętokrzyskie 3337 Warmińsko-mazurskie 2956 Lubuskie 2743 Podlaskie 2123 Opolskie 2073 Moravian-Silesian Region 35 Berlin 3 Wiedeń 2 Niedersachsen 1 Trenczyn 1 Nordrhein-Westfalen 1 Name: province, dtype: int64
# Breakdown by make: number of rows per value of the 'mark' column
cars_normalized["mark"].value_counts()
audi 12031 opel 11914 bmw 11070 volkswagen 10848 ford 9664 mercedes-benz 7136 renault 6976 skoda 5888 toyota 5119 peugeot 5056 volvo 4384 hyundai 4032 kia 3744 nissan 3072 fiat 2880 mazda 2848 seat 2848 citroen 2720 honda 2176 mitsubishi 1120 mini 1088 alfa-romeo 704 chevrolet 608 Name: mark, dtype: int64