63 KiB
63 KiB
import pandas as pd
import sklearn.model_selection
from datasets import load_dataset
# Load the "wine" configuration of the "mstz/wine" dataset via `datasets.load_dataset`.
dataset = load_dataset("mstz/wine", "wine")
Found cached dataset wine (C:/Users/s487176/.cache/huggingface/datasets/mstz___wine/wine/1.0.0/0913b614badc418a000d75d098776831f39ebf5ee208ecd3cfad4d5db1418d76)
0%| | 0/1 [00:00<?, ?it/s]
# Display the "train" split of the loaded dataset.
dataset["train"]
Dataset({ features: ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality', 'color'], num_rows: 6497 })
# Convert the "train" split into a pandas DataFrame for analysis.
wine_dataset = pd.DataFrame(dataset["train"])
wine_dataset.head()  # data preview
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | color | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 0 |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | 0 |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | 0 |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | 0 |
4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 0 |
# Summary statistics for all columns of the DataFrame.
wine_dataset.describe(include='all')
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | color | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 |
mean | 7.215307 | 0.339666 | 0.318633 | 5.443235 | 0.056034 | 30.525319 | 115.744574 | 0.994697 | 3.218501 | 0.531268 | 10.491801 | 5.818378 | 0.753886 |
std | 1.296434 | 0.164636 | 0.145318 | 4.757804 | 0.035034 | 17.749400 | 56.521855 | 0.002999 | 0.160787 | 0.148806 | 1.192712 | 0.873255 | 0.430779 |
min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 1.000000 | 6.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 | 0.000000 |
25% | 6.400000 | 0.230000 | 0.250000 | 1.800000 | 0.038000 | 17.000000 | 77.000000 | 0.992340 | 3.110000 | 0.430000 | 9.500000 | 5.000000 | 1.000000 |
50% | 7.000000 | 0.290000 | 0.310000 | 3.000000 | 0.047000 | 29.000000 | 118.000000 | 0.994890 | 3.210000 | 0.510000 | 10.300000 | 6.000000 | 1.000000 |
75% | 7.700000 | 0.400000 | 0.390000 | 8.100000 | 0.065000 | 41.000000 | 156.000000 | 0.996990 | 3.320000 | 0.600000 | 11.300000 | 6.000000 | 1.000000 |
max | 15.900000 | 1.580000 | 1.660000 | 65.800000 | 0.611000 | 289.000000 | 440.000000 | 1.038980 | 4.010000 | 2.000000 | 14.900000 | 9.000000 | 1.000000 |
# Bar chart of the number of rows for each value of the "color" column.
wine_dataset["color"].value_counts().plot(kind="bar")
<Axes: >
# Standard deviation of the "fixed_acidity" column.
wine_dataset["fixed_acidity"].std()
1.2964337577998153
import numpy as np
# Check whether any values are missing: returns the (row, column) index
# arrays of every null cell; two empty arrays mean nothing is missing.
null_mask = wine_dataset.isnull()
np.where(null_mask)
(array([], dtype=int64), array([], dtype=int64))
# Normalization: divide every column by its largest absolute value so that all
# values end up in [-1, 1].
# NOTE(review): the scale factors are computed on the full dataset, before the
# train/test split further down, and the "quality" and "color" columns are
# scaled as well -- confirm this is intended.
for column in wine_dataset.columns:
    max_abs = wine_dataset[column].abs().max()
    if max_abs == 0:
        # An all-zero column would turn into NaN (0 / 0); leave it unchanged.
        continue
    wine_dataset[column] = wine_dataset[column] / max_abs
wine_dataset.describe(include='all')  # check the values after normalization
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | color | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 |
mean | 0.453793 | 0.214978 | 0.191948 | 0.082724 | 0.091708 | 0.105624 | 0.263056 | 0.957378 | 0.802619 | 0.265634 | 0.704148 | 0.646486 | 0.753886 |
std | 0.081537 | 0.104200 | 0.087541 | 0.072307 | 0.057338 | 0.061417 | 0.128459 | 0.002886 | 0.040097 | 0.074403 | 0.080048 | 0.097028 | 0.430779 |
min | 0.238994 | 0.050633 | 0.000000 | 0.009119 | 0.014730 | 0.003460 | 0.013636 | 0.950076 | 0.678304 | 0.110000 | 0.536913 | 0.333333 | 0.000000 |
25% | 0.402516 | 0.145570 | 0.150602 | 0.027356 | 0.062193 | 0.058824 | 0.175000 | 0.955110 | 0.775561 | 0.215000 | 0.637584 | 0.555556 | 1.000000 |
50% | 0.440252 | 0.183544 | 0.186747 | 0.045593 | 0.076923 | 0.100346 | 0.268182 | 0.957564 | 0.800499 | 0.255000 | 0.691275 | 0.666667 | 1.000000 |
75% | 0.484277 | 0.253165 | 0.234940 | 0.123100 | 0.106383 | 0.141869 | 0.354545 | 0.959585 | 0.827930 | 0.300000 | 0.758389 | 0.666667 | 1.000000 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
wine_dataset["fixed_acidity"].nlargest(10)  # check whether the highest values make sense
652 1.000000 442 0.981132 557 0.981132 554 0.974843 555 0.974843 243 0.943396 244 0.943396 544 0.899371 3125 0.893082 374 0.880503 Name: fixed_acidity, dtype: float64
from sklearn.model_selection import train_test_split
# Split into train (90%) and test (10%), stratified on the "color" column.
wine_train, wine_test = train_test_split(
    wine_dataset,
    test_size=0.1,
    random_state=1,
    stratify=wine_dataset["color"],
)
wine_train["color"].value_counts()
1.0 4408 0.0 1439 Name: color, dtype: int64
# Row counts per "color" value in the held-out split.
wine_test["color"].value_counts()
1.0 490 0.0 160 Name: color, dtype: int64
# Split the held-out set in half into test and validation, stratified on the
# "color" column.
wine_test, wine_val = sklearn.model_selection.train_test_split(
    wine_test,
    test_size=0.5,
    random_state=1,
    stratify=wine_test["color"],
)
wine_test["color"].value_counts()
1.0 245 0.0 80 Name: color, dtype: int64
# Row counts per "color" value in the validation split.
wine_val["color"].value_counts()
1.0 245 0.0 80 Name: color, dtype: int64
import seaborn as sns
sns.set_theme()  # apply seaborn's default theme to subsequent plots
len(wine_dataset.columns)  # number of columns in the DataFrame
13
# Pairwise plots of the DataFrame's columns, with points colored by "color".
sns.pairplot(data=wine_dataset, hue="color")
# Summary statistics of the test split.
# NOTE(review): the output recorded below is on the original scale (e.g.
# fixed_acidity max 13.0), which does not match the max-abs scaling applied
# earlier -- possibly left over from an out-of-order run; confirm.
wine_test.describe()
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | color | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 |
mean | 7.127077 | 0.342969 | 0.299846 | 5.197538 | 0.054222 | 29.773846 | 113.283077 | 0.994568 | 3.222246 | 0.527754 | 10.488564 | 5.815385 | 0.753846 |
std | 1.181391 | 0.170050 | 0.129556 | 4.608978 | 0.031405 | 15.822670 | 55.072566 | 0.002895 | 0.159630 | 0.144550 | 1.172682 | 0.855128 | 0.431433 |
min | 5.000000 | 0.100000 | 0.000000 | 0.800000 | 0.019000 | 3.000000 | 9.000000 | 0.988190 | 2.860000 | 0.260000 | 8.500000 | 3.000000 | 0.000000 |
25% | 6.400000 | 0.230000 | 0.240000 | 1.800000 | 0.037000 | 17.000000 | 74.000000 | 0.992400 | 3.110000 | 0.420000 | 9.500000 | 5.000000 | 1.000000 |
50% | 6.900000 | 0.280000 | 0.300000 | 2.800000 | 0.048000 | 29.000000 | 115.000000 | 0.994800 | 3.210000 | 0.500000 | 10.300000 | 6.000000 | 1.000000 |
75% | 7.500000 | 0.400000 | 0.370000 | 7.500000 | 0.062000 | 41.000000 | 151.000000 | 0.996750 | 3.320000 | 0.600000 | 11.300000 | 6.000000 | 1.000000 |
max | 13.000000 | 0.900000 | 0.740000 | 22.000000 | 0.415000 | 67.000000 | 253.000000 | 1.002890 | 3.680000 | 1.170000 | 14.000000 | 9.000000 | 1.000000 |
# Summary statistics of the training split.
wine_train.describe()
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | color | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 |
mean | 7.216179 | 0.339796 | 0.319111 | 5.417402 | 0.056310 | 30.535403 | 115.673508 | 0.994682 | 3.218303 | 0.531596 | 10.494455 | 5.820592 | 0.753891 |
std | 1.299695 | 0.164817 | 0.146141 | 4.736399 | 0.035816 | 17.845522 | 56.432512 | 0.002995 | 0.159919 | 0.149728 | 1.189801 | 0.872353 | 0.430780 |
min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 1.000000 | 6.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 | 0.000000 |
25% | 6.400000 | 0.230000 | 0.250000 | 1.800000 | 0.038000 | 17.000000 | 77.500000 | 0.992300 | 3.110000 | 0.430000 | 9.500000 | 5.000000 | 1.000000 |
50% | 7.000000 | 0.290000 | 0.310000 | 3.000000 | 0.047000 | 29.000000 | 118.000000 | 0.994840 | 3.210000 | 0.510000 | 10.300000 | 6.000000 | 1.000000 |
75% | 7.700000 | 0.400000 | 0.390000 | 8.100000 | 0.065000 | 41.000000 | 155.500000 | 0.996985 | 3.320000 | 0.600000 | 11.300000 | 6.000000 | 1.000000 |
max | 15.900000 | 1.580000 | 1.660000 | 65.800000 | 0.611000 | 289.000000 | 440.000000 | 1.038980 | 4.010000 | 2.000000 | 14.900000 | 9.000000 | 1.000000 |
# Summary statistics of the validation split.
wine_val.describe()
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | color | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 |
mean | 7.287846 | 0.334031 | 0.328831 | 6.153692 | 0.052874 | 31.095385 | 119.484615 | 0.995091 | 3.218308 | 0.528892 | 10.447282 | 5.781538 | 0.753846 |
std | 1.345471 | 0.156023 | 0.144192 | 5.220944 | 0.021471 | 17.861741 | 59.481580 | 0.003150 | 0.177176 | 0.136171 | 1.265593 | 0.908617 | 0.431433 |
min | 4.700000 | 0.090000 | 0.000000 | 0.800000 | 0.012000 | 3.000000 | 8.000000 | 0.987460 | 2.870000 | 0.280000 | 8.400000 | 3.000000 | 0.000000 |
25% | 6.400000 | 0.230000 | 0.260000 | 2.000000 | 0.039000 | 16.000000 | 79.000000 | 0.992700 | 3.100000 | 0.430000 | 9.400000 | 5.000000 | 1.000000 |
50% | 7.100000 | 0.290000 | 0.310000 | 4.550000 | 0.048000 | 29.000000 | 125.000000 | 0.995320 | 3.210000 | 0.500000 | 10.200000 | 6.000000 | 1.000000 |
75% | 7.800000 | 0.400000 | 0.400000 | 8.800000 | 0.060000 | 45.000000 | 163.000000 | 0.997450 | 3.320000 | 0.610000 | 11.300000 | 6.000000 | 1.000000 |
max | 15.000000 | 1.180000 | 0.740000 | 31.600000 | 0.170000 | 77.000000 | 251.000000 | 1.010300 | 4.010000 | 1.140000 | 14.000000 | 8.000000 | 1.000000 |