ium_464906/ium_01.ipynb
zgolebiewska 440d4a98be ium_1
2024-03-25 13:37:19 +01:00

564 KiB

import pandas as pd
# Load the orange-quality dataset and summarise every column; include="all"
# adds the non-numeric columns to the summary alongside the numeric ones.
df = pd.read_csv("OrangeQualityData.csv")
summary = df.describe(include="all")
summary
Size (cm) Weight (g) Brix (Sweetness) pH (Acidity) Softness (1-5) HarvestTime (days) Ripeness (1-5) Color Variety Blemishes (Y/N) Quality (1-5)
count 241.000000 241.000000 241.000000 241.000000 241.000000 241.000000 241.000000 241 241 241 241.000000
unique NaN NaN NaN NaN NaN NaN NaN 5 24 12 NaN
top NaN NaN NaN NaN NaN NaN NaN Deep Orange Cara Cara N NaN
freq NaN NaN NaN NaN NaN NaN NaN 75 21 149 NaN
mean 7.844813 205.128631 10.907884 3.473900 3.072614 15.344398 3.599585 NaN NaN NaN 3.817427
std 1.086002 56.461012 2.760446 0.421007 1.323630 5.323852 1.205214 NaN NaN NaN 1.014410
min 6.000000 100.000000 5.500000 2.800000 1.000000 4.000000 1.000000 NaN NaN NaN 1.000000
25% 6.900000 155.000000 8.500000 3.200000 2.000000 11.000000 3.000000 NaN NaN NaN 3.000000
50% 7.800000 205.000000 11.000000 3.400000 3.000000 15.000000 4.000000 NaN NaN NaN 4.000000
75% 8.700000 252.000000 13.400000 3.800000 4.000000 20.000000 4.500000 NaN NaN NaN 4.500000
max 10.000000 300.000000 16.000000 4.400000 5.000000 25.000000 5.000000 NaN NaN NaN 5.000000
# Count how many rows belong to each variety.
variety_counts = df["Variety"].value_counts()
variety_counts
Variety
Cara Cara                21
Temple                   18
Star Ruby                18
Navel                    16
Moro (Blood)             16
Tangerine                14
Clementine               14
Washington Navel         14
Satsuma Mandarin         13
Ortanique (Hybrid)       13
Minneola (Hybrid)        12
Jaffa                    11
Ambiance                 11
Valencia                 11
California Valencia       7
Honey Tangerine           7
Hamlin                    5
Midsweet (Hybrid)         5
Clementine (Seedless)     4
Murcott (Hybrid)          3
Navel (Late Season)       3
Blood Orange              2
Navel (Early Season)      2
Tangelo (Hybrid)          1
Name: count, dtype: int64
# Bar chart of the per-variety row counts.
df["Variety"].value_counts().plot.bar()
<Axes: xlabel='Variety'>
# Mean fruit size for each variety.
df.groupby("Variety")[["Size (cm)"]].mean()
Size (cm)
Variety
Ambiance 7.827273
Blood Orange 9.500000
California Valencia 7.885714
Cara Cara 8.419048
Clementine 7.578571
Clementine (Seedless) 6.500000
Hamlin 8.160000
Honey Tangerine 7.742857
Jaffa 7.090909
Midsweet (Hybrid) 8.660000
Minneola (Hybrid) 7.683333
Moro (Blood) 8.475000
Murcott (Hybrid) 7.733333
Navel 7.662500
Navel (Early Season) 7.850000
Navel (Late Season) 8.400000
Ortanique (Hybrid) 7.215385
Satsuma Mandarin 7.038462
Star Ruby 8.444444
Tangelo (Hybrid) 7.200000
Tangerine 7.607143
Temple 7.638889
Valencia 7.927273
Washington Navel 8.207143
# Bar chart of the mean fruit size for each variety.
mean_size_by_variety = df.groupby("Variety")[["Size (cm)"]].mean()
mean_size_by_variety.plot.bar()
<Axes: xlabel='Variety'>
import seaborn as sns
# Apply seaborn's default theme to the plots that follow.
sns.set_theme()
# Size vs. weight scatter, coloured by variety.  The identical call was
# issued twice in a row, which only drew the same figure a second time;
# one call is enough.
sns.relplot(data=df, x="Size (cm)", y="Weight (g)", hue="Variety")
<seaborn.axisgrid.FacetGrid at 0x2ab15b4f150>
# Re-draw the size/weight scatter with the "Cara Cara" rows left out.
is_not_cara_cara = df["Variety"] != "Cara Cara"
dfv = df[is_not_cara_cara]
sns.relplot(data=dfv, x="Size (cm)", y="Weight (g)", hue="Variety")
<seaborn.axisgrid.FacetGrid at 0x2ab180d9610>
import sklearn
from sklearn.model_selection import train_test_split
# Hold out 50 rows for testing (an integer test_size is an absolute row
# count); random_state pins the shuffle so the split is reproducible.
# The function is called through the name imported just above instead of the
# `sklearn.model_selection` attribute path, consistent with the stratified
# split performed later in the notebook.
df_train, df_test = train_test_split(df, test_size=50, random_state=1)
# Variety distribution of the training part.
df_train["Variety"].value_counts()
Variety
Star Ruby                16
Moro (Blood)             15
Cara Cara                15
Navel                    14
Temple                   14
Clementine               11
Washington Navel         11
Tangerine                11
Ortanique (Hybrid)       10
Ambiance                  9
Satsuma Mandarin          9
Jaffa                     8
Minneola (Hybrid)         8
California Valencia       7
Valencia                  7
Honey Tangerine           6
Midsweet (Hybrid)         4
Navel (Late Season)       3
Hamlin                    3
Murcott (Hybrid)          3
Clementine (Seedless)     3
Blood Orange              2
Navel (Early Season)      1
Tangelo (Hybrid)          1
Name: count, dtype: int64
# Variety distribution of the held-out part.
df_test.loc[:, "Variety"].value_counts()
Variety
Cara Cara                6
Minneola (Hybrid)        4
Satsuma Mandarin         4
Valencia                 4
Temple                   4
Tangerine                3
Ortanique (Hybrid)       3
Washington Navel         3
Clementine               3
Jaffa                    3
Ambiance                 2
Star Ruby                2
Hamlin                   2
Navel                    2
Navel (Early Season)     1
Clementine (Seedless)    1
Honey Tangerine          1
Moro (Blood)             1
Midsweet (Hybrid)        1
Name: count, dtype: int64
import sklearn
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows, stratified on "Color" so that both parts keep
# roughly the same colour proportions; random_state makes it reproducible.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=1,
    stratify=df["Color"],
)
# Colour distribution of the training part.
train_color_counts = df_train["Color"].value_counts()
train_color_counts
Color
Deep Orange      67
Light Orange     58
Orange-Red       49
Orange           34
Yellow-Orange     8
Name: count, dtype: int64
# Colour distribution of the held-out part.
df_test.loc[:, "Color"].value_counts()
Color
Deep Orange      8
Light Orange     6
Orange-Red       6
Orange           4
Yellow-Orange    1
Name: count, dtype: int64
# Drop every row that has at least one missing value, then report the
# remaining (rows, columns).
# NOTE(review): df_train / df_test were split off before this point and are
# not affected by this clean-up — confirm that is intended.
df = df.dropna(axis=0, how="any")
df.shape
(241, 11)
# Count the missing values left in each column.
df.isnull().sum()
Size (cm)             0
Weight (g)            0
Brix (Sweetness)      0
pH (Acidity)          0
Softness (1-5)        0
HarvestTime (days)    0
Ripeness (1-5)        0
Color                 0
Variety               0
Blemishes (Y/N)       0
Quality (1-5)         0
dtype: int64
from sklearn.preprocessing import MinMaxScaler

# Select every numeric column; "number" covers all integer and float widths,
# not only the ones matched by the 'int' / 'float' aliases.
numeric_columns = df.select_dtypes(include="number").columns

# NOTE(review): the scaler is fitted on the whole frame, which includes the
# "Quality (1-5)" column and rows that also end up in df_test.  If df_scaled
# feeds a model, fit on the training rows only and reuse the fitted scaler
# for the test rows — confirm the intended use.
scaler = MinMaxScaler(feature_range=(0, 1))

# Scale a copy so the raw values in df stay available.
df_scaled = df.copy()
df_scaled[numeric_columns] = scaler.fit_transform(df[numeric_columns])
df_scaled.head()
Size (cm) Weight (g) Brix (Sweetness) pH (Acidity) Softness (1-5) HarvestTime (days) Ripeness (1-5) Color Variety Blemishes (Y/N) Quality (1-5)
0 0.375 0.40 0.619048 0.2500 0.250 0.285714 0.750 Orange Valencia N 0.750
1 0.550 0.60 0.476190 0.3750 0.500 0.476190 0.875 Deep Orange Navel N 0.875
2 0.200 0.25 0.809524 0.1250 0.000 0.142857 1.000 Light Orange Cara Cara N 1.000
3 0.750 0.75 0.285714 0.6250 0.750 0.809524 0.625 Orange-Red Blood Orange N 0.625
4 0.625 0.55 0.571429 0.3125 0.375 0.380952 1.000 Orange Hamlin Y (Minor) 0.875
# Up to ten most common quality scores with their row counts.
quality_counts = df["Quality (1-5)"].value_counts()
quality_counts.head(10)
Quality (1-5)
4.0    76
5.0    52
4.5    29
3.0    26
3.5    23
2.0    14
2.5    12
1.0     9
Name: count, dtype: int64
# Up to ten most common ripeness scores with their row counts.
df["Ripeness (1-5)"].value_counts().iloc[:10]
Ripeness (1-5)
5.0    58
4.0    52
3.0    46
2.0    27
4.5    23
1.0    17
3.5    12
2.5     6
Name: count, dtype: int64
# Ten most common harvest times (in days) with their row counts.
harvest_time_counts = df["HarvestTime (days)"].value_counts()
harvest_time_counts.head(10)
HarvestTime (days)
11    23
15    16
12    16
10    15
22    14
20    14
16    13
17    13
14    13
13    13
Name: count, dtype: int64
# Row counts for every distinct harvest time (in days).
df.loc[:, "HarvestTime (days)"].value_counts()
HarvestTime (days)
11    23
15    16
12    16
10    15
22    14
20    14
16    13
17    13
14    13
13    13
18    12
23    12
21    12
19    12
25     7
24     6
5      6
9      6
7      6
8      5
6      5
4      2
Name: count, dtype: int64