564 KiB
564 KiB
import pandas as pd
# Load the orange-quality dataset from the working directory.
df = pd.read_csv("OrangeQualityData.csv")
# Summarise every column; include="all" adds the non-numeric columns
# (count / unique / top / freq) alongside the numeric statistics.
df.describe(include="all")
Size (cm) | Weight (g) | Brix (Sweetness) | pH (Acidity) | Softness (1-5) | HarvestTime (days) | Ripeness (1-5) | Color | Variety | Blemishes (Y/N) | Quality (1-5) | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 241.000000 | 241.000000 | 241.000000 | 241.000000 | 241.000000 | 241.000000 | 241.000000 | 241 | 241 | 241 | 241.000000 |
unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5 | 24 | 12 | NaN |
top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Deep Orange | Cara Cara | N | NaN |
freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 75 | 21 | 149 | NaN |
mean | 7.844813 | 205.128631 | 10.907884 | 3.473900 | 3.072614 | 15.344398 | 3.599585 | NaN | NaN | NaN | 3.817427 |
std | 1.086002 | 56.461012 | 2.760446 | 0.421007 | 1.323630 | 5.323852 | 1.205214 | NaN | NaN | NaN | 1.014410 |
min | 6.000000 | 100.000000 | 5.500000 | 2.800000 | 1.000000 | 4.000000 | 1.000000 | NaN | NaN | NaN | 1.000000 |
25% | 6.900000 | 155.000000 | 8.500000 | 3.200000 | 2.000000 | 11.000000 | 3.000000 | NaN | NaN | NaN | 3.000000 |
50% | 7.800000 | 205.000000 | 11.000000 | 3.400000 | 3.000000 | 15.000000 | 4.000000 | NaN | NaN | NaN | 4.000000 |
75% | 8.700000 | 252.000000 | 13.400000 | 3.800000 | 4.000000 | 20.000000 | 4.500000 | NaN | NaN | NaN | 4.500000 |
max | 10.000000 | 300.000000 | 16.000000 | 4.400000 | 5.000000 | 25.000000 | 5.000000 | NaN | NaN | NaN | 5.000000 |
# Number of rows per orange variety, most frequent first.
df.loc[:, "Variety"].value_counts()
Variety Cara Cara 21 Temple 18 Star Ruby 18 Navel 16 Moro (Blood) 16 Tangerine 14 Clementine 14 Washington Navel 14 Satsuma Mandarin 13 Ortanique (Hybrid) 13 Minneola (Hybrid) 12 Jaffa 11 Ambiance 11 Valencia 11 California Valencia 7 Honey Tangerine 7 Hamlin 5 Midsweet (Hybrid) 5 Clementine (Seedless) 4 Murcott (Hybrid) 3 Navel (Late Season) 3 Blood Orange 2 Navel (Early Season) 2 Tangelo (Hybrid) 1 Name: count, dtype: int64
# Bar chart of the per-variety row counts.
df["Variety"].value_counts().plot.bar()
<Axes: xlabel='Variety'>
# Mean fruit size for each variety (one-column DataFrame indexed by Variety).
df.groupby("Variety")[["Size (cm)"]].mean()
Size (cm) | |
---|---|
Variety | |
Ambiance | 7.827273 |
Blood Orange | 9.500000 |
California Valencia | 7.885714 |
Cara Cara | 8.419048 |
Clementine | 7.578571 |
Clementine (Seedless) | 6.500000 |
Hamlin | 8.160000 |
Honey Tangerine | 7.742857 |
Jaffa | 7.090909 |
Midsweet (Hybrid) | 8.660000 |
Minneola (Hybrid) | 7.683333 |
Moro (Blood) | 8.475000 |
Murcott (Hybrid) | 7.733333 |
Navel | 7.662500 |
Navel (Early Season) | 7.850000 |
Navel (Late Season) | 8.400000 |
Ortanique (Hybrid) | 7.215385 |
Satsuma Mandarin | 7.038462 |
Star Ruby | 8.444444 |
Tangelo (Hybrid) | 7.200000 |
Tangerine | 7.607143 |
Temple | 7.638889 |
Valencia | 7.927273 |
Washington Navel | 8.207143 |
# Average size per variety, drawn as a bar chart.
mean_size_by_variety = df[["Variety", "Size (cm)"]].groupby("Variety").mean()
mean_size_by_variety.plot(kind="bar")
<Axes: xlabel='Variety'>
import seaborn as sns
# Apply seaborn's default theme to the figures drawn from here on.
sns.set_theme()
# Scatter of size against weight, coloured by variety. The call is made once:
# issuing the identical relplot twice only builds the same figure a second time.
sns.relplot(data=df, x="Size (cm)", y="Weight (g)", hue="Variety")
<seaborn.axisgrid.FacetGrid at 0x2ab15b4f150>
# Drop the "Cara Cara" rows, then redraw the size/weight scatter
# for the remaining varieties.
dfv = df.loc[df["Variety"].ne("Cara Cara")]
sns.relplot(
    data=dfv,
    x="Size (cm)",
    y="Weight (g)",
    hue="Variety",
)
<seaborn.axisgrid.FacetGrid at 0x2ab180d9610>
import sklearn
from sklearn.model_selection import train_test_split
# Hold out exactly 50 rows for testing; random_state pins the shuffle so the
# split is reproducible.
df_train, df_test = sklearn.model_selection.train_test_split(
    df,
    random_state=1,
    test_size=50,
)
# Variety distribution of the training portion.
df_train.loc[:, "Variety"].value_counts()
Variety Star Ruby 16 Moro (Blood) 15 Cara Cara 15 Navel 14 Temple 14 Clementine 11 Washington Navel 11 Tangerine 11 Ortanique (Hybrid) 10 Ambiance 9 Satsuma Mandarin 9 Jaffa 8 Minneola (Hybrid) 8 California Valencia 7 Valencia 7 Honey Tangerine 6 Midsweet (Hybrid) 4 Navel (Late Season) 3 Hamlin 3 Murcott (Hybrid) 3 Clementine (Seedless) 3 Blood Orange 2 Navel (Early Season) 1 Tangelo (Hybrid) 1 Name: count, dtype: int64
# Variety distribution of the held-out portion.
df_test.Variety.value_counts()
Variety Cara Cara 6 Minneola (Hybrid) 4 Satsuma Mandarin 4 Valencia 4 Temple 4 Tangerine 3 Ortanique (Hybrid) 3 Washington Navel 3 Clementine 3 Jaffa 3 Ambiance 2 Star Ruby 2 Hamlin 2 Navel 2 Navel (Early Season) 1 Clementine (Seedless) 1 Honey Tangerine 1 Moro (Blood) 1 Midsweet (Hybrid) 1 Name: count, dtype: int64
import sklearn
from sklearn.model_selection import train_test_split
# Re-split with 10% held out, stratifying on Color so both parts keep
# the same colour proportions.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    stratify=df["Color"],
    random_state=1,
)
# Colour distribution of the training portion.
df_train.loc[:, "Color"].value_counts()
Color Deep Orange 67 Light Orange 58 Orange-Red 49 Orange 34 Yellow-Orange 8 Name: count, dtype: int64
# Colour distribution of the held-out portion.
df_test.Color.value_counts()
Color Deep Orange 8 Light Orange 6 Orange-Red 6 Orange 4 Yellow-Orange 1 Name: count, dtype: int64
# Discard any row that has at least one missing value, then report the
# remaining (rows, columns).
df = df.dropna(axis=0, how="any")
df.shape
(241, 11)
# Per-column count of missing values.
df.isna().sum(axis=0)
Size (cm) 0 Weight (g) 0 Brix (Sweetness) 0 pH (Acidity) 0 Softness (1-5) 0 HarvestTime (days) 0 Ripeness (1-5) 0 Color 0 Variety 0 Blemishes (Y/N) 0 Quality (1-5) 0 dtype: int64
from sklearn.preprocessing import MinMaxScaler
# Pick every numeric column. "number" matches all integer and float widths,
# whereas the 'int' / 'float' aliases depend on platform defaults and skip
# other widths such as float32.
numeric_columns = df.select_dtypes(include="number").columns
# Rescale each numeric column linearly onto [0, 1].
# NOTE(review): the scaler is fitted on the full frame (not df_train only) and
# numeric_columns also contains "Quality (1-5)" — confirm this is intended
# before using df_scaled for modelling.
scaler = MinMaxScaler(feature_range=(0, 1))
# Work on a copy so the unscaled df stays available.
df_scaled = df.copy()
df_scaled[numeric_columns] = scaler.fit_transform(df[numeric_columns])
df_scaled.head()
Size (cm) | Weight (g) | Brix (Sweetness) | pH (Acidity) | Softness (1-5) | HarvestTime (days) | Ripeness (1-5) | Color | Variety | Blemishes (Y/N) | Quality (1-5) | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.375 | 0.40 | 0.619048 | 0.2500 | 0.250 | 0.285714 | 0.750 | Orange | Valencia | N | 0.750 |
1 | 0.550 | 0.60 | 0.476190 | 0.3750 | 0.500 | 0.476190 | 0.875 | Deep Orange | Navel | N | 0.875 |
2 | 0.200 | 0.25 | 0.809524 | 0.1250 | 0.000 | 0.142857 | 1.000 | Light Orange | Cara Cara | N | 1.000 |
3 | 0.750 | 0.75 | 0.285714 | 0.6250 | 0.750 | 0.809524 | 0.625 | Orange-Red | Blood Orange | N | 0.625 |
4 | 0.625 | 0.55 | 0.571429 | 0.3125 | 0.375 | 0.380952 | 1.000 | Orange | Hamlin | Y (Minor) | 0.875 |
# Ten most frequent quality scores.
df["Quality (1-5)"].value_counts().iloc[:10]
Quality (1-5) 4.0 76 5.0 52 4.5 29 3.0 26 3.5 23 2.0 14 2.5 12 1.0 9 Name: count, dtype: int64
# Ten most frequent ripeness scores.
df["Ripeness (1-5)"].value_counts().iloc[:10]
Ripeness (1-5) 5.0 58 4.0 52 3.0 46 2.0 27 4.5 23 1.0 17 3.5 12 2.5 6 Name: count, dtype: int64
# Ten most frequent harvest times (in days).
df["HarvestTime (days)"].value_counts().iloc[:10]
HarvestTime (days) 11 23 15 16 12 16 10 15 22 14 20 14 16 13 17 13 14 13 13 13 Name: count, dtype: int64
# Full frequency table of harvest times, most frequent first.
df["HarvestTime (days)"].value_counts(sort=True, ascending=False)
HarvestTime (days) 11 23 15 16 12 16 10 15 22 14 20 14 16 13 17 13 14 13 13 13 18 12 23 12 21 12 19 12 25 7 24 6 5 6 9 6 7 6 8 5 6 5 4 2 Name: count, dtype: int64