145 KiB
145 KiB
Importowanie bibliotek
import pandas as pd
import opendatasets as od
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import chardet
Pobieranie zbioru danych
od.download('https://www.kaggle.com/datasets/jjayfabor/lettuce-growth-days')
#{"username":"jakubbg","key":"e42b293c818e4ecd7b9365ee037af428"}
Skipping, found downloaded files in ".\lettuce-growth-days" (use force=True to force download)
Czytanie zbioru danych z pliku csv
with open('./lettuce-growth-days/lettuce_dataset_updated.csv', 'rb') as f:
result = chardet.detect(f.read())
dataset = pd.read_csv('./lettuce-growth-days/lettuce_dataset_updated.csv', encoding=result['encoding'])
length = len(dataset)
print(dataset.head())
Plant_ID Date Temperature (�C) Humidity (%) TDS Value (ppm) \ 0 1 8/3/2023 33.4 53 582 1 1 8/4/2023 33.5 53 451 2 1 8/5/2023 33.4 59 678 3 1 8/6/2023 33.4 68 420 4 1 8/7/2023 33.4 74 637 pH Level Growth Days Temperature (F) Humidity 0 6.4 1 92.12 0.53 1 6.1 2 92.30 0.53 2 6.4 3 92.12 0.59 3 6.4 4 92.12 0.68 4 6.5 5 92.12 0.74
Wyświetlenie informacji o zbiorze danych
print(dataset.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3169 entries, 0 to 3168 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Plant_ID 3169 non-null int64 1 Date 3169 non-null object 2 Temperature (�C) 3169 non-null float64 3 Humidity (%) 3169 non-null int64 4 TDS Value (ppm) 3169 non-null int64 5 pH Level 3169 non-null float64 6 Growth Days 3169 non-null int64 7 Temperature (F) 3169 non-null float64 8 Humidity 3169 non-null float64 dtypes: float64(4), int64(4), object(1) memory usage: 222.9+ KB None
Sprawdzenie czy występują puste wiersze
print(dataset.isnull().sum())
Plant_ID 0 Date 0 Temperature (�C) 0 Humidity (%) 0 TDS Value (ppm) 0 pH Level 0 Growth Days 0 Temperature (F) 0 Humidity 0 dtype: int64
Statystyki zbioru
print(dataset.describe())
Plant_ID Temperature (�C) Humidity (%) TDS Value (ppm) \ count 3169.000000 3169.000000 3169.000000 3169.000000 mean 35.441780 28.142222 64.873462 598.045440 std 20.243433 4.670521 8.988985 115.713047 min 1.000000 18.000000 50.000000 400.000000 25% 18.000000 23.600000 57.000000 498.000000 50% 35.000000 30.200000 65.000000 593.000000 75% 53.000000 31.500000 73.000000 699.000000 max 70.000000 33.500000 80.000000 800.000000 pH Level Growth Days Temperature (F) Humidity count 3169.000000 3169.000000 3169.000000 3169.000000 mean 6.399211 23.141054 82.655999 0.648735 std 0.234418 13.077107 8.406938 0.089890 min 6.000000 1.000000 64.400000 0.500000 25% 6.200000 12.000000 74.480000 0.570000 50% 6.400000 23.000000 86.360000 0.650000 75% 6.600000 34.000000 88.700000 0.730000 max 6.800000 48.000000 92.300000 0.800000
Rozkład wartości poszczególnych parametrów
import matplotlib.pyplot as plt
import seaborn as sns
feature_to_plot = ['Humidity (%)','Temperature (�C)','TDS Value (ppm)','pH Level','Growth Days','Temperature (F)','Humidity']
fig, axs = plt.subplots(len(feature_to_plot), figsize=(5, 10))
for i, feature in enumerate(feature_to_plot):
sns.histplot(dataset[feature], ax=axs[i], kde=True)
axs[i].set_title(f'Distribution of {feature}')
axs[i].set_ylabel('Frequency')
plt.tight_layout()
plt.show()
Normalizacja danych liczbowych do zakresu [0,1]
columns = ['Humidity (%)','Temperature (�C)','TDS Value (ppm)','pH Level','Growth Days','Temperature (F)','Humidity']
for col in columns:
dataset[col] = preprocessing.MinMaxScaler().fit_transform(dataset[col].values.reshape(-1, 1))
print(dataset.head())
Plant_ID Date Temperature (�C) Humidity (%) TDS Value (ppm) \ 0 1 8/3/2023 0.993548 0.1 0.4550 1 1 8/4/2023 1.000000 0.1 0.1275 2 1 8/5/2023 0.993548 0.3 0.6950 3 1 8/6/2023 0.993548 0.6 0.0500 4 1 8/7/2023 0.993548 0.8 0.5925 pH Level Growth Days Temperature (F) Humidity 0 0.500 0.000000 0.993548 0.1 1 0.125 0.021277 1.000000 0.1 2 0.500 0.042553 0.993548 0.3 3 0.500 0.063830 0.993548 0.6 4 0.625 0.085106 0.993548 0.8
Podział danych na podzbiory train/dev/test
# 60 / 20 / 20
X_train, X_test = train_test_split(dataset, train_size=0.8, random_state=1)
X_train, X_dev = train_test_split(X_train, test_size=0.25, random_state=1)
Rozmiar pozbiorów
print("Set length: "+str(length))
print("Train subset length: "+str(len(X_train))+" "+str("{:.2f}".format(len(X_train)/length*100))+" %")
print("Dev subset length: "+str(len(X_dev))+" "+str("{:.2f}".format(len(X_dev)/length*100))+" %")
print("Test subset length: "+str(len(X_test))+" "+str("{:.2f}".format(len(X_test)/length*100))+" %")
Set length: 3169 Train subset length: 1901 59.99 % Dev subset length: 634 20.01 % Test subset length: 634 20.01 %