ium_464903/IUM_02_Dane.ipynb

378 lines
145 KiB
Plaintext
Raw Normal View History

2024-03-19 23:22:43 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "9102b9a5-ded2-43f5-8c10-b5ca14e150a1",
"metadata": {},
"source": [
"### Importowanie bibliotek"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e28d63b8-3d68-443f-a478-2047240f1e83",
"metadata": {},
"outputs": [],
"source": [
"import chardet\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import opendatasets as od\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from sklearn import preprocessing\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "markdown",
"id": "87a21ef6-e939-4e70-9c97-27250e75041c",
"metadata": {},
"source": [
"### Pobieranie zbioru danych"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5d30b742-1c8a-4020-a578-de7eff3b532e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping, found downloaded files in \".\\lettuce-growth-days\" (use force=True to force download)\n"
]
}
],
"source": [
"# Kaggle credentials are deliberately NOT stored in this notebook: opendatasets\n",
"# reads them from a local kaggle.json file (keep it out of version control) or\n",
"# prompts for the username and API key interactively.\n",
"od.download('https://www.kaggle.com/datasets/jjayfabor/lettuce-growth-days')"
]
},
{
"cell_type": "markdown",
"id": "dddc9962-111c-4157-8a50-911a43644642",
"metadata": {},
"source": [
"### Czytanie zbioru danych z pliku csv"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "76e4acad-eb3c-467e-8794-ef168eee9764",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Plant_ID Date Temperature (�C) Humidity (%) TDS Value (ppm) \\\n",
"0 1 8/3/2023 33.4 53 582 \n",
"1 1 8/4/2023 33.5 53 451 \n",
"2 1 8/5/2023 33.4 59 678 \n",
"3 1 8/6/2023 33.4 68 420 \n",
"4 1 8/7/2023 33.4 74 637 \n",
"\n",
" pH Level Growth Days Temperature (F) Humidity \n",
"0 6.4 1 92.12 0.53 \n",
"1 6.1 2 92.30 0.53 \n",
"2 6.4 3 92.12 0.59 \n",
"3 6.4 4 92.12 0.68 \n",
"4 6.5 5 92.12 0.74 \n"
]
}
],
"source": [
"# Sniff the file's text encoding from its raw bytes, then hand it to pandas.\n",
"csv_path = './lettuce-growth-days/lettuce_dataset_updated.csv'\n",
"\n",
"with open(csv_path, 'rb') as csv_file:\n",
"    raw_bytes = csv_file.read()\n",
"detected = chardet.detect(raw_bytes)\n",
"\n",
"dataset = pd.read_csv(csv_path, encoding=detected['encoding'])\n",
"length = len(dataset)  # total row count, reused when reporting the split sizes\n",
"\n",
"print(dataset.head())"
]
},
{
"cell_type": "markdown",
"id": "7d2d136d-518f-499b-8e7c-47a69cca30b6",
"metadata": {},
"source": [
"### Wyświetlenie informacji o zbiorze danych"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "012e41dc-ff15-4608-8758-08554f386c83",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 3169 entries, 0 to 3168\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Plant_ID 3169 non-null int64 \n",
" 1 Date 3169 non-null object \n",
" 2 Temperature (�C) 3169 non-null float64\n",
" 3 Humidity (%) 3169 non-null int64 \n",
" 4 TDS Value (ppm) 3169 non-null int64 \n",
" 5 pH Level 3169 non-null float64\n",
" 6 Growth Days 3169 non-null int64 \n",
" 7 Temperature (F) 3169 non-null float64\n",
" 8 Humidity 3169 non-null float64\n",
"dtypes: float64(4), int64(4), object(1)\n",
"memory usage: 222.9+ KB\n",
"None\n"
]
}
],
"source": [
"# DataFrame.info() prints its report itself and returns None, so wrapping it\n",
"# in print() only appended a stray \"None\" line to the output.\n",
"dataset.info()"
]
},
{
"cell_type": "markdown",
"id": "b5c92a1b-9841-4d76-b07b-26cbf4ea9eea",
"metadata": {},
"source": [
"### Sprawdzenie, czy występują brakujące wartości"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "48ec1cba-81c4-4d8b-8905-7870e18ecd24",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Plant_ID 0\n",
"Date 0\n",
"Temperature (�C) 0\n",
"Humidity (%) 0\n",
"TDS Value (ppm) 0\n",
"pH Level 0\n",
"Growth Days 0\n",
"Temperature (F) 0\n",
"Humidity 0\n",
"dtype: int64\n"
]
}
],
"source": [
"# Count the missing (NaN/None) entries in every column.\n",
"missing_per_column = dataset.isna().sum()\n",
"print(missing_per_column)"
]
},
{
"cell_type": "markdown",
"id": "1c10d470-8da4-4fe7-bfce-1f9f25913894",
"metadata": {},
"source": [
"### Statystyki zbioru"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9eaed893-22cb-457e-bc22-5a4ad7b872d6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Plant_ID Temperature (�C) Humidity (%) TDS Value (ppm) \\\n",
"count 3169.000000 3169.000000 3169.000000 3169.000000 \n",
"mean 35.441780 28.142222 64.873462 598.045440 \n",
"std 20.243433 4.670521 8.988985 115.713047 \n",
"min 1.000000 18.000000 50.000000 400.000000 \n",
"25% 18.000000 23.600000 57.000000 498.000000 \n",
"50% 35.000000 30.200000 65.000000 593.000000 \n",
"75% 53.000000 31.500000 73.000000 699.000000 \n",
"max 70.000000 33.500000 80.000000 800.000000 \n",
"\n",
" pH Level Growth Days Temperature (F) Humidity \n",
"count 3169.000000 3169.000000 3169.000000 3169.000000 \n",
"mean 6.399211 23.141054 82.655999 0.648735 \n",
"std 0.234418 13.077107 8.406938 0.089890 \n",
"min 6.000000 1.000000 64.400000 0.500000 \n",
"25% 6.200000 12.000000 74.480000 0.570000 \n",
"50% 6.400000 23.000000 86.360000 0.650000 \n",
"75% 6.600000 34.000000 88.700000 0.730000 \n",
"max 6.800000 48.000000 92.300000 0.800000 \n"
]
}
],
"source": [
"# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.\n",
"summary_stats = dataset.describe()\n",
"print(summary_stats)"
]
},
{
"cell_type": "markdown",
"id": "0485737a-f8bc-4c39-88d9-62ba23b4de72",
"metadata": {},
"source": [
"### Rozkład wartości poszczególnych parametrów"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d89d3e2d-dad9-446f-bd8e-615c5cad3ffc",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAekAAAPdCAYAAACuupAFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1gUVxcH4N9sZekdRKqI2LsxxF6xG0vsCgY1MfaSRE3RaKKJxh5L8iWBWKLGEjX2hr1FY4kVQRSRJlKXsvV+fyAbl13assCC532efZa5c2fmzLKzZ+bOnRmOMcZACCGEEJPDq+wACCGEEKIfJWlCCCHERFGSJoQQQkwUJWlCCCHERFGSJoQQQkwUJWlCCCHERFGSJoQQQkwUJWlCCCHERFGSJoQQQkwUJWlishYsWACO4ypkWR07dkTHjh01w6dPnwbHcdi1a1eFLD84OBje3t4VsixDSaVSjBs3Dq6uruA4DtOnT6/skAxS0s/6yZMn4DgOYWFhmrKK/E4CwB9//AF7e3tIpVKjzO/IkSOwtLTEixcvjDI/Uv4oSZMKERYWBo7jNC8zMzO4ubkhMDAQa9asQWZmplGWExcXhwULFuDmzZtGmZ8xmXJsJbF48WKEhYVh4sSJ2Lx5M0aPHl1oXW9vb/Tp00fvuIreASpvixcvxt69e40+X5VKhfnz52PKlCmwtLTUlP/444/w8fGBvb09Ro8ejYyMDK3p1Go1mjVrhsWLF+vMs0ePHqhduzaWLFli9HhJOWGEVIDQ0FAGgC1cuJBt3ryZ/frrr2zx4sWse/fujOM45uXlxW7duqU1jUKhYDk5OaVazt9//80AsNDQ0FJNJ5PJmEwm0wyHh4czAGznzp2lmo+hscnlcpabm2u0ZZWH1q1bszZt2pSorpeXF+vdu7feceXx2ZZGST/r6Ohonf+Xvu+khYUFCwoKMnKUjP3555+M4zgWGxurKTt37hzjOI5NmzaNrV69mrm6urIJEyZoTbdx40bm4+NT6DquX7+emZubs4yMDKPHTIyPjqRJherZsydGjRqFsWPHYu7cuTh69ChOnDiBpKQk9OvXDzk5OZq6AoEAZmZm5RpPdnY2AEAkEkEkEpXrsooiFAohFosrbfklkZSUBFtb28oOo8zK8llXxHcyX2hoKNq0aYOaNWtqyg4cOICOHTti1apVmDp1KpYsWYL9+/drxqelpeHzzz/H999/X+g6Dho0CDKZDDt37iz3dSBlR0maVLrOnTvjiy++wNOnT7FlyxZNub7zf8ePH0fbtm1ha2sLS0tL+Pv7Y968eQDymlFbtWoFABg7dqymaT3/nGLHjh3RsGFDXL9+He3bt4e5ublm2oLnpPOpVCrMmzcPrq6usLCwQL9+/fDs2TOtOt7e3ggODtaZ9vV5FhebvvOkWVlZmDVrFjw8PCAWi+Hv74/vv/8erMCD6ziOw+TJk7F37140bNgQYrEYDRo0wJEjR/R/4AUkJSUhJCQELi4uMDMzQ5MmTfDbb79pxuc3T0dHR+PgwYOa2J88eVKi+ZdEYeeJ9X0H8td3586dqF+/PiQSCQICAvDvv/8CyGsOrl27NszMzNCxY0edOPUtKy0tDcHBwbCxsYGtrS2CgoKQlpZWbDwcxyErKwu//fab5nMJDg5GeHg4OI7Dn3/+qTOP33//HRzH4dKlS4V+Hrm5uThy5Ai6du2qVZ6TkwM7OzvNsL29vWZHMz++Ro0aYeDAgYXO29nZGY0bN8a+ffsKrUNMh6CyAyAEAEaPHo158+bh2LFjGD9+vN46d+/eRZ8+fdC4cWMsXLgQYrEYkZGRuHDhAgCgXr16WLhwIb788ktMmDAB7dq1AwC88847mnm8fPkSPXv2xLBhwzBq1Ci4uLgUGdc333wDjuPw6aefIikpCatWrULXrl1x8+ZNSCSSEq9fSWJ7HWMM/fr1Q3h4OEJCQtC0aVMcPXoUH3/8MZ4/f46VK1dq1T9//j
z27NmDjz76CFZWVlizZg0GDRqEmJgYODg4FBpXTk4OOnbsiMjISEyePBk+Pj7YuXMngoODkZaWhmnTpqFevXrYvHkzZsyYAXd3d8yaNQsA4OTkVOQ6KxQKJCcn65Snp6cXOV1JnDt3Dvv378ekSZMAAEuWLEGfPn3wySefYP369fjoo4+QmpqKpUuX4v3338epU6cKnRdjDP3798f58+fx4Ycfol69evjzzz8RFBRUbBybN2/GuHHj8NZbb2HChAkAAF9fX7z99tvw8PDA1q1bMWDAAK1ptm7dCl9fXwQEBBQ63+vXr0Mul6N58+Za5a1atcLPP/+MY8eOwcfHB8uXL8dbb70FALh37x42btyIq1evFht3ixYtyuU8OikHldzcTt4Q+eek//7770Lr2NjYsGbNmmmG58+fz17/iq5cuZIBYC9evCh0HkWd9+3QoQMDwDZu3Kh3XIcOHTTD+edNa9asqXXu7o8//mAA2OrVqzVlXl5ees9JFpxnUbEFBQUxLy8vzfDevXsZAPb1119r1Rs8eDDjOI5FRkZqygAwkUikVXbr1i0GgK1du1ZnWa9btWoVA8C2bNmiKZPL5SwgIIBZWlpqrXtR55kL8vLyYgCKfL1+Trrg+ucr+B3IX1+xWMyio6M1ZT/++CMDwFxdXbVinjt3LgOgVbewz3rp0qWaMqVSydq1a6fz/9IXT2HnpOfOncvEYjFLS0vTlCUlJTGBQMDmz5+vU/91P//8MwPA/v33X61ypVLJBg4cqPkMPTw82O3btxljjHXv3p19+OGHRc433+LFixkAlpiYWKL6pPJQczcxGZaWlkX28s4/H7pv3z6o1WqDliEWizF27NgS1x8zZgysrKw0w4MHD0aNGjVw6NAhg5ZfUocOHQKfz8fUqVO1ymfNmgXGGA4fPqxV3rVrV/j6+mqGGzduDGtrazx+/LjY5bi6umL48OGaMqFQiKlTp0IqleLMmTMGr0Pr1q1x/Phxndf3339v8DzzdenSRavJunXr1gDyzre+/v/KLy/qczh06BAEAgEmTpyoKePz+ZgyZUqZYhwzZgxkMplWL/YdO3ZAqVRi1KhRRU778uVLANBq2s6Pa/fu3Xj06BGuXbuGiIgINGrUCPv378fVq1exaNEiPH/+HH379oWbmxv69u2LuLg4nfnnz1dfSwcxLZSkicmQSqVaP7AFDR06FG3atMG4cePg4uKCYcOG4Y8//ihVwq5Zs2apOoj5+flpDXMch9q1axv1fKw+T58+hZubm87nUa9ePc3413l6eurMw87ODqmpqcUux8/PDzye9k9BYcspDUdHR3Tt2lXn1aJFC4Pnma/g+trY2AAAPDw89JYX9Tk8ffoUNWrU0LrMCQD8/f3LFGPdunXRqlUrbN26VVO2detWvP3226hdu3aJ5sEK9D/IV7t2bbRo0QJmZmaQy+WYNWsW5s+fD0dHRwwbNgwSiQR//fUXzMzMMGLEiELnW5HXfBPDUJImJiE2Nhbp6elF/nhJJBKcPXsWJ06cwOjRo3H79m0MHToU3bp1g0qlKtFySnMeuaQK+6EraUzGwOfz9ZYX9iNvakr7GRa2vqb2OYwZMwZnzpxBbGwsoqKicPny5WKPogFo+hEUt5MFACtXroRAIMDkyZPx7NkznD9/HkuXLkWLFi2wdOlSzfJflz9fR0dHA9aKVCRK0sQkbN68GQAQGBhYZD0ej4cuXbpgxYoVuHfvHr755hucOnUK4eHhAIx/ZPDo0SOtYcYYIiMjtZpa7ezs9PYELngUWprYvLy8EBcXp9P8/+DBA814Y/Dy8sKjR490WiOMvZzilPQzLA9eXl6Ij4/XuavXw4cPSzR9Uf/XYcOGgc/nY9u2bdi6dSuEQiGGDh1a7Dzr1q0LAIiOji6yXnx8PL7++mtNos5v2nZzc9N6f/78udZ00dHRcHR0LLbzH6l8lKRJpTt16hQWLVoEHx8fjBw5stB6KSkpOmVNmzYFAMhkMgCAhY
UFAOj9wTfEpk2btBLlrl27EB8fj549e2rKfH19cfnyZcjlck3ZgQMHdC7VKk1svXr1gkqlwg8//KBVvnLlSnAcp7X
"text/plain": [
"<Figure size 500x1000 with 7 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# One histogram (with a KDE overlay) per numeric feature, stacked vertically.\n",
"# matplotlib.pyplot (plt) and seaborn (sns) come from the notebook's import cell.\n",
"feature_to_plot = ['Humidity (%)','Temperature (�C)','TDS Value (ppm)','pH Level','Growth Days','Temperature (F)','Humidity']\n",
"\n",
"fig, axs = plt.subplots(len(feature_to_plot), figsize=(5, 10))\n",
"\n",
"# Pair each feature with its own axes instead of indexing axs by position.\n",
"for ax, feature in zip(axs, feature_to_plot):\n",
"    sns.histplot(dataset[feature], ax=ax, kde=True)\n",
"    ax.set_title(f'Distribution of {feature}')\n",
"    ax.set_ylabel('Frequency')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "72d4c814-6138-4e81-a102-2dbb403bc0e7",
"metadata": {},
"source": [
"### Normalizacja danych liczbowych do zakresu [0,1]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1f23b28b-a5c0-49be-bc61-10a9199deb69",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Plant_ID Date Temperature (�C) Humidity (%) TDS Value (ppm) \\\n",
"0 1 8/3/2023 0.993548 0.1 0.4550 \n",
"1 1 8/4/2023 1.000000 0.1 0.1275 \n",
"2 1 8/5/2023 0.993548 0.3 0.6950 \n",
"3 1 8/6/2023 0.993548 0.6 0.0500 \n",
"4 1 8/7/2023 0.993548 0.8 0.5925 \n",
"\n",
" pH Level Growth Days Temperature (F) Humidity \n",
"0 0.500 0.000000 0.993548 0.1 \n",
"1 0.125 0.021277 1.000000 0.1 \n",
"2 0.500 0.042553 0.993548 0.3 \n",
"3 0.500 0.063830 0.993548 0.6 \n",
"4 0.625 0.085106 0.993548 0.8 \n"
]
}
],
"source": [
"# Rescale every numeric feature to the [0, 1] range. MinMaxScaler works\n",
"# column-wise, so one fit over all the columns matches scaling each separately.\n",
"# NOTE(review): the scaler is fit on the full dataset, before the\n",
"# train/dev/test split - confirm this is intended.\n",
"columns = ['Humidity (%)','Temperature (�C)','TDS Value (ppm)','pH Level','Growth Days','Temperature (F)','Humidity']\n",
"\n",
"scaler = preprocessing.MinMaxScaler()\n",
"dataset[columns] = scaler.fit_transform(dataset[columns])\n",
"\n",
"print(dataset.head())"
]
},
{
"cell_type": "markdown",
"id": "bb40a889-90de-4a44-87ad-cb24bdceea01",
"metadata": {},
"source": [
"### Podział danych na podzbiory train/dev/test"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "bd8daa4c-baaf-4c17-a10a-36b18e891974",
"metadata": {},
"outputs": [],
"source": [
"# Target proportions: 60 % train / 20 % dev / 20 % test.\n",
"split_seed = 1\n",
"\n",
"# First hold out 20 % of all rows as the test set ...\n",
"X_train_dev, X_test = train_test_split(dataset, train_size=0.8, random_state=split_seed)\n",
"# ... then take 25 % of the remaining 80 % (i.e. 20 % of the total) as the dev set.\n",
"X_train, X_dev = train_test_split(X_train_dev, test_size=0.25, random_state=split_seed)"
]
},
{
"cell_type": "markdown",
"id": "c83c0db8-4d3b-4d2c-8b6e-9d2fbfed0ab2",
"metadata": {},
"source": [
"### Rozmiar podzbiorów"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "dca99c75-9b22-400f-9e89-a0a6e9a75573",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Set length: 3169\n",
"Train subset length: 1901 59.99 %\n",
"Dev subset length: 634 20.01 %\n",
"Test subset length: 634 20.01 %\n"
]
}
],
"source": [
"# Report the size of the full set, then each subset with its share of the total.\n",
"print(f\"Set length: {length}\")\n",
"for label, subset in ((\"Train\", X_train), (\"Dev\", X_dev), (\"Test\", X_test)):\n",
"    share = len(subset) / length * 100\n",
"    print(f\"{label} subset length: {len(subset)} {share:.2f} %\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}