{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "IUM_1_434788.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "shaFKPEixPn4" }, "source": [ "# 1. Pobranie zbioru danych z Repozytorium" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-03GDjWtxD7W", "outputId": "3cefd33d-3ef4-4c16-963e-ffa6e9e781de" }, "source": [ "!curl -OL https://git.wmi.amu.edu.pl/s434788/ium_434788/raw/branch/master/winequality-red.csv" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 98k 0 98k 0 0 74502 0 --:--:-- 0:00:01 --:--:-- 74502\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "sAUNi0ylxWUm", "outputId": "fe879388-072d-4845-f3b5-f06a4fca5f1e" }, "source": [ "import pandas as pd\n", "wine=pd.read_csv('winequality-red.csv')\n", "wine" ], "execution_count": 2, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
07.40.7000.001.90.07611.034.00.997803.510.569.45
17.80.8800.002.60.09825.067.00.996803.200.689.85
27.80.7600.042.30.09215.054.00.997003.260.659.85
311.20.2800.561.90.07517.060.00.998003.160.589.86
47.40.7000.001.90.07611.034.00.997803.510.569.45
.......................................
15946.20.6000.082.00.09032.044.00.994903.450.5810.55
15955.90.5500.102.20.06239.051.00.995123.520.7611.26
15966.30.5100.132.30.07629.040.00.995743.420.7511.06
15975.90.6450.122.00.07532.044.00.995473.570.7110.25
15986.00.3100.473.60.06718.042.00.995493.390.6611.06
\n", "

1599 rows × 12 columns

\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid ... sulphates alcohol quality\n", "0 7.4 0.700 0.00 ... 0.56 9.4 5\n", "1 7.8 0.880 0.00 ... 0.68 9.8 5\n", "2 7.8 0.760 0.04 ... 0.65 9.8 5\n", "3 11.2 0.280 0.56 ... 0.58 9.8 6\n", "4 7.4 0.700 0.00 ... 0.56 9.4 5\n", "... ... ... ... ... ... ... ...\n", "1594 6.2 0.600 0.08 ... 0.58 10.5 5\n", "1595 5.9 0.550 0.10 ... 0.76 11.2 6\n", "1596 6.3 0.510 0.13 ... 0.75 11.0 6\n", "1597 5.9 0.645 0.12 ... 0.71 10.2 5\n", "1598 6.0 0.310 0.47 ... 0.66 11.0 6\n", "\n", "[1599 rows x 12 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 2 } ] }, { "cell_type": "markdown", "metadata": { "id": "4H-i6DJlxduP" }, "source": [ "# 2. Podział na zbiory test/train przy pomocy SciKit + (poprawka z 26.03.2021 przy pomocy basha)" ] }, { "cell_type": "markdown", "metadata": { "id": "Rf49qKC-eqEU" }, "source": [ "## 2.1 SciKit" ] }, { "cell_type": "markdown", "metadata": { "id": "nZO_naLatT0o" }, "source": [ "Próbowałem również podzielić dane na podzbiory Train:Dev:Test w proporcji 6:2:2 przy pomocy basha, ale uznałem, że wygodniejsze jest korzystanie z \"train_test_split()\". Docelowo podział będzie dokonywany na 4 zmienne: `X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)`, jednak chciałem zachować konwencję z przykładu z ćwiczeń." 
] }, { "cell_type": "markdown", "metadata": { "id": "ebHl5Aw1uuK1" }, "source": [ "https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html" ] }, { "cell_type": "code", "metadata": { "id": "X88VMhb0x3gJ" }, "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "wine_train, wine_test = train_test_split(wine, test_size=360,train_size=959, random_state=1)" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OzjEfgNOyAWs", "outputId": "7e7bb70f-2b1e-422c-9500-d411884d8d5a" }, "source": [ "wine_test[\"quality\"].value_counts()" ], "execution_count": 4, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "5 155\n", "6 149\n", "7 37\n", "4 16\n", "8 2\n", "3 1\n", "Name: quality, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 4 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SpQZIuSxyAd0", "outputId": "96505a9a-d2e7-44a1-b2cf-ee40d6d7d3d0" }, "source": [ "wine_train[\"quality\"].value_counts()" ], "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "5 400\n", "6 388\n", "7 125\n", "4 30\n", "8 11\n", "3 5\n", "Name: quality, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "markdown", "metadata": { "id": "YK0491tAeupD" }, "source": [ "## 2.2 Bash" ] }, { "cell_type": "code", "metadata": { "id": "1idNUz-9eyfJ" }, "source": [ "# Keep the CSV header aside and shuffle only the data rows.\n", "!head -n 1 winequality-red.csv > header.csv\n", "!tail -n +2 winequality-red.csv | shuf > data.shuffled\n", "\n", "# Disjoint subsets of the shuffled rows: 1-266 -> test, 267-532 -> dev, 533-end -> train.\n", "# The train subset must start at row 533; starting earlier would duplicate dev rows in train.\n", "!head -n 266 data.shuffled > wine.data.test\n", "!head -n 532 data.shuffled | tail -n 266 > wine.data.dev\n", "!tail -n +533 data.shuffled > wine.data.train\n", "\n", "# Re-attach the header so every subset is a valid CSV file.\n", "!cat header.csv wine.data.test > test.csv\n", "!cat header.csv wine.data.dev > dev.csv\n", "!cat header.csv wine.data.train > train.csv" ], 
"execution_count": 6, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-C4RRDH2fFEp", "outputId": "93944a72-838c-4e2b-a907-de4b0902fcb1" }, "source": [ "!wc -l test.csv\n", "!wc -l dev.csv\n", "!wc -l train.csv" ], "execution_count": 7, "outputs": [ { "output_type": "stream", "text": [ "267 test.csv\n", "267 dev.csv\n", "1268 train.csv\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "wLlI-k_jfb70" }, "source": [ "wine_test_bash=pd.read_csv('test.csv')\n", "wine_dev_bash=pd.read_csv('dev.csv')\n", "wine_train_bash=pd.read_csv('train.csv')" ], "execution_count": 8, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "wAq8KmNdyNOm" }, "source": [ "# 3. Statystyki dla zbiorów" ] }, { "cell_type": "code", "metadata": { "id": "Wcq9YSTfXbs1" }, "source": [ "from matplotlib import pyplot as plt\n", "import seaborn as sns" ], "execution_count": 9, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "EjDFpgdPy_of" }, "source": [ "## 3.1. Zbiór Train (bash)" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "SscUak3AydG0", "outputId": "5f0bd8df-1753-4211-e3a6-8ce2685146f9" }, "source": [ "wine_train_bash" ], "execution_count": 10, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
010.00.3800.381.60.16927.090.00.999143.150.658.55
16.70.4600.241.70.07718.034.00.994803.390.6010.66
27.20.6950.132.00.07612.020.00.995463.290.5410.15
312.50.6000.494.30.1005.014.01.001003.250.7411.96
48.30.5600.222.40.08210.086.00.998303.370.629.55
.......................................
12627.80.5600.122.00.0827.028.00.997003.370.509.46
12635.80.6800.021.80.08721.094.00.994403.540.5210.05
12647.70.6300.081.90.07615.027.00.996703.320.549.56
12657.10.6000.001.80.07416.034.00.997203.470.709.96
126610.40.6100.492.10.2005.016.00.999403.160.638.43
\n", "

1267 rows × 12 columns

\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid ... sulphates alcohol quality\n", "0 10.0 0.380 0.38 ... 0.65 8.5 5\n", "1 6.7 0.460 0.24 ... 0.60 10.6 6\n", "2 7.2 0.695 0.13 ... 0.54 10.1 5\n", "3 12.5 0.600 0.49 ... 0.74 11.9 6\n", "4 8.3 0.560 0.22 ... 0.62 9.5 5\n", "... ... ... ... ... ... ... ...\n", "1262 7.8 0.560 0.12 ... 0.50 9.4 6\n", "1263 5.8 0.680 0.02 ... 0.52 10.0 5\n", "1264 7.7 0.630 0.08 ... 0.54 9.5 6\n", "1265 7.1 0.600 0.00 ... 0.70 9.9 6\n", "1266 10.4 0.610 0.49 ... 0.63 8.4 3\n", "\n", "[1267 rows x 12 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 10 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "hZAn8j4byMF2", "outputId": "c47596aa-0d54-490f-c892-6ee5987a372d" }, "source": [ "wine_train_bash[\"quality\"].value_counts()" ], "execution_count": 11, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "5 550\n", "6 498\n", "7 157\n", "4 39\n", "8 15\n", "3 8\n", "Name: quality, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 11 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 297 }, "id": "EOEuj8sRyL8v", "outputId": "d2f102f6-d10c-4dc4-ae3f-fd34dc4e5985" }, "source": [ "wine_train_bash.describe(include='all')" ], "execution_count": 12, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
count1267.0000001267.0000001267.0000001267.0000001267.0000001267.0000001267.0000001267.0000001267.0000001267.0000001267.0000001267.000000
mean8.3441990.5258880.2738912.5740330.08741915.88989746.1460140.9967993.3100160.65573010.3967255.632991
std1.7892530.1778040.1961411.4534630.04675410.60367432.7348180.0018930.1540470.1662061.0423530.806931
min4.7000000.1200000.0000000.9000000.0120001.0000006.0000000.9900702.7400000.3700008.4000003.000000
25%7.1000000.3900000.0900001.9000000.0710007.00000022.0000000.9956603.2100000.5500009.5000005.000000
50%7.9000000.5200000.2600002.2000000.08000013.00000037.0000000.9968003.3100000.62000010.2000006.000000
75%9.3000000.6400000.4300002.6000000.09000022.00000062.0000000.9978703.4000000.73000011.0000006.000000
max15.9000001.5800001.00000015.5000000.61100072.000000278.0000001.0036904.0100002.00000014.9000008.000000
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity ... alcohol quality\n", "count 1267.000000 1267.000000 ... 1267.000000 1267.000000\n", "mean 8.344199 0.525888 ... 10.396725 5.632991\n", "std 1.789253 0.177804 ... 1.042353 0.806931\n", "min 4.700000 0.120000 ... 8.400000 3.000000\n", "25% 7.100000 0.390000 ... 9.500000 5.000000\n", "50% 7.900000 0.520000 ... 10.200000 6.000000\n", "75% 9.300000 0.640000 ... 11.000000 6.000000\n", "max 15.900000 1.580000 ... 14.900000 8.000000\n", "\n", "[8 rows x 12 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 12 } ] }, { "cell_type": "markdown", "metadata": { "id": "JWXJ2CZQuylE" }, "source": [ "Testowy Wykres (quality, volatile acidity)" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 408 }, "id": "HbsfwCL7XpNe", "outputId": "249d8110-1b17-41ad-e1b1-18b0aa12ff06" }, "source": [ "fig = plt.figure(figsize = (10,6))\n", "sns.barplot(x = 'quality', y = 'volatile acidity', data = wine_train_bash)" ], "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 13 }, { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "markdown", "metadata": { "id": "1W_oRCVczIgJ" }, "source": [ "## 3.2. Zbiór Test (bash)" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "LJzygNqKzOWY", "outputId": "d4f8dd3b-793c-4e02-a6ea-fbdb8fbf7a19" }, "source": [ "wine_test_bash" ], "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
07.10.600.012.30.07924.037.00.995143.400.6110.96
17.80.610.291.60.1149.029.00.997403.261.569.15
27.10.630.062.00.0838.029.00.998553.670.739.65
39.10.300.412.00.06810.024.00.995233.270.8511.77
49.00.460.312.80.09319.098.00.998153.320.639.56
.......................................
2617.20.600.042.50.07618.088.00.997453.530.559.55
2628.40.670.192.20.09311.075.00.997363.200.599.24
2638.80.610.194.00.09430.069.00.997873.220.5010.06
2649.60.680.242.20.0875.028.00.998803.140.6010.25
26510.50.430.353.30.09224.070.00.997983.210.6910.56
\n", "

266 rows × 12 columns

\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid ... sulphates alcohol quality\n", "0 7.1 0.60 0.01 ... 0.61 10.9 6\n", "1 7.8 0.61 0.29 ... 1.56 9.1 5\n", "2 7.1 0.63 0.06 ... 0.73 9.6 5\n", "3 9.1 0.30 0.41 ... 0.85 11.7 7\n", "4 9.0 0.46 0.31 ... 0.63 9.5 6\n", ".. ... ... ... ... ... ... ...\n", "261 7.2 0.60 0.04 ... 0.55 9.5 5\n", "262 8.4 0.67 0.19 ... 0.59 9.2 4\n", "263 8.8 0.61 0.19 ... 0.50 10.0 6\n", "264 9.6 0.68 0.24 ... 0.60 10.2 5\n", "265 10.5 0.43 0.35 ... 0.69 10.5 6\n", "\n", "[266 rows x 12 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 14 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1IAtBylEzS8w", "outputId": "1f047c20-f723-490d-ada3-474f5d14db3a" }, "source": [ "wine_test_bash[\"quality\"].value_counts()" ], "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "6 109\n", "5 108\n", "7 37\n", "4 8\n", "8 2\n", "3 2\n", "Name: quality, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 15 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 297 }, "id": "V-9cwcrczS-3", "outputId": "a8a26e7f-a2c4-4a44-c91a-6ce57be85386" }, "source": [ "wine_test_bash.describe(include='all')" ], "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
count266.000000266.000000266.000000266.000000266.000000266.000000266.000000266.000000266.000000266.000000266.000000266.000000
mean8.2458650.5294550.2662032.3733080.08682315.84022647.4473680.9964993.3131950.67624110.5699255.665414
std1.5261750.1815830.1919681.0053450.04615910.16309634.6103790.0017720.1588710.1877861.1497280.808497
min4.6000000.1800000.0000001.2000000.0390001.0000007.0000000.9908402.8800000.3900009.0000003.000000
25%7.2000000.3925000.1000001.9000000.0680007.00000022.2500000.9953183.2000000.5600009.5000005.000000
50%8.0000000.5200000.2600002.1000000.07800014.00000040.0000000.9965203.3100000.64000010.2500006.000000
75%9.1000000.6300000.4000002.5000000.09200021.00000062.7500000.9976003.4000000.75000011.4000006.000000
max13.3000001.3300000.7400008.8000000.46700051.000000289.0000001.0026003.9000001.98000014.0000008.000000
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity ... alcohol quality\n", "count 266.000000 266.000000 ... 266.000000 266.000000\n", "mean 8.245865 0.529455 ... 10.569925 5.665414\n", "std 1.526175 0.181583 ... 1.149728 0.808497\n", "min 4.600000 0.180000 ... 9.000000 3.000000\n", "25% 7.200000 0.392500 ... 9.500000 5.000000\n", "50% 8.000000 0.520000 ... 10.250000 6.000000\n", "75% 9.100000 0.630000 ... 11.400000 6.000000\n", "max 13.300000 1.330000 ... 14.000000 8.000000\n", "\n", "[8 rows x 12 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 16 } ] }, { "cell_type": "markdown", "metadata": { "id": "wzaUXARnu824" }, "source": [ "Testowy Wykres (quality, volatile acidity)" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 405 }, "id": "3GksWzExaHV7", "outputId": "21b77c09-445c-4e06-fcea-6f26d3717870" }, "source": [ "fig = plt.figure(figsize = (10,6))\n", "sns.barplot(x = 'quality', y = 'volatile acidity', data = wine_test_bash)" ], "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 17 }, { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "markdown", "metadata": { "id": "w5xmkUgGzdxs" }, "source": [ "## 3.3. Cały zbiór" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "thGHHVJXzeGe", "outputId": "a1bbe5c6-3aef-4a70-82ec-adc2b9d6daf5" }, "source": [ "wine" ], "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
07.40.7000.001.90.07611.034.00.997803.510.569.45
17.80.8800.002.60.09825.067.00.996803.200.689.85
27.80.7600.042.30.09215.054.00.997003.260.659.85
311.20.2800.561.90.07517.060.00.998003.160.589.86
47.40.7000.001.90.07611.034.00.997803.510.569.45
.......................................
15946.20.6000.082.00.09032.044.00.994903.450.5810.55
15955.90.5500.102.20.06239.051.00.995123.520.7611.26
15966.30.5100.132.30.07629.040.00.995743.420.7511.06
15975.90.6450.122.00.07532.044.00.995473.570.7110.25
15986.00.3100.473.60.06718.042.00.995493.390.6611.06
\n", "

1599 rows × 12 columns

\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid ... sulphates alcohol quality\n", "0 7.4 0.700 0.00 ... 0.56 9.4 5\n", "1 7.8 0.880 0.00 ... 0.68 9.8 5\n", "2 7.8 0.760 0.04 ... 0.65 9.8 5\n", "3 11.2 0.280 0.56 ... 0.58 9.8 6\n", "4 7.4 0.700 0.00 ... 0.56 9.4 5\n", "... ... ... ... ... ... ... ...\n", "1594 6.2 0.600 0.08 ... 0.58 10.5 5\n", "1595 5.9 0.550 0.10 ... 0.76 11.2 6\n", "1596 6.3 0.510 0.13 ... 0.75 11.0 6\n", "1597 5.9 0.645 0.12 ... 0.71 10.2 5\n", "1598 6.0 0.310 0.47 ... 0.66 11.0 6\n", "\n", "[1599 rows x 12 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 18 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Ua_ctPpVzeKJ", "outputId": "da95e47b-9e44-42e0-efc0-66631dba99f1" }, "source": [ "wine[\"quality\"].value_counts()" ], "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "5 681\n", "6 638\n", "7 199\n", "4 53\n", "8 18\n", "3 10\n", "Name: quality, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 19 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 297 }, "id": "-06v1i7XzeOz", "outputId": "b0da7e9b-98aa-4af6-8131-359a54c2ac69" }, "source": [ "wine.describe(include='all')" ], "execution_count": 20, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
count1599.0000001599.0000001599.0000001599.0000001599.0000001599.0000001599.0000001599.0000001599.0000001599.0000001599.0000001599.000000
mean8.3196370.5278210.2709762.5388060.08746715.87492246.4677920.9967473.3111130.65814910.4229835.636023
std1.7410960.1790600.1948011.4099280.04706510.46015732.8953240.0018870.1543860.1695071.0656680.807569
min4.6000000.1200000.0000000.9000000.0120001.0000006.0000000.9900702.7400000.3300008.4000003.000000
25%7.1000000.3900000.0900001.9000000.0700007.00000022.0000000.9956003.2100000.5500009.5000005.000000
50%7.9000000.5200000.2600002.2000000.07900014.00000038.0000000.9967503.3100000.62000010.2000006.000000
75%9.2000000.6400000.4200002.6000000.09000021.00000062.0000000.9978353.4000000.73000011.1000006.000000
max15.9000001.5800001.00000015.5000000.61100072.000000289.0000001.0036904.0100002.00000014.9000008.000000
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity ... alcohol quality\n", "count 1599.000000 1599.000000 ... 1599.000000 1599.000000\n", "mean 8.319637 0.527821 ... 10.422983 5.636023\n", "std 1.741096 0.179060 ... 1.065668 0.807569\n", "min 4.600000 0.120000 ... 8.400000 3.000000\n", "25% 7.100000 0.390000 ... 9.500000 5.000000\n", "50% 7.900000 0.520000 ... 10.200000 6.000000\n", "75% 9.200000 0.640000 ... 11.100000 6.000000\n", "max 15.900000 1.580000 ... 14.900000 8.000000\n", "\n", "[8 rows x 12 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 20 } ] }, { "cell_type": "markdown", "metadata": { "id": "t8Y53QPyu_fO" }, "source": [ "Testowy Wykres (quality, volatile acidity)" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 405 }, "id": "hEe3BYcJaKnF", "outputId": "cd03275d-d09e-4517-ef76-22b40d9ffa9e" }, "source": [ "fig = plt.figure(figsize = (10,6))\n", "sns.barplot(x = 'quality', y = 'volatile acidity', data = wine)" ], "execution_count": 21, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 21 }, { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "markdown", "metadata": { "id": "M4hd_N8EgH57" }, "source": [ "## 3.4. Zbiór Dev (bash)" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "XT3hrfW3gOxH", "outputId": "98ef6303-7f2b-4341-e6ad-c19af8750ccc" }, "source": [ "wine_dev_bash" ], "execution_count": 22, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
08.00.7050.051.90.0748.019.00.996203.340.9510.56
17.60.6650.101.50.06627.055.00.996553.390.519.35
27.80.5500.352.20.07421.066.00.997403.250.569.25
313.00.3200.652.60.09315.047.00.999603.050.6110.65
48.80.6100.302.80.08817.046.00.997603.260.519.34
.......................................
26113.80.4900.673.00.0936.015.00.998603.020.9312.06
2627.10.7500.012.20.05911.018.00.992423.390.4012.86
2639.90.3500.412.30.08311.061.00.998203.210.509.55
2646.50.5200.111.80.07313.038.00.995503.340.529.35
2656.80.6700.001.90.08022.039.00.997013.400.749.75
\n", "

266 rows × 12 columns

\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid ... sulphates alcohol quality\n", "0 8.0 0.705 0.05 ... 0.95 10.5 6\n", "1 7.6 0.665 0.10 ... 0.51 9.3 5\n", "2 7.8 0.550 0.35 ... 0.56 9.2 5\n", "3 13.0 0.320 0.65 ... 0.61 10.6 5\n", "4 8.8 0.610 0.30 ... 0.51 9.3 4\n", ".. ... ... ... ... ... ... ...\n", "261 13.8 0.490 0.67 ... 0.93 12.0 6\n", "262 7.1 0.750 0.01 ... 0.40 12.8 6\n", "263 9.9 0.350 0.41 ... 0.50 9.5 5\n", "264 6.5 0.520 0.11 ... 0.52 9.3 5\n", "265 6.8 0.670 0.00 ... 0.74 9.7 5\n", "\n", "[266 rows x 12 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 22 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lhRktuxPgOsC", "outputId": "612e6163-0b66-4495-fdc1-2a0813efe37e" }, "source": [ "wine_dev_bash[\"quality\"].value_counts()" ], "execution_count": 23, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "5 115\n", "6 113\n", "7 24\n", "4 9\n", "8 3\n", "3 2\n", "Name: quality, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 23 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 297 }, "id": "FmOQIZMSgOnK", "outputId": "a7f4b4e8-36a0-4a07-cce4-98caa71ff7d0" }, "source": [ "wine_dev_bash.describe(include='all')" ], "execution_count": 24, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
count266.000000266.000000266.000000266.000000266.000000266.000000266.000000266.000000266.000000266.000000266.000000266.000000
mean8.2736840.5400750.2530082.5233080.08862015.39849643.9736840.9967493.3178950.64977410.4533215.590226
std1.7205920.1938560.1903301.3804980.05582510.00221930.5187120.0019300.1520030.1769301.0580100.777841
min4.9000000.1200000.0000001.3000000.0120001.0000008.0000000.9906402.8700000.3300008.5000003.000000
25%7.1000000.3962500.0800001.9000000.0682508.00000020.0000000.9955253.2100000.5425009.5000005.000000
50%7.9000000.5200000.2400002.2000000.07900013.00000037.0000000.9967203.3200000.62000010.2000006.000000
75%9.2000000.6487500.3900002.6000000.09000020.00000060.0000000.9978773.4300000.72000011.2000006.000000
max15.6000001.5800000.76000013.8000000.61100066.000000141.0000001.0031503.7200001.95000014.0000008.000000
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity ... alcohol quality\n", "count 266.000000 266.000000 ... 266.000000 266.000000\n", "mean 8.273684 0.540075 ... 10.453321 5.590226\n", "std 1.720592 0.193856 ... 1.058010 0.777841\n", "min 4.900000 0.120000 ... 8.500000 3.000000\n", "25% 7.100000 0.396250 ... 9.500000 5.000000\n", "50% 7.900000 0.520000 ... 10.200000 6.000000\n", "75% 9.200000 0.648750 ... 11.200000 6.000000\n", "max 15.600000 1.580000 ... 14.000000 8.000000\n", "\n", "[8 rows x 12 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 24 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 405 }, "id": "j3Z6noeZgOjC", "outputId": "de24703b-50d4-4059-d5e6-ddc0c0f3356c" }, "source": [ "fig = plt.figure(figsize = (10,6))\n", "sns.barplot(x = 'quality', y = 'volatile acidity', data = wine_dev_bash)" ], "execution_count": 25, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 25 }, { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "markdown", "metadata": { "id": "ftWOC-do2Pq-" }, "source": [ "# 4. Normalizacja" ] }, { "cell_type": "markdown", "metadata": { "id": "Wm0EM2hj4s6V" }, "source": [ "Normalizacja kolumny 'quality' na wartości od 0 do 20. Nie jest ona konieczna ale została stworzona w celach demonstracyjnych" ] }, { "cell_type": "code", "metadata": { "id": "EkZQ6Hpy2Tj_" }, "source": [ "wine[\"quality\"]=((wine[\"quality\"]-wine[\"quality\"].min())/(wine[\"quality\"].max()-wine[\"quality\"].min()))*20" ], "execution_count": 26, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "_bQgYfct3Tir", "outputId": "8b50d411-b47b-4d4d-d3eb-606d7c134de0" }, "source": [ "wine" ], "execution_count": 27, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
07.40.7000.001.90.07611.034.00.997803.510.569.48.0
17.80.8800.002.60.09825.067.00.996803.200.689.88.0
27.80.7600.042.30.09215.054.00.997003.260.659.88.0
311.20.2800.561.90.07517.060.00.998003.160.589.812.0
47.40.7000.001.90.07611.034.00.997803.510.569.48.0
.......................................
15946.20.6000.082.00.09032.044.00.994903.450.5810.58.0
15955.90.5500.102.20.06239.051.00.995123.520.7611.212.0
15966.30.5100.132.30.07629.040.00.995743.420.7511.012.0
15975.90.6450.122.00.07532.044.00.995473.570.7110.28.0
15986.00.3100.473.60.06718.042.00.995493.390.6611.012.0
\n", "

1599 rows × 12 columns

\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid ... sulphates alcohol quality\n", "0 7.4 0.700 0.00 ... 0.56 9.4 8.0\n", "1 7.8 0.880 0.00 ... 0.68 9.8 8.0\n", "2 7.8 0.760 0.04 ... 0.65 9.8 8.0\n", "3 11.2 0.280 0.56 ... 0.58 9.8 12.0\n", "4 7.4 0.700 0.00 ... 0.56 9.4 8.0\n", "... ... ... ... ... ... ... ...\n", "1594 6.2 0.600 0.08 ... 0.58 10.5 8.0\n", "1595 5.9 0.550 0.10 ... 0.76 11.2 12.0\n", "1596 6.3 0.510 0.13 ... 0.75 11.0 12.0\n", "1597 5.9 0.645 0.12 ... 0.71 10.2 8.0\n", "1598 6.0 0.310 0.47 ... 0.66 11.0 12.0\n", "\n", "[1599 rows x 12 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 27 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "I1AwZoyN4RHs", "outputId": "15a7bca4-8bbe-4749-80b8-5eede667aa07" }, "source": [ "wine[\"quality\"].value_counts()" ], "execution_count": 28, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "8.0 681\n", "12.0 638\n", "16.0 199\n", "4.0 53\n", "20.0 18\n", "0.0 10\n", "Name: quality, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 28 } ] }, { "cell_type": "markdown", "metadata": { "id": "XBU3z_of414w" }, "source": [ "# 5. Usuwanie artefaktów" ] }, { "cell_type": "markdown", "metadata": { "id": "KCstRwQp5-X1" }, "source": [ "### Całe szczęscie nie ma w moim zbiorze ani pustych linijek, ani przykładów z niepoprawnymi wartościami" ] }, { "cell_type": "code", "metadata": { "id": "EJqksTP545UV" }, "source": [ "# Znajdźmy pustą linijkę:\n", "! 
grep -P \"^$\" -n winequality-red.csv" ], "execution_count": 29, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "8DuoPn3Fa0kP" }, "source": [ "Szukanie wartości \"NA\": https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "REYF2AWjz_lr", "outputId": "01c5cd70-a37e-433f-bde3-d0c855c96c2e" }, "source": [ "wine.isnull().sum()" ], "execution_count": 30, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "fixed acidity 0\n", "volatile acidity 0\n", "citric acid 0\n", "residual sugar 0\n", "chlorides 0\n", "free sulfur dioxide 0\n", "total sulfur dioxide 0\n", "density 0\n", "pH 0\n", "sulphates 0\n", "alcohol 0\n", "quality 0\n", "dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 30 } ] }, { "cell_type": "code", "metadata": { "id": "RbkqNj9_akcU" }, "source": [ "wine.dropna(inplace=True) " ], "execution_count": 31, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "4WylJo9malyG", "outputId": "95a9b3f4-a7f5-4f61-fdbe-918dbca2d72c" }, "source": [ "wine" ], "execution_count": 32, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
07.40.7000.001.90.07611.034.00.997803.510.569.48.0
17.80.8800.002.60.09825.067.00.996803.200.689.88.0
27.80.7600.042.30.09215.054.00.997003.260.659.88.0
311.20.2800.561.90.07517.060.00.998003.160.589.812.0
47.40.7000.001.90.07611.034.00.997803.510.569.48.0
.......................................
15946.20.6000.082.00.09032.044.00.994903.450.5810.58.0
15955.90.5500.102.20.06239.051.00.995123.520.7611.212.0
15966.30.5100.132.30.07629.040.00.995743.420.7511.012.0
15975.90.6450.122.00.07532.044.00.995473.570.7110.28.0
15986.00.3100.473.60.06718.042.00.995493.390.6611.012.0
\n", "

1599 rows × 12 columns

\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid ... sulphates alcohol quality\n", "0 7.4 0.700 0.00 ... 0.56 9.4 8.0\n", "1 7.8 0.880 0.00 ... 0.68 9.8 8.0\n", "2 7.8 0.760 0.04 ... 0.65 9.8 8.0\n", "3 11.2 0.280 0.56 ... 0.58 9.8 12.0\n", "4 7.4 0.700 0.00 ... 0.56 9.4 8.0\n", "... ... ... ... ... ... ... ...\n", "1594 6.2 0.600 0.08 ... 0.58 10.5 8.0\n", "1595 5.9 0.550 0.10 ... 0.76 11.2 12.0\n", "1596 6.3 0.510 0.13 ... 0.75 11.0 12.0\n", "1597 5.9 0.645 0.12 ... 0.71 10.2 8.0\n", "1598 6.0 0.310 0.47 ... 0.66 11.0 12.0\n", "\n", "[1599 rows x 12 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 32 } ] }, { "cell_type": "code", "metadata": { "id": "iqsJ9Bfngy-m" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }