{ "cells": [ { "cell_type": "markdown", "id": "b8487a98-1b51-46f1-b727-719575945544", "metadata": { "tags": [] }, "source": [ "### Pobieranie zbioru i pakietów" ] }, { "cell_type": "code", "execution_count": 1, "id": "800bc7a7-aa60-4db8-b170-a5a7340520aa", "metadata": { "ExecuteTime": { "start_time": "2024-03-24T15:19:23.899243Z", "end_time": "2024-03-24T15:19:50.743948Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting kaggle\n", " Downloading kaggle-1.6.6.tar.gz (84 kB)\n", " ---------------------------------------- 84.6/84.6 kB 2.4 MB/s eta 0:00:00\n", " Preparing metadata (setup.py): started\n", " Preparing metadata (setup.py): finished with status 'done'\n", "Requirement already satisfied: six>=1.10 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from kaggle) (1.16.0)\n", "Requirement already satisfied: certifi in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from kaggle) (2022.12.7)\n", "Requirement already satisfied: python-dateutil in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from kaggle) (2.8.2)\n", "Requirement already satisfied: requests in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from kaggle) (2.28.1)\n", "Requirement already satisfied: tqdm in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from kaggle) (4.64.1)\n", "Requirement already satisfied: python-slugify in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from kaggle) (5.0.2)\n", "Requirement already satisfied: urllib3 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from kaggle) (1.26.14)\n", "Requirement already satisfied: bleach in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from kaggle) (4.1.0)\n", "Requirement already satisfied: webencodings in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from bleach->kaggle) (0.5.1)\n", "Requirement already satisfied: packaging in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from bleach->kaggle) (22.0)\n", "Requirement already satisfied: text-unidecode>=1.3 in 
c:\\users\\adamw\\anaconda3\\lib\\site-packages (from python-slugify->kaggle) (1.3)\n", "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from requests->kaggle) (3.4)\n", "Requirement already satisfied: charset-normalizer<3,>=2 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from requests->kaggle) (2.0.4)\n", "Requirement already satisfied: colorama in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from tqdm->kaggle) (0.4.6)\n", "Building wheels for collected packages: kaggle\n", " Building wheel for kaggle (setup.py): started\n", " Building wheel for kaggle (setup.py): finished with status 'done'\n", " Created wheel for kaggle: filename=kaggle-1.6.6-py3-none-any.whl size=111955 sha256=23592736409344e3027e92f5ac103680cd5efb348835a123a68118e729e02b66\n", " Stored in directory: c:\\users\\adamw\\appdata\\local\\pip\\cache\\wheels\\54\\6e\\ff\\d5ab6af2287a2d0c5b8cea9328fb14940ca253fe60214a99c8\n", "Successfully built kaggle\n", "Installing collected packages: kaggle\n", "Successfully installed kaggle-1.6.6\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: pandas in c:\\users\\adamw\\anaconda3\\lib\\site-packages (1.5.3)\n", "Requirement already satisfied: pytz>=2020.1 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from pandas) (2022.7)\n", "Requirement already satisfied: numpy>=1.21.0 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from pandas) (1.23.5)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from pandas) (2.8.2)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: numpy in c:\\users\\adamw\\anaconda3\\lib\\site-packages (1.23.5)\n", "Note: you may need to restart the kernel to use 
updated packages.\n", "Requirement already satisfied: scikit-learn in c:\\users\\adamw\\anaconda3\\lib\\site-packages (1.2.1)\n", "Requirement already satisfied: numpy>=1.17.3 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from scikit-learn) (1.23.5)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from scikit-learn) (2.2.0)\n", "Requirement already satisfied: joblib>=1.1.1 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from scikit-learn) (1.1.1)\n", "Requirement already satisfied: scipy>=1.3.2 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from scikit-learn) (1.10.0)\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: seaborn in c:\\users\\adamw\\anaconda3\\lib\\site-packages (0.12.2)\n", "Requirement already satisfied: numpy!=1.24.0,>=1.17 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from seaborn) (1.23.5)\n", "Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from seaborn) (3.7.0)\n", "Requirement already satisfied: pandas>=0.25 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from seaborn) (1.5.3)\n", "Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9)\n", "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.0.5)\n", "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.25.0)\n", "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (9.4.0)\n", "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2)\n", "Requirement 
already satisfied: packaging>=20.0 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (22.0)\n", "Requirement already satisfied: cycler>=0.10 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.4)\n", "Requirement already satisfied: pytz>=2020.1 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from pandas>=0.25->seaborn) (2022.7)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\adamw\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.16.0)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "%pip install kaggle\n", "%pip install pandas\n", "%pip install numpy\n", "%pip install scikit-learn\n", "%pip install seaborn" ] }, { "cell_type": "code", "execution_count": 3, "id": "f132ca66-2325-48e0-8bf8-ff983d8ad1ce", "metadata": { "ExecuteTime": { "start_time": "2024-03-24T15:21:05.151558Z", "end_time": "2024-03-24T15:21:12.115431Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading 1-5-million-beer-reviews-from-beer-advocate.zip to C:\\Users\\adamw\\REPOS\\ium_464979\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", " 0%| | 0.00/32.5M [00:00<?, ?B/s]\n", " 3%|3 | 1.00M/32.5M [00:00<00:21, 1.53MB/s]\n", " 6%|6 | 2.00M/32.5M [00:00<00:11, 2.78MB/s]\n", " 9%|9 | 3.00M/32.5M [00:00<00:07, 3.87MB/s]\n", " 12%|#2 | 4.00M/32.5M [00:01<00:06, 4.72MB/s]\n", " 15%|#5 | 5.00M/32.5M [00:01<00:05, 5.20MB/s]\n", " 18%|#8 | 6.00M/32.5M [00:01<00:05, 5.08MB/s]\n", " 22%|##1 | 7.00M/32.5M [00:01<00:05, 5.19MB/s]\n", " 25%|##4 | 8.00M/32.5M [00:01<00:04, 5.21MB/s]\n", " 28%|##7 | 9.00M/32.5M [00:02<00:04, 5.12MB/s]\n", " 31%|### | 10.0M/32.5M [00:02<00:04, 5.25MB/s]\n", " 34%|###3 | 11.0M/32.5M 
[00:02<00:04, 5.50MB/s]\n", " 37%|###6 | 12.0M/32.5M [00:02<00:03, 6.10MB/s]\n", " 40%|#### | 13.0M/32.5M [00:02<00:03, 6.57MB/s]\n", " 43%|####3 | 14.0M/32.5M [00:02<00:03, 6.39MB/s]\n", " 46%|####6 | 15.0M/32.5M [00:03<00:03, 6.10MB/s]\n", " 49%|####9 | 16.0M/32.5M [00:03<00:02, 5.83MB/s]\n", " 52%|#####2 | 17.0M/32.5M [00:03<00:02, 5.85MB/s]\n", " 55%|#####5 | 18.0M/32.5M [00:03<00:02, 5.87MB/s]\n", " 59%|#####8 | 19.0M/32.5M [00:03<00:02, 6.00MB/s]\n", " 62%|######1 | 20.0M/32.5M [00:03<00:01, 6.79MB/s]\n", " 65%|######4 | 21.0M/32.5M [00:04<00:01, 7.17MB/s]\n", " 71%|####### | 23.0M/32.5M [00:04<00:01, 8.01MB/s]\n", " 74%|#######3 | 24.0M/32.5M [00:04<00:01, 7.80MB/s]\n", " 77%|#######7 | 25.0M/32.5M [00:04<00:01, 7.72MB/s]\n", " 80%|######## | 26.0M/32.5M [00:04<00:00, 7.58MB/s]\n", " 83%|########3 | 27.0M/32.5M [00:05<00:01, 5.54MB/s]\n", " 86%|########6 | 28.0M/32.5M [00:05<00:00, 5.95MB/s]\n", " 89%|########9 | 29.0M/32.5M [00:05<00:00, 6.66MB/s]\n", " 95%|#########5| 31.0M/32.5M [00:05<00:00, 7.50MB/s]\n", "100%|##########| 32.5M/32.5M [00:05<00:00, 8.35MB/s]\n", "100%|##########| 32.5M/32.5M [00:05<00:00, 6.00MB/s]\n" ] } ], "source": [ "!kaggle datasets download -d thedevastator/1-5-million-beer-reviews-from-beer-advocate" ] }, { "cell_type": "code", "execution_count": null, "id": "8516dbcf-1628-4059-a212-7bb36641151f", "metadata": {}, "outputs": [], "source": [ "# Extract with the standard library instead of the external `unzip` binary,\n", "# which is not available by default on every platform (e.g. Windows).\n", "# Files that already exist are overwritten, matching `unzip -o`.\n", "import zipfile\n", "\n", "with zipfile.ZipFile('1-5-million-beer-reviews-from-beer-advocate.zip') as archive:\n", "    archive.extractall()" ] }, { "cell_type": "code", "execution_count": 43, "id": "399983f4-351b-485e-835f-03caf3302743", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "from sklearn.model_selection import train_test_split\n", "import matplotlib.pyplot as plt\n", "from sklearn.preprocessing import 
MinMaxScaler\n", "\n", "pd.set_option('float_format', '{:f}'.format)" ] }, { "cell_type": "markdown", "id": "205c3fcf-fa0e-4d62-92f0-e8c72737e8b3", "metadata": { "tags": [] }, "source": [ "## Wczytywanie danych" ] }, { "cell_type": "code", "execution_count": 8, "id": "69561c75-0140-4f3d-93de-391aa344755c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>index</th>\n", " <th>brewery_id</th>\n", " <th>brewery_name</th>\n", " <th>review_time</th>\n", " <th>review_overall</th>\n", " <th>review_aroma</th>\n", " <th>review_appearance</th>\n", " <th>review_profilename</th>\n", " <th>beer_style</th>\n", " <th>review_palate</th>\n", " <th>review_taste</th>\n", " <th>beer_name</th>\n", " <th>beer_abv</th>\n", " <th>beer_beerid</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>10325</td>\n", " <td>Vecchio Birraio</td>\n", " <td>1234817823</td>\n", " <td>1.500000</td>\n", " <td>2.000000</td>\n", " <td>2.500000</td>\n", " <td>stcules</td>\n", " <td>Hefeweizen</td>\n", " <td>1.500000</td>\n", " <td>1.500000</td>\n", " <td>Sausa Weizen</td>\n", " <td>5.000000</td>\n", " <td>47986</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>10325</td>\n", " <td>Vecchio Birraio</td>\n", " <td>1235915097</td>\n", " <td>3.000000</td>\n", " <td>2.500000</td>\n", " <td>3.000000</td>\n", " <td>stcules</td>\n", " <td>English Strong Ale</td>\n", " <td>3.000000</td>\n", " <td>3.000000</td>\n", " <td>Red Moon</td>\n", " <td>6.200000</td>\n", " <td>48213</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2</td>\n", " <td>10325</td>\n", " 
<td>Vecchio Birraio</td>\n", " <td>1235916604</td>\n", " <td>3.000000</td>\n", " <td>2.500000</td>\n", " <td>3.000000</td>\n", " <td>stcules</td>\n", " <td>Foreign / Export Stout</td>\n", " <td>3.000000</td>\n", " <td>3.000000</td>\n", " <td>Black Horse Black Beer</td>\n", " <td>6.500000</td>\n", " <td>48215</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3</td>\n", " <td>10325</td>\n", " <td>Vecchio Birraio</td>\n", " <td>1234725145</td>\n", " <td>3.000000</td>\n", " <td>3.000000</td>\n", " <td>3.500000</td>\n", " <td>stcules</td>\n", " <td>German Pilsener</td>\n", " <td>2.500000</td>\n", " <td>3.000000</td>\n", " <td>Sausa Pils</td>\n", " <td>5.000000</td>\n", " <td>47969</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4</td>\n", " <td>1075</td>\n", " <td>Caldera Brewing Company</td>\n", " <td>1293735206</td>\n", " <td>4.000000</td>\n", " <td>4.500000</td>\n", " <td>4.000000</td>\n", " <td>johnmichaelsen</td>\n", " <td>American Double / Imperial IPA</td>\n", " <td>4.000000</td>\n", " <td>4.500000</td>\n", " <td>Cauldron DIPA</td>\n", " <td>7.700000</td>\n", " <td>64883</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " index brewery_id brewery_name review_time review_overall \\\n", "0 0 10325 Vecchio Birraio 1234817823 1.500000 \n", "1 1 10325 Vecchio Birraio 1235915097 3.000000 \n", "2 2 10325 Vecchio Birraio 1235916604 3.000000 \n", "3 3 10325 Vecchio Birraio 1234725145 3.000000 \n", "4 4 1075 Caldera Brewing Company 1293735206 4.000000 \n", "\n", " review_aroma review_appearance review_profilename \\\n", "0 2.000000 2.500000 stcules \n", "1 2.500000 3.000000 stcules \n", "2 2.500000 3.000000 stcules \n", "3 3.000000 3.500000 stcules \n", "4 4.500000 4.000000 johnmichaelsen \n", "\n", " beer_style review_palate review_taste \\\n", "0 Hefeweizen 1.500000 1.500000 \n", "1 English Strong Ale 3.000000 3.000000 \n", "2 Foreign / Export Stout 3.000000 3.000000 \n", "3 German Pilsener 2.500000 3.000000 \n", "4 American Double 
/ Imperial IPA 4.000000 4.500000 \n", "\n", " beer_name beer_abv beer_beerid \n", "0 Sausa Weizen 5.000000 47986 \n", "1 Red Moon 6.200000 48213 \n", "2 Black Horse Black Beer 6.500000 48215 \n", "3 Sausa Pils 5.000000 47969 \n", "4 Cauldron DIPA 7.700000 64883 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "beers=pd.read_csv('beer_reviews.csv')\n", "\n", "beers.head()" ] }, { "cell_type": "code", "execution_count": 9, "id": "f54a599d-9cee-4b1f-9be1-c7bad6129760", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 1586614 entries, 0 to 1586613\n", "Data columns (total 14 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 index 1586614 non-null int64 \n", " 1 brewery_id 1586614 non-null int64 \n", " 2 brewery_name 1586599 non-null object \n", " 3 review_time 1586614 non-null int64 \n", " 4 review_overall 1586614 non-null float64\n", " 5 review_aroma 1586614 non-null float64\n", " 6 review_appearance 1586614 non-null float64\n", " 7 review_profilename 1586266 non-null object \n", " 8 beer_style 1586614 non-null object \n", " 9 review_palate 1586614 non-null float64\n", " 10 review_taste 1586614 non-null float64\n", " 11 beer_name 1586614 non-null object \n", " 12 beer_abv 1518829 non-null float64\n", " 13 beer_beerid 1586614 non-null int64 \n", "dtypes: float64(6), int64(4), object(4)\n", "memory usage: 169.5+ MB\n" ] } ], "source": [ "beers.info()" ] }, { "cell_type": "markdown", "id": "81107f1b-bfd2-40ce-b1dd-a98be02c0e9f", "metadata": {}, "source": [ "### Czyszczenie " ] }, { "cell_type": "code", "execution_count": 49, "id": "a1c7ea8b-b9a4-4098-8e31-32ae0cf22075", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "index 0\n", "brewery_id 0\n", "brewery_name 0\n", "review_time 0\n", "review_overall 0\n", "review_aroma 0\n", "review_appearance 0\n", "review_profilename 0\n", 
"beer_style 0\n", "review_palate 0\n", "review_taste 0\n", "beer_name 0\n", "beer_abv 0\n", "beer_beerid 0\n", "dtype: int64" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "beers.dropna(subset=['brewery_name'], inplace=True)\n", "beers.dropna(subset=['review_profilename'], inplace=True)\n", "beers.dropna(subset=['beer_abv'], inplace=True)\n", "\n", "beers.isnull().sum()" ] }, { "cell_type": "markdown", "id": "7e79db21-7b02-4f76-972f-da3092a0d22c", "metadata": {}, "source": [ "### Normalizacja" ] }, { "cell_type": "code", "execution_count": 22, "id": "e83dd914-b8cf-4e72-a9ea-f4e7f2f63791", "metadata": {}, "outputs": [], "source": [ "scaler = MinMaxScaler()\n", "\n", "beers[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']] = scaler.fit_transform(beers[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']])" ] }, { "cell_type": "markdown", "id": "46e10b78-fb00-4f7e-9c40-7ee08ebeeffe", "metadata": {}, "source": [ "### Podział na podzbiory" ] }, { "cell_type": "code", "execution_count": 24, "id": "e5f6f028-dfcb-4cc5-9bd1-bc9bd51c0a31", "metadata": {}, "outputs": [], "source": [ "beers_train, beers_dev_test = train_test_split(beers, test_size=0.2, random_state=1234)\n", "beers_dev, beers_test = train_test_split(beers_dev_test, test_size=0.5, random_state=1234)" ] }, { "cell_type": "code", "execution_count": 25, "id": "c9feafcc-3591-4d7b-8282-f0f2e2ebd782", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Liczba kolumn w każdym zbiorze: 14 kolumn\n", "Całość: 1518478 rekordów \n", "Train: 1214782 rekordów\n", "Dev: 151848 rekordów\n", "Test: 151848 rekordów\n" ] } ], "source": [ "print(f\"Liczba kolumn w każdym zbiorze: {beers.shape[1]} kolumn\")\n", "print(f\"Całość: {beers.shape[0]} rekordów \")\n", "print(f\"Train: {beers_train.shape[0]} rekordów\")\n", 
"print(f\"Dev: {beers_dev.shape[0]} rekordów\")\n", "print(f\"Test: {beers_test.shape[0]} rekordów\")" ] }, { "cell_type": "markdown", "id": "c811f83b-351e-45c0-bb0f-c1cf68afd669", "metadata": {}, "source": [ "### Przegląd danych" ] }, { "cell_type": "code", "execution_count": 51, "id": "75ffb6e4-3780-4e5f-b151-4b2929237e2a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Suma różnych piw: 44075\n", "Suma różnych styli: 104\n", "Suma różnych browarów: 5155\n" ] } ], "source": [ "print(f\"Suma różnych piw: {beers['beer_name'].nunique()}\")\n", "print(f\"Suma różnych styli: {beers['beer_style'].nunique()}\")\n", "print(f\"Suma różnych browarów: {beers['brewery_name'].nunique()}\")" ] }, { "cell_type": "code", "execution_count": 76, "id": "02571b1b-471c-4339-8422-c8fc27ce6055", "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "<Figure size 640x480 with 1 Axes>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "style_counts = beers['beer_style'].value_counts()\n", "\n", "top_15_styles = style_counts.head(15) \n", "\n", "plt.bar(top_15_styles.index, top_15_styles.values)\n", "plt.xlabel('Styl')\n", "plt.ylabel('Liczba piw')\n", "plt.title('Ilość piw dla naliczniejszych styli')\n", "plt.xticks(rotation=90)\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 91, "id": "0f1a2572-db91-4d8f-ad73-69327e60a606", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>review_overall</th>\n", " <th>Liczba opini</th>\n", " </tr>\n", " <tr>\n", " <th>beer_name</th>\n", " 
<th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>90 Minute IPA</th>\n", " <td>0.829097</td>\n", " <td>3289</td>\n", " </tr>\n", " <tr>\n", " <th>Old Rasputin Russian Imperial Stout</th>\n", " <td>0.834823</td>\n", " <td>3110</td>\n", " </tr>\n", " <tr>\n", " <th>Sierra Nevada Celebration Ale</th>\n", " <td>0.833711</td>\n", " <td>2999</td>\n", " </tr>\n", " <tr>\n", " <th>India Pale Ale</th>\n", " <td>0.770777</td>\n", " <td>2960</td>\n", " </tr>\n", " <tr>\n", " <th>Two Hearted Ale</th>\n", " <td>0.866043</td>\n", " <td>2727</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " review_overall Liczba opini\n", "beer_name \n", "90 Minute IPA 0.829097 3289\n", "Old Rasputin Russian Imperial Stout 0.834823 3110\n", "Sierra Nevada Celebration Ale 0.833711 2999\n", "India Pale Ale 0.770777 2960\n", "Two Hearted Ale 0.866043 2727" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One groupby pass yields both the mean overall rating and the review count per beer name;\n", "# the frame is then ordered by review count, most-reviewed first.\n", "reviews = (\n", "    beers.groupby('beer_name')['review_overall']\n", "    .agg(['mean', 'count'])\n", "    .rename(columns={'mean': 'review_overall', 'count': 'Liczba opini'})\n", "    .sort_values(by=['Liczba opini'], ascending=False)\n", ")\n", "reviews.head()" ] }, { "cell_type": "code", "execution_count": 32, "id": "20444c91-b0be-44c8-ba99-b24290a054a0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>review_overall</th>\n", " <th>review_aroma</th>\n", " <th>review_appearance</th>\n", " <th>review_palate</th>\n", " <th>review_taste</th>\n", " <th>beer_abv</th>\n", " <th>beer_beerid</th>\n", " 
</tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>1518478.000</td>\n", " <td>1518478.000</td>\n", " <td>1518478.000</td>\n", " <td>1518478.000</td>\n", " <td>1518478.000</td>\n", " <td>1518478.000</td>\n", " <td>1518478.000</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>0.765</td>\n", " <td>0.687</td>\n", " <td>0.770</td>\n", " <td>0.688</td>\n", " <td>0.701</td>\n", " <td>0.122</td>\n", " <td>0.277</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>0.143</td>\n", " <td>0.174</td>\n", " <td>0.123</td>\n", " <td>0.170</td>\n", " <td>0.182</td>\n", " <td>0.040</td>\n", " <td>0.282</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>0.000</td>\n", " <td>0.000</td>\n", " <td>0.000</td>\n", " <td>0.000</td>\n", " <td>0.000</td>\n", " <td>0.000</td>\n", " <td>0.000</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>0.700</td>\n", " <td>0.625</td>\n", " <td>0.700</td>\n", " <td>0.625</td>\n", " <td>0.625</td>\n", " <td>0.090</td>\n", " <td>0.021</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>0.800</td>\n", " <td>0.750</td>\n", " <td>0.800</td>\n", " <td>0.750</td>\n", " <td>0.750</td>\n", " <td>0.112</td>\n", " <td>0.166</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>0.900</td>\n", " <td>0.750</td>\n", " <td>0.800</td>\n", " <td>0.750</td>\n", " <td>0.875</td>\n", " <td>0.147</td>\n", " <td>0.507</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>1.000</td>\n", " <td>1.000</td>\n", " <td>1.000</td>\n", " <td>1.000</td>\n", " <td>1.000</td>\n", " <td>1.000</td>\n", " <td>1.000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " review_overall review_aroma review_appearance review_palate \\\n", "count 1518478.000 1518478.000 1518478.000 1518478.000 \n", "mean 0.765 0.687 0.770 0.688 \n", "std 0.143 0.174 0.123 0.170 \n", "min 0.000 0.000 0.000 0.000 \n", "25% 0.700 0.625 0.700 0.625 \n", "50% 0.800 0.750 0.800 0.750 \n", "75% 0.900 0.750 0.800 0.750 \n", "max 1.000 
1.000 1.000 1.000 \n", "\n", " review_taste beer_abv beer_beerid \n", "count 1518478.000 1518478.000 1518478.000 \n", "mean 0.701 0.122 0.277 \n", "std 0.182 0.040 0.282 \n", "min 0.000 0.000 0.000 \n", "25% 0.625 0.090 0.021 \n", "50% 0.750 0.112 0.166 \n", "75% 0.875 0.147 0.507 \n", "max 1.000 1.000 1.000 " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "beers[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']].describe().applymap(lambda x: f\"{x:0.3f}\")" ] }, { "cell_type": "code", "execution_count": 33, "id": "98febfcb-f801-4fed-88c8-2c188cae111c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>review_overall</th>\n", " <th>review_aroma</th>\n", " <th>review_appearance</th>\n", " <th>review_palate</th>\n", " <th>review_taste</th>\n", " <th>beer_abv</th>\n", " <th>beer_beerid</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>1214782.0</td>\n", " <td>1214782.0</td>\n", " <td>1214782.0</td>\n", " <td>1214782.0</td>\n", " <td>1214782.0</td>\n", " <td>1214782.0</td>\n", " <td>1214782.0</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>0.8</td>\n", " <td>0.7</td>\n", " <td>0.8</td>\n", " <td>0.7</td>\n", " <td>0.7</td>\n", " <td>0.1</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>0.1</td>\n", " <td>0.2</td>\n", " <td>0.1</td>\n", " <td>0.2</td>\n", " <td>0.2</td>\n", " <td>0.0</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " 
<td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>0.7</td>\n", " <td>0.6</td>\n", " <td>0.7</td>\n", " <td>0.6</td>\n", " <td>0.6</td>\n", " <td>0.1</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.1</td>\n", " <td>0.2</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>0.9</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.9</td>\n", " <td>0.1</td>\n", " <td>0.5</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " review_overall review_aroma review_appearance review_palate \\\n", "count 1214782.0 1214782.0 1214782.0 1214782.0 \n", "mean 0.8 0.7 0.8 0.7 \n", "std 0.1 0.2 0.1 0.2 \n", "min 0.0 0.0 0.0 0.0 \n", "25% 0.7 0.6 0.7 0.6 \n", "50% 0.8 0.8 0.8 0.8 \n", "75% 0.9 0.8 0.8 0.8 \n", "max 1.0 1.0 1.0 1.0 \n", "\n", " review_taste beer_abv beer_beerid \n", "count 1214782.0 1214782.0 1214782.0 \n", "mean 0.7 0.1 0.3 \n", "std 0.2 0.0 0.3 \n", "min 0.0 0.0 0.0 \n", "25% 0.6 0.1 0.0 \n", "50% 0.8 0.1 0.2 \n", "75% 0.9 0.1 0.5 \n", "max 1.0 1.0 1.0 " ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "beers_train[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']].describe().applymap(lambda x: f\"{x:0.1f}\")" ] }, { "cell_type": "code", "execution_count": 34, "id": "9b675fc2-42d8-4d3a-b6b9-3b35a0b8ab08", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: 
top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>review_overall</th>\n", " <th>review_aroma</th>\n", " <th>review_appearance</th>\n", " <th>review_palate</th>\n", " <th>review_taste</th>\n", " <th>beer_abv</th>\n", " <th>beer_beerid</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>0.8</td>\n", " <td>0.7</td>\n", " <td>0.8</td>\n", " <td>0.7</td>\n", " <td>0.7</td>\n", " <td>0.1</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>0.1</td>\n", " <td>0.2</td>\n", " <td>0.1</td>\n", " <td>0.2</td>\n", " <td>0.2</td>\n", " <td>0.0</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>0.7</td>\n", " <td>0.6</td>\n", " <td>0.7</td>\n", " <td>0.6</td>\n", " <td>0.6</td>\n", " <td>0.1</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.1</td>\n", " <td>0.2</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>0.9</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.9</td>\n", " <td>0.1</td>\n", " <td>0.5</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>0.7</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " review_overall review_aroma review_appearance 
review_palate \\\n", "count 151848.0 151848.0 151848.0 151848.0 \n", "mean 0.8 0.7 0.8 0.7 \n", "std 0.1 0.2 0.1 0.2 \n", "min 0.0 0.0 0.0 0.0 \n", "25% 0.7 0.6 0.7 0.6 \n", "50% 0.8 0.8 0.8 0.8 \n", "75% 0.9 0.8 0.8 0.8 \n", "max 1.0 1.0 1.0 1.0 \n", "\n", " review_taste beer_abv beer_beerid \n", "count 151848.0 151848.0 151848.0 \n", "mean 0.7 0.1 0.3 \n", "std 0.2 0.0 0.3 \n", "min 0.0 0.0 0.0 \n", "25% 0.6 0.1 0.0 \n", "50% 0.8 0.1 0.2 \n", "75% 0.9 0.1 0.5 \n", "max 1.0 0.7 1.0 " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "beers_dev[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']].describe().applymap(lambda x: f\"{x:0.1f}\")" ] }, { "cell_type": "code", "execution_count": 35, "id": "fa018c6f-4093-414a-aef3-48cedb1d82d2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>review_overall</th>\n", " <th>review_aroma</th>\n", " <th>review_appearance</th>\n", " <th>review_palate</th>\n", " <th>review_taste</th>\n", " <th>beer_abv</th>\n", " <th>beer_beerid</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " <td>151848.0</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>0.8</td>\n", " <td>0.7</td>\n", " <td>0.8</td>\n", " <td>0.7</td>\n", " <td>0.7</td>\n", " <td>0.1</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>0.1</td>\n", " <td>0.2</td>\n", " 
<td>0.1</td>\n", " <td>0.2</td>\n", " <td>0.2</td>\n", " <td>0.0</td>\n", " <td>0.3</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>0.7</td>\n", " <td>0.6</td>\n", " <td>0.7</td>\n", " <td>0.6</td>\n", " <td>0.6</td>\n", " <td>0.1</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.1</td>\n", " <td>0.2</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>0.9</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.8</td>\n", " <td>0.9</td>\n", " <td>0.1</td>\n", " <td>0.5</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>0.7</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " review_overall review_aroma review_appearance review_palate \\\n", "count 151848.0 151848.0 151848.0 151848.0 \n", "mean 0.8 0.7 0.8 0.7 \n", "std 0.1 0.2 0.1 0.2 \n", "min 0.0 0.0 0.0 0.0 \n", "25% 0.7 0.6 0.7 0.6 \n", "50% 0.8 0.8 0.8 0.8 \n", "75% 0.9 0.8 0.8 0.8 \n", "max 1.0 1.0 1.0 1.0 \n", "\n", " review_taste beer_abv beer_beerid \n", "count 151848.0 151848.0 151848.0 \n", "mean 0.7 0.1 0.3 \n", "std 0.2 0.0 0.3 \n", "min 0.0 0.0 0.0 \n", "25% 0.6 0.1 0.0 \n", "50% 0.8 0.1 0.2 \n", "75% 0.9 0.1 0.5 \n", "max 1.0 0.7 1.0 " ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "beers_test[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']].describe().applymap(lambda x: f\"{x:0.1f}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "8ce7d432-8d8a-40b0-a247-300f9a39ad44", "metadata": {}, "outputs": [], "source": [] } ], 
"metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 5 }