#!/usr/bin/env python
# coding: utf-8
"""Download, clean, normalize, split and summarize the Beer Advocate reviews.

This script was exported from a Jupyter notebook. It runs top to bottom:

1. download the Kaggle dataset into ``DATA_DIR``,
2. load ``beer_reviews.csv`` from that same directory,
3. drop rows with missing brewery name, reviewer name or ABV,
4. min-max scale the numeric columns,
5. split the data 80/10/10 into train/dev/test,
6. print summary statistics and plot the most common beer styles.

Requires: kaggle, pandas, numpy, scikit-learn, seaborn, matplotlib.
"""

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np  # unused here; kept from the notebook export
import pandas as pd
import seaborn as sns  # unused here; kept from the notebook export
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Kaggle dataset slug and the directory it is unpacked into. The CSV is read
# from the same directory, so the script no longer depends on the current
# working directory happening to be DATA_DIR.
DATASET = 'thedevastator/1-5-million-beer-reviews-from-beer-advocate'
DATA_DIR = Path("/app")
CSV_PATH = DATA_DIR / 'beer_reviews.csv'

# Columns that are scaled and later summarized with describe().
NUMERIC_COLUMNS = [
    'review_overall',
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
    'beer_abv',
    'beer_beerid',
]

# Rows missing any of these columns are discarded during cleaning.
REQUIRED_COLUMNS = ['brewery_name', 'review_profilename', 'beer_abv']

# Seed shared by both splits so the partition is reproducible.
RANDOM_STATE = 1234

# Full option name: pandas warns that shorthand keys such as 'float_format'
# may stop resolving if similarly named options are added later.
pd.set_option('display.float_format', '{:f}'.format)


def _describe_numeric(frame, decimals):
    """Return describe() of the numeric columns as fixed-point strings.

    Args:
        frame: DataFrame containing every column in NUMERIC_COLUMNS.
        decimals: Number of digits after the decimal point.

    Returns:
        DataFrame of strings with the describe() statistics.
    """
    return frame[NUMERIC_COLUMNS].describe().map(
        lambda value: f"{value:0.{decimals}f}"
    )


# ### Download the dataset

api = KaggleApi()
api.authenticate()
api.dataset_download_files(DATASET, path=str(DATA_DIR), unzip=True)

# ## Load the data

beers = pd.read_csv(CSV_PATH)
# Bare expressions only display inside a notebook; print them explicitly so
# the script shows the same information.
print(beers.head())
beers.info()

# ### Cleaning

# A single pass with the combined subset drops the same rows as three
# consecutive single-column dropna calls (default how='any').
beers.dropna(subset=REQUIRED_COLUMNS, inplace=True)
print(beers.isnull().sum())

# ### Normalization

# NOTE(review): the scaler is fitted on the full dataset before the split, so
# dev/test statistics leak into the scaling. If these splits are used for
# model evaluation, fit on the training split only. Also note that
# 'beer_beerid' is an identifier, yet it is scaled like a feature - confirm
# that this is intended.
scaler = MinMaxScaler()
beers[NUMERIC_COLUMNS] = scaler.fit_transform(beers[NUMERIC_COLUMNS])

# ### Split into subsets

# 80% train; the remaining 20% is halved into dev and test (10% each).
beers_train, beers_dev_test = train_test_split(
    beers, test_size=0.2, random_state=RANDOM_STATE
)
beers_dev, beers_test = train_test_split(
    beers_dev_test, test_size=0.5, random_state=RANDOM_STATE
)

print(f"Liczba kolumn w każdym zbiorze: {beers.shape[1]} kolumn")
print(f"Całość: {beers.shape[0]} rekordów ")
print(f"Train: {beers_train.shape[0]} rekordów")
print(f"Dev: {beers_dev.shape[0]} rekordów")
print(f"Test: {beers_test.shape[0]} rekordów")

# ### Data overview

print(f"Suma różnych piw: {beers['beer_name'].nunique()}")
print(f"Suma różnych styli: {beers['beer_style'].nunique()}")
print(f"Suma różnych browarów: {beers['brewery_name'].nunique()}")

# Bar chart of the 15 most frequent beer styles.
style_counts = beers['beer_style'].value_counts()
top_15_styles = style_counts.head(15)
plt.bar(top_15_styles.index, top_15_styles.values)
plt.xlabel('Styl')
plt.ylabel('Liczba piw')
plt.title('Ilość piw dla naliczniejszych styli')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Mean overall rating and number of reviews per beer, most reviewed first.
# One groupby pass computes both aggregates.
reviews = (
    beers.groupby('beer_name')['review_overall']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'review_overall', 'count': 'Liczba opini'})
    .sort_values(by='Liczba opini', ascending=False)
)
print(reviews.head())

# Summary statistics of the numeric columns for the full data and each split
# (three decimals for the full set, one for the splits, as in the notebook).
for label, frame, decimals in (
    ("Całość", beers, 3),
    ("Train", beers_train, 1),
    ("Dev", beers_dev, 1),
    ("Test", beers_test, 1),
):
    print(f"{label}:")
    print(_describe_numeric(frame, decimals))