172 lines
3.7 KiB
Python
172 lines
3.7 KiB
Python
|
#!/usr/bin/env python
|
||
|
# coding: utf-8
|
||
|
|
||
|
# ### Pobieranie zbioru i pakietów
|
||
|
|
||
|
# In[1]:
|
||
|
from kaggle.api.kaggle_api_extended import KaggleApi
|
||
|
|
||
|
# Create the Kaggle API client and authenticate it before any download call.
api = KaggleApi()
api.authenticate()

# Fetch the Beer Advocate reviews dataset into /app and extract the archive.
# NOTE(review): the CSV is later read via a relative path, so this presumably
# expects the script to run with /app as the working directory - confirm.
api.dataset_download_files(
    'thedevastator/1-5-million-beer-reviews-from-beer-advocate',
    path="/app",
    unzip=True,
)
|
||
|
# get_ipython().run_line_magic('pip', 'install kaggle')
|
||
|
# get_ipython().run_line_magic('pip', 'install pandas')
|
||
|
# get_ipython().run_line_magic('pip', 'install numpy')
|
||
|
# get_ipython().run_line_magic('pip', 'install scikit-learn')
|
||
|
# get_ipython().run_line_magic('pip', 'install seaborn')
|
||
|
#
|
||
|
#
|
||
|
# # In[3]:
|
||
|
#
|
||
|
#
|
||
|
# get_ipython().system('kaggle datasets download -d thedevastator/1-5-million-beer-reviews-from-beer-advocate')
|
||
|
#
|
||
|
#
|
||
|
# # In[ ]:
|
||
|
#
|
||
|
#
|
||
|
# get_ipython().system('kaggle datasets download -d')
|
||
|
#
|
||
|
#
|
||
|
# # In[ ]:
|
||
|
#
|
||
|
#
|
||
|
# get_ipython().system('unzip -o 1-5-million-beer-reviews-from-beer-advocate.zip')
|
||
|
|
||
|
|
||
|
# In[43]:
|
||
|
|
||
|
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
import seaborn as sns
|
||
|
from sklearn.model_selection import train_test_split
|
||
|
import matplotlib.pyplot as plt
|
||
|
from sklearn.preprocessing import MinMaxScaler
|
||
|
|
||
|
# Render floats in fixed-point notation (no scientific notation) when pandas
# displays them. The fully qualified option name is used instead of the
# 'float_format' shorthand: shorthand keys are resolved by pattern matching and
# stop working if another option with a similar name is ever introduced.
pd.set_option('display.float_format', '{:f}'.format)
|
||
|
|
||
|
|
||
|
# ## Loading the data

# In[8]:


# Read the reviews CSV into a DataFrame.
# NOTE(review): the path is relative to the current working directory, while
# the download step targets "/app" - confirm the script is run from there.
beers = pd.read_csv(filepath_or_buffer='beer_reviews.csv')

# Preview of the first rows (rendered only in a notebook; a no-op in a script).
beers.head()


# In[9]:


# Print column names, dtypes and non-null counts.
beers.info()
|
||
|
|
||
|
|
||
|
# ### Cleaning

# In[49]:


# Drop, in place, every row that lacks a brewery name, a reviewer profile name
# or an ABV value (a row is removed when any of the three is missing).
beers.dropna(
    subset=['brewery_name', 'review_profilename', 'beer_abv'],
    inplace=True,
)

# Remaining missing values per column (rendered only in a notebook).
beers.isna().sum()
|
||
|
|
||
|
|
||
|
# ### Splitting into subsets

# In[24]:


# Split 80/10/10 into train/dev/test BEFORE fitting the scaler, so that the
# scaling parameters are learned from the training rows only. Fitting the
# scaler on the whole dataset first would leak dev/test statistics (their
# minima and maxima) into the training data.
# NOTE(review): the split is expected to select the same rows as before,
# because the row count and random_state are unchanged - confirm if exact
# reproducibility against earlier runs matters.
beers_train, beers_dev_test = train_test_split(beers, test_size=0.2, random_state=1234)
beers_dev, beers_test = train_test_split(beers_dev_test, test_size=0.5, random_state=1234)

# Work on explicit copies so the column assignments below modify independent
# frames and do not trigger chained-assignment warnings.
beers_train = beers_train.copy()
beers_dev_test = beers_dev_test.copy()
beers_dev = beers_dev.copy()
beers_test = beers_test.copy()


# ### Normalization

# In[22]:


# Numeric columns rescaled with min-max scaling.
numeric_columns = [
    'review_overall',
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
    'beer_abv',
    'beer_beerid',
]

# Learn the per-column minimum and maximum from the training subset only.
scaler = MinMaxScaler()
scaler.fit(beers_train[numeric_columns])

# Apply the train-fitted scaling to every frame, including the full `beers`
# frame that the later overview cells read. Values outside the training range
# may fall slightly outside [0, 1] in the non-training frames.
for frame in (beers, beers_train, beers_dev_test, beers_dev, beers_test):
    frame[numeric_columns] = scaler.transform(frame[numeric_columns])
|
||
|
|
||
|
|
||
|
# In[25]:
|
||
|
|
||
|
|
||
|
# Report the column count and the number of records in the full dataset and
# in each of the train/dev/test subsets.
size_report = (
    f"Liczba kolumn w każdym zbiorze: {beers.shape[1]} kolumn",
    f"Całość: {beers.shape[0]} rekordów ",
    f"Train: {beers_train.shape[0]} rekordów",
    f"Dev: {beers_dev.shape[0]} rekordów",
    f"Test: {beers_test.shape[0]} rekordów",
)
for report_line in size_report:
    print(report_line)
|
||
|
|
||
|
|
||
|
# ### Data overview

# In[51]:


# Print the number of distinct beer names, beer styles and brewery names.
distinct_report = (
    f"Suma różnych piw: {beers['beer_name'].nunique()}",
    f"Suma różnych styli: {beers['beer_style'].nunique()}",
    f"Suma różnych browarów: {beers['brewery_name'].nunique()}",
)
for report_line in distinct_report:
    print(report_line)
|
||
|
|
||
|
|
||
|
# In[76]:
|
||
|
|
||
|
|
||
|
# Number of rows per beer style, most frequent first.
# NOTE(review): value_counts() counts rows of `beers` (presumably one per
# review), while the axis label and title speak of the number of beers -
# confirm whether distinct beers per style were intended.
style_counts = beers['beer_style'].value_counts()

# Keep the 15 most frequent styles.
top_15_styles = style_counts.iloc[:15]

# Bar chart of the counts; style names are rotated vertically to stay legible.
plt.bar(x=top_15_styles.index, height=top_15_styles.values)
plt.title('Ilość piw dla naliczniejszych styli')
plt.xlabel('Styl')
plt.ylabel('Liczba piw')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
|
||
|
|
||
|
|
||
|
# In[91]:
|
||
|
|
||
|
|
||
|
# Per-beer-name aggregates of the overall rating: the mean goes into the
# 'review_overall' column and the number of ratings into 'Liczba opini'.
overall_by_beer = beers.groupby('beer_name')['review_overall']
reviews = overall_by_beer.mean().to_frame()
reviews['Liczba opini'] = overall_by_beer.count()

# Most-reviewed beers first.
reviews = reviews.sort_values(by=['Liczba opini'], ascending=False)

# Preview of the top rows (rendered only in a notebook).
reviews.head()
|
||
|
|
||
|
|
||
|
# In[32]:


# Columns included in the descriptive statistics below.
_summary_columns = [
    'review_overall',
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
    'beer_abv',
    'beer_beerid',
]


def _three_decimals(value):
    """Format a statistic as a fixed-point string with three decimals."""
    return f"{value:0.3f}"


def _one_decimal(value):
    """Format a statistic as a fixed-point string with one decimal."""
    return f"{value:0.1f}"


# NOTE(review): the four expressions below only render their result when run
# as notebook cells; executed as a plain script their values are discarded.

# Summary statistics of the full dataset, three decimals.
beers[_summary_columns].describe().map(_three_decimals)


# In[33]:


# Summary statistics of the training subset, one decimal.
beers_train[_summary_columns].describe().map(_one_decimal)


# In[34]:


# Summary statistics of the dev subset, one decimal.
beers_dev[_summary_columns].describe().map(_one_decimal)


# In[35]:


# Summary statistics of the test subset, one decimal.
beers_test[_summary_columns].describe().map(_one_decimal)
|
||
|
|
||
|
|
||
|
# In[ ]:
|
||
|
|
||
|
|
||
|
|
||
|
|