2022-03-20 15:07:09 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
|
|
import pandas as pd
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
|
|
# Lift pandas' row-display limit so printed DataFrames/Series are not truncated.
pd.set_option("display.max_rows", None)
|
|
|
|
|
|
|
|
|
|
|
|
def column_stat(analyzed_set, column_name):
    """Print and return basic summary statistics of one column.

    Args:
        analyzed_set: Object indexable by ``column_name`` whose result
            provides ``min``, ``max``, ``mean``, ``median`` and ``std``
            (the script passes pandas DataFrames).
        column_name: Name of the column to summarise.

    Returns:
        dict with the keys ``min``, ``max``, ``mean``, ``median`` and
        ``std``. ``mean`` and ``std`` are rounded to 3 decimal places,
        exactly as they are printed.
    """
    # Look the column up once instead of once per statistic.
    column = analyzed_set[column_name]

    stats = {
        "min": column.min(),
        "max": column.max(),
        "mean": round(column.mean(), 3),
        "median": column.median(),
        "std": round(column.std(), 3),
    }

    # The labels below are user-facing output and are kept verbatim.
    print(f"Dla kolumny '{column_name}':")
    print(f"Minimum: {stats['min']}")
    print(f"Maximum: {stats['max']}")
    print(f"Średnia: {stats['mean']}")
    print(f"Mediana: {stats['median']}")
    print(f"Odchylenie standardowe: {stats['std']}")

    return stats
|
|
|
|
|
|
|
|
|
|
|
|
# Data download
# The Kaggle download below is left disabled; uncomment the three lines to
# fetch and unzip the dataset (authenticate() presumably needs Kaggle API
# credentials to be configured - verify before enabling).
#api = KaggleApi()
#api.authenticate()
#api.dataset_download_files('arushchillar/disneyland-reviews', unzip=True)

# Load the reviews CSV from the working directory; it is decoded as Latin-1.
reviews_csv = 'DisneylandReviews.csv'
disney = pd.read_csv(reviews_csv, encoding='latin-1')
|
|
|
|
|
|
|
|
# No artifacts that would need cleaning were noticed in the file.

# Rescale the 'Rating' column from the range [1, 5] to the range [0, 1]:
# subtract the lower bound (1), then divide by the width of the range (4).
disney['Rating'] = disney['Rating'].sub(1).div(4)

# Normalise the 'Review_Text' column to lowercase.
review_text = disney['Review_Text']
disney['Review_Text'] = review_text.str.lower()
|
|
|
|
|
|
|
|
|
|
|
|
# Split into subsets: d_train, d_test, d_dev.
# First hold out 20% of the rows, then cut that hold-out in half, which gives
# 80% train / 10% dev / 10% test. Both splits are stratified on 'Branch' and
# use a fixed random_state so they are repeatable.
d_train, holdout = train_test_split(
    disney,
    test_size=0.2,
    random_state=1,
    stratify=disney["Branch"],
)
d_dev, d_test = train_test_split(
    holdout,
    test_size=0.5,
    random_state=1,
    stratify=holdout["Branch"],
)
|
2022-03-20 15:07:09 +01:00
|
|
|
|
2022-03-21 11:26:03 +01:00
|
|
|
# Write each subset to its own CSV file (without the index column).
for csv_name, frame in (
    ('d_train.csv', d_train),
    ('d_test.csv', d_test),
    ('d_dev.csv', d_dev),
):
    frame.to_csv(csv_name, index=False)
|
|
|
|
|
2022-03-20 15:07:09 +01:00
|
|
|
# Statistics: row count and 'Rating' summary for the full set and each subset.
# One loop replaces four copy-pasted stanzas; the labels are runtime output
# and are kept exactly as originally printed, in the original order.
for label, subset in (
    ("Wielkość całego zbioru", disney),
    ("Wielkość zbioru trenującego", d_train),
    ("Wielkość zbioru walidującego", d_dev),
    ("Wielkość zbioru testowego", d_test),
):
    print(f"{label}: {subset.shape[0]}\n"
          f"Inne statystyki:")
    column_stat(subset, 'Rating')
    print('')  # blank separator line between reports
|
|
|
|
|
|
|
|
# Rating distribution for each branch.
# Plotting stays best-effort: a failure is reported and the script carries on.
try:
    disney.hist(column='Rating', by='Branch', legend=True)
    plt.suptitle('Rozkład ocen w całym zbiorze')
    plt.show()
except Exception as err:
    # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
    # and hid the reason for the failure.
    print("Error drawing hist plot (Powinno działać w Pycharmie)")
    print(f"{type(err).__name__}: {err}")
|