62 lines
1.7 KiB
Python
62 lines
1.7 KiB
Python
#!/usr/bin/env python3
|
|
|
|
#from kaggle.api.kaggle_api_extended import KaggleApi
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
#from sklearn.model_selection import train_test_split
|
|
|
|
# Remove pandas' display row limit so printed frames/series are shown in full.
pd.set_option("display.max_rows", None)
|
|
|
|
|
|
def column_stat(
    analyzed_set: pd.DataFrame,
    column_name: str,
    *,
    precision: int = 3,
) -> str:
    """Build a text summary of basic statistics for one column.

    Args:
        analyzed_set: Data set to read the column from; indexing it with
            ``column_name`` must yield an object providing ``min``, ``max``,
            ``mean``, ``median`` and ``std``.
        column_name: Name of the column to summarise.
        precision: Number of decimal places the mean and the standard
            deviation are rounded to. Defaults to 3.

    Returns:
        A multi-line string with Polish labels: a header naming the column
        followed by the minimum, maximum, mean, median and standard
        deviation, one per line, each line terminated by a newline. The
        minimum, maximum and median are reported unrounded.
    """
    # Look the column up once instead of once per statistic.
    column = analyzed_set[column_name]

    lines = [
        f"Dla kolumny '{column_name}':",
        f"Minimum: {column.min()}",
        f"Maximum: {column.max()}",
        f"Średnia: {round(column.mean(), precision)}",
        f"Mediana: {column.median()}",
        f"Odchylenie standardowe: {round(column.std(), precision)}",
    ]
    # Every line, including the last one, ends with a newline.
    return "\n".join(lines) + "\n"
|
|
|
|
|
|
# Load the three pre-split subsets from CSV files in the working directory.
d_train = pd.read_csv('d_train.csv', encoding='latin-1')
d_test = pd.read_csv('d_test.csv', encoding='latin-1')
d_dev = pd.read_csv('d_dev.csv', encoding='latin-1')

# Statistics
# Only the three splits are reported; the unsplit data set is not loaded here.
# Each section lists the subset's row count followed by the 'Rating' summary.
# Report order: training, validation, test.
report_sections = []
for heading, subset in (
    ("Wielkość zbioru trenującego", d_train),
    ("Wielkość zbioru walidującego", d_dev),
    ("Wielkość zbioru testowego", d_test),
):
    report_sections.append(
        f"{heading}: {subset.shape[0]}\n"
        "Inne statystyki:\n"
        + column_stat(subset, 'Rating')
        + '\n'
    )
temp = ''.join(report_sections)

# Echo the report to stdout and store the same text in stats.txt.
with open('stats.txt', 'w+', encoding="utf-8") as f:
    print(temp)
    f.write(temp)
|
|
|
|
|