# Recovered from commit 912a16b959c6df4ec7198c9f8311cbc5d448ac43
# (author szymonj98, Sun Mar 20 17:35:12 2022 +0100, message "zadanie 1"),
# which added this script as the new file kagle.py.
"""Explore the Steam-200k dataset and split it into train/dev/test subsets.

When run as a script this module:

1. downloads and unpacks the ``tamber/steam-video-games`` Kaggle dataset,
2. loads ``Steam-200k.csv`` into a DataFrame,
3. prints a series of descriptive statistics,
4. drops games with too few rows and prints the per-game counts of a
   stratified train/dev/test split.
"""
import os
import sys

import pandas as pd

DOWNLOAD_COMMAND = "kaggle datasets download -d tamber/steam-video-games"
UNZIP_COMMAND = "unzip -o steam-video-games.zip"
CSV_NAME = "Steam-200k.csv"
COLUMN_NAMES = ["userId", "game", "behavior", "hoursPlayed"]


def run_shell(command):
    """Run *command* through the shell and report a non-zero exit status.

    The script keeps going after a failure (as the original did), but the
    failure is now visible on stderr instead of being silently ignored.

    Args:
        command: Shell command line to execute.

    Returns:
        The status value returned by ``os.system``.
    """
    status = os.system(command)
    if status != 0:
        print(
            f"warning: command failed (status {status}): {command}",
            file=sys.stderr,
        )
    return status


def download_dataset():
    """Download the Kaggle archive and unpack it into the working directory."""
    run_shell(DOWNLOAD_COMMAND)
    run_shell(UNZIP_COMMAND)


def load_steam(path=CSV_NAME) -> pd.DataFrame:
    """Read the first four CSV columns and return them as a DataFrame.

    Args:
        path: Location of the header-less CSV file.

    Returns:
        DataFrame with columns ``userId`` (cast to ``str``), ``game``,
        ``behavior`` and ``hoursPlayed``.
    """
    steam = pd.read_csv(path, usecols=[0, 1, 2, 3], names=COLUMN_NAMES)
    # The original evaluated this check and threw the result away; surface it
    # on stderr so stdout stays unchanged. It must run before the cast below,
    # because astype(str) turns missing ids into the literal string "nan".
    if steam.isnull().values.any():
        print("warning: dataset contains missing values", file=sys.stderr)
    steam["userId"] = steam.userId.astype(str)
    return steam


def hours_per_game(rows: pd.DataFrame, how: str) -> pd.DataFrame:
    """Aggregate ``hoursPlayed`` per game, sorted from highest to lowest.

    Only ``hoursPlayed`` is aggregated. Aggregating the whole frame also
    touches the string columns, which raises ``TypeError`` for ``mean`` on
    pandas >= 2.0 and concatenates strings for ``sum``.

    Args:
        rows: Frame containing at least ``game`` and ``hoursPlayed``.
        how: Aggregation name understood by pandas, e.g. ``"mean"``/``"sum"``.

    Returns:
        One-column DataFrame indexed by game.
    """
    per_game = rows.groupby("game")[["hoursPlayed"]].agg(how)
    return per_game.sort_values(by="hoursPlayed", ascending=False)


def drop_rare_games(steam: pd.DataFrame, min_rows: int = 10) -> pd.DataFrame:
    """Keep only games that have strictly more than *min_rows* rows.

    NOTE: the original comment said games with fewer than 10 rows are
    rejected, but the condition also rejects games with exactly 10 rows.
    The code's behaviour is preserved here.
    """
    return steam.groupby("game").filter(lambda group: len(group) > min_rows)


def split_train_dev_test(steam: pd.DataFrame, random_state: int = 1):
    """Split *steam* roughly 8:1:1 into train, dev and test, stratified by game.

    Args:
        steam: Frame with a ``game`` column used for stratification.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train, dev, test)`` of DataFrames.
    """
    # Imported lazily so the statistics helpers above stay importable on
    # machines without scikit-learn.
    from sklearn.model_selection import train_test_split

    # Test and dev each get one tenth of the rows (8:1:1 proportions).
    holdout = len(steam) // 10
    train, test = train_test_split(
        steam, test_size=holdout, random_state=random_state,
        stratify=steam["game"],
    )
    train, dev = train_test_split(
        train, test_size=holdout, random_state=random_state,
        stratify=train["game"],
    )
    return train, dev, test


def print_report(steam: pd.DataFrame) -> None:
    """Print the descriptive statistics of the dataset to stdout."""
    # Each subset is computed once and reused for every statistic below.
    not_play = steam[steam["behavior"] != "play"]
    not_purchase = steam[steam["behavior"] != "purchase"]

    print("Zbior danych:")
    print(steam)

    print("Describe:")
    print(steam.describe(include='all'), "\n\n")

    print("Gracze z najwieksza aktywnoscia:")
    print(steam["userId"].value_counts(), "\n\n")

    bought_per_user = not_play["userId"].value_counts()
    print("Gracze z najwieksza liczba kupionych gier:")
    print(bought_per_user)
    print("Mediana:")
    print(bought_per_user.median(), "\n\n")

    played_per_user = not_purchase["userId"].value_counts()
    print("Gracze ktorzy zagrali w najwieksza liczbe gier:")
    print(played_per_user)
    print("Mediana:")
    print(played_per_user.median(), "\n\n")

    print("Gry:")
    print(steam["game"].value_counts(), "\n\n")

    mean_hours = hours_per_game(not_purchase, "mean")
    print("Sredni czas grania w grania w dana gre")
    print(mean_hours)
    print("Mediana:")
    print(mean_hours.median(), "\n\n")

    bought_per_game = not_play["game"].value_counts()
    print("Najczesciej kupowana gra")
    print(bought_per_game)
    print("Mediana:")
    print(bought_per_game.median(), "\n\n")

    played_per_game = not_purchase["game"].value_counts()
    print("Gra w ktora zagralo najwiecej graczy")
    print(played_per_game)
    print("Mediana:")
    print(played_per_game.median(), "\n\n")

    print("Liczba kupionych gier i liczba gier w ktore gracze zagrali")
    print(steam["behavior"].value_counts(), "\n\n")

    print("Gra z najwieksza liczba godzin dla jednego gracza")
    print(
        not_purchase[["userId", "hoursPlayed", "game"]]
        .sort_values(by="hoursPlayed", ascending=False)
    )
    print("Mediana:")
    print(
        not_purchase["hoursPlayed"].sort_values(ascending=False).median(),
        "\n\n",
    )

    total_hours = hours_per_game(not_purchase, "sum")
    print("Suma rozegranych godzin dla danej gry")
    print(total_hours)
    print("Mediana:")
    print(total_hours.median(), "\n\n")


def main() -> None:
    """Download the data, print the report, then print the split summary."""
    download_dataset()
    steam = load_steam()
    print_report(steam)

    steam = drop_rare_games(steam)
    steam_train, steam_dev, steam_test = split_train_dev_test(steam)

    print("Zbior trenujacy")
    print(steam_train["game"].value_counts(), "\n")

    print("Zbior testujacy")
    print(steam_test["game"].value_counts(), "\n")

    print("Zbior dev")
    print(steam_dev["game"].value_counts(), "\n")


if __name__ == "__main__":
    main()