diff --git a/Dockerfile b/Dockerfile
index 5a33781..7f87558 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,5 +7,6 @@ RUN mkdir /.kaggle && chmod o+w /.kaggle
 RUN pip3 install pandas
 RUN pip3 install numpy
 RUN pip3 install sklearn
+RUN pip3 install tensorflow
 COPY ./steam-200k.csv ./
-COPY ./kagle.py ./
\ No newline at end of file
+COPY ./biblioteki_dl.py ./
\ No newline at end of file
diff --git a/biblioteki_dl.py b/biblioteki_dl.py
new file mode 100644
index 0000000..6411384
--- /dev/null
+++ b/biblioteki_dl.py
@@ -0,0 +1,137 @@
+"""Train a Keras classifier that predicts a Steam game from play statistics.
+
+The script tries to refresh the Kaggle "steam-video-games" dataset, derives
+per-game and per-player counts for every play record, fits a small dense
+network on them and writes the predicted and expected game titles of the test
+split to ``results.csv``.
+"""
+import csv
+import subprocess
+import sys
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from sklearn.model_selection import train_test_split
+
+# Model inputs, in the column order fed to the network.
+FEATURE_COLUMNS = [
+    "hoursPlayed",
+    "purchaseCount",
+    "playCount",
+    "playerPlayCount",
+    "playerPurchaseCount",
+]
+# Lookup-derived columns that must be filled in for every row.
+COUNT_COLUMNS = FEATURE_COLUMNS[1:]
+
+
+def run_best_effort(command):
+    """Run *command* (an argv list); warn and return False if it fails.
+
+    Failures are reported on stderr instead of raised so that the script can
+    continue with a ``steam-200k.csv`` that is already present.
+    """
+    try:
+        completed = subprocess.run(command, check=False)
+    except OSError as err:
+        print(f"warning: could not run {command[0]}: {err}", file=sys.stderr)
+        return False
+    if completed.returncode != 0:
+        print(
+            f"warning: {' '.join(command)} exited with {completed.returncode}",
+            file=sys.stderr,
+        )
+        return False
+    return True
+
+
+# Refresh the dataset. The unzip step runs even if the download failed.
+run_best_effort(
+    ["kaggle", "datasets", "download", "-d", "tamber/steam-video-games"]
+)
+run_best_effort(["unzip", "-o", "steam-video-games.zip"])
+
+steam = pd.read_csv(
+    "steam-200k.csv",
+    usecols=[0, 1, 2, 3],
+    names=["userId", "game", "behavior", "hoursPlayed"],
+)
+steam["userId"] = steam["userId"].astype(str)
+
+# Rows that are not "play" count as purchases; rows that are not "purchase"
+# count as plays.
+purchases = steam[steam["behavior"] != "play"]
+plays = steam[steam["behavior"] != "purchase"]
+purchase_count = purchases["game"].value_counts()
+play_count = plays["game"].value_counts()
+player_purchase_count = purchases["userId"].value_counts()
+player_play_count = plays["userId"].value_counts()
+
+# Keep only the play records of games that have more than 10 of them.
+steam = plays.groupby("game").filter(lambda rows: len(rows) > 10).copy()
+size = len(steam) // 10
+
+# Average only the numeric column: a frame-wide mean() raises on the string
+# columns in pandas >= 2.0.
+mean_hours = steam.groupby("game")["hoursPlayed"].mean()
+
+steam["meanTime"] = steam["game"].map(mean_hours)
+steam["purchaseCount"] = steam["game"].map(purchase_count)
+steam["playCount"] = steam["game"].map(play_count)
+steam["playerPurchaseCount"] = steam["userId"].map(player_purchase_count)
+steam["playerPlayCount"] = steam["userId"].map(player_play_count)
+steam["playPercent"] = steam["playerPlayCount"] / steam["playerPurchaseCount"]
+
+# map() leaves NaN where a key has no count; fail here instead of training
+# on NaN features.
+columns_with_gaps = [c for c in COUNT_COLUMNS if steam[c].isnull().any()]
+if columns_with_gaps:
+    raise KeyError(f"missing counts in columns: {columns_with_gaps}")
+
+steam_train, steam_test = train_test_split(
+    steam, test_size=size, random_state=1, stratify=steam["game"]
+)
+# steam_dev is split off from the training rows but not used further here.
+steam_train, steam_dev = train_test_split(
+    steam_train, test_size=size, random_state=1, stratify=steam_train["game"]
+)
+
+print(steam)
+
+# Number the games in order of first appearance; the number is the class label.
+games = {
+    title: label for label, title in enumerate(dict.fromkeys(steam["game"]))
+}
+inv_games = {label: title for title, label in games.items()}
+
+x_train = steam_train[FEATURE_COLUMNS].to_numpy(dtype=np.float32)
+y_train = steam_train["game"].map(games).to_numpy(dtype=np.int64)
+x_test = steam_test[FEATURE_COLUMNS].to_numpy(dtype=np.float32)
+y_test = steam_test["game"].map(games).to_numpy(dtype=np.int64)
+
+# One input per feature and one softmax unit per known game, so every
+# predicted class index has a title in inv_games.
+model = tf.keras.models.Sequential([
+    tf.keras.Input(shape=(len(FEATURE_COLUMNS),)),
+    tf.keras.layers.Dense(256, activation="relu"),
+    tf.keras.layers.Dropout(0.01),
+    tf.keras.layers.Dense(len(games), activation="softmax"),
+])
+model.compile(
+    optimizer="adam",
+    loss="sparse_categorical_crossentropy",
+    metrics=["accuracy"],
+)
+
+model.fit(x_train, y_train, epochs=100)
+model.evaluate(x_test, y_test)
+predicted_labels = np.argmax(model.predict(x_test), axis=1)
+
+with open("results.csv", "w", encoding="UTF-8", newline="") as results_file:
+    writer = csv.writer(results_file)
+    writer.writerow(["predicted", "expected"])
+    writer.writerows(
+        [inv_games[int(predicted)], inv_games[int(expected)]]
+        for predicted, expected in zip(predicted_labels, y_test)
+    )