prepare
This commit is contained in:
parent
ea5a76edbc
commit
27c2cb7956
12
Dockerfile
12
Dockerfile
@ -1,12 +0,0 @@
|
|||||||
# Image used by the Jenkins pipelines to download and process the Steam dataset.
#
# The base image is pinned instead of using `latest` so builds are reproducible;
# newer Ubuntu releases also mark the system Python as externally managed
# (PEP 668), which makes a plain `pip3 install` fail.
FROM ubuntu:22.04

WORKDIR /ium

# Update the package index and install in the same layer so a cached, stale
# index is never reused; `-y` keeps the build non-interactive.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y python3-pip unzip \
    && rm -rf /var/lib/apt/lists/*

# World-writable Kaggle config directory (the container may run as a non-root user).
RUN mkdir /.kaggle && chmod o+w /.kaggle

# `scikit-learn` is the real package name; the `sklearn` alias on PyPI is deprecated.
RUN pip3 install --no-cache-dir kaggle pandas numpy scikit-learn tensorflow

COPY ./steam-200k.csv ./biblioteki_dl.py ./
|
|
41
Jenkinsfile
vendored
41
Jenkinsfile
vendored
@ -1,41 +0,0 @@
|
|||||||
// Dataset-creation pipeline: builds the image from the repository Dockerfile,
// runs dataset_download.sh inside it and archives the resulting data.csv.
pipeline {
    parameters {
        string(
            defaultValue: 'szymonjadczak',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        password(
            defaultValue: '',
            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            name: 'KAGGLE_KEY'
        )
        string(
            defaultValue: '',
            description: 'Value for head command',
            name: 'CUTOFF'
        )
    }
    // These variables are exported to every `sh` step below, so the script
    // reads them straight from its environment.
    environment {
        KAGGLE_USERNAME = "${params.KAGGLE_USERNAME}"
        KAGGLE_KEY = "${params.KAGGLE_KEY}"
        CUTOFF = "${params.CUTOFF}"
    }
    agent {
        dockerfile {
            // Tag the built image so other jobs can refer to it as 'ium'.
            additionalBuildArgs '-t ium'
        }
    }
    stages {
        stage('Stage 1') {
            steps {
                echo 'Hello world!!!'
                checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s444386/ium_444386']]])
                sh 'chmod u+x ./dataset_download.sh'
                // Single-quoted on purpose: no Groovy interpolation of the
                // Kaggle secret into the command line. The values reach the
                // script through the environment block above.
                sh './dataset_download.sh'
                archiveArtifacts 'data.csv'
            }
        }
    }
}
|
|
21
Jenkinsfile2
21
Jenkinsfile2
@ -1,21 +0,0 @@
|
|||||||
// Statistics pipeline: copies data.csv from the dataset-creation job and runs
// kagle.py on it inside the 'ium' image.
pipeline {
    agent {
        docker { image 'ium' }
    }
    parameters {
        buildSelector(
            defaultSelector: lastSuccessful(),
            description: 'Which build to use for copying artifacts',
            name: 'BUILD_SELECTOR'
        )
    }
    stages {
        stage('copy artefacts') {
            steps {
                // Honour the BUILD_SELECTOR parameter instead of always taking
                // the last successful build; its default is still lastSuccessful().
                copyArtifacts filter: 'data.csv', fingerprintArtifacts: true, projectName: 's444386-create-dataset', selector: buildParameter('BUILD_SELECTOR')
                // The script is run through the interpreter, so it does not
                // need the executable bit.
                sh 'python3 kagle.py'
            }
        }
    }
}
|
|
121
biblioteki_dl.py
121
biblioteki_dl.py
@ -1,121 +0,0 @@
|
|||||||
import tensorflow as tf
|
|
||||||
import os
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import csv
|
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
|
|
||||||
# Train a small dense network that predicts which game a "play" record belongs
# to, using per-game and per-player counts as features, and write the
# predictions for the test split to results.csv.

# Best-effort refresh of the dataset: failures are not fatal here, the script
# simply goes on to read whatever steam-200k.csv is present in the working
# directory.
os.system("kaggle datasets download -d tamber/steam-video-games")
os.system("unzip -o steam-video-games.zip")

FEATURE_COLUMNS = ['hoursPlayed', 'purchaseCount', 'playCount', 'playerPlayCount', 'playerPurchaseCount']
COUNT_COLUMNS = ['purchaseCount', 'playCount', 'playerPurchaseCount', 'playerPlayCount']

steam = pd.read_csv('steam-200k.csv', usecols=[0, 1, 2, 3], names=['userId', 'game', 'behavior', 'hoursPlayed'])
steam['userId'] = steam.userId.astype(str)

# Counts are taken over the full data set, before any filtering.
is_purchase = steam["behavior"] != "play"
is_play = steam["behavior"] != "purchase"
purchaseCount = steam.loc[is_purchase, "game"].value_counts()
playCount = steam.loc[is_play, "game"].value_counts()
playerPurchaseCount = steam.loc[is_purchase, "userId"].value_counts()
playerPlayCount = steam.loc[is_play, "userId"].value_counts()

# Keep only "play" rows of games that have more than 10 of them.
steam = steam[is_play]
steam = steam.groupby("game").filter(lambda rows: len(rows) > 10).copy()
size = int(len(steam) / 10)

# Aggregate only the numeric column; averaging the whole frame would also try
# to average the string columns.
meanGame = steam.groupby("game")["hoursPlayed"].mean()

# Vectorised lookups instead of a per-row loop; this also keeps float values
# (mean time, play ratio) from being written into integer columns.
steam['meanTime'] = steam['game'].map(meanGame)
steam['purchaseCount'] = steam['game'].map(purchaseCount)
steam['playCount'] = steam['game'].map(playCount)
steam['playerPurchaseCount'] = steam['userId'].map(playerPurchaseCount)
steam['playerPlayCount'] = steam['userId'].map(playerPlayCount)
if steam[COUNT_COLUMNS].isnull().values.any():
    # A missing lookup used to surface as a KeyError in the per-row loop.
    raise KeyError("a played game or player has no matching purchase/play count")
steam['playPercent'] = steam['playerPlayCount'] / steam['playerPurchaseCount']

# Stratified split; the test and dev sets each get a tenth of the filtered rows.
steam_train, steam_test = train_test_split(steam, test_size=size, random_state=1, stratify=steam["game"])
steam_train, steam_dev = train_test_split(steam_train, test_size=size, random_state=1, stratify=steam_train["game"])

print(steam)

# Class ids follow the order in which games first appear in the data.
games = {name: idx for idx, name in enumerate(dict.fromkeys(steam['game']))}
invGames = {idx: name for name, idx in games.items()}

x_train = np.array(steam_train[FEATURE_COLUMNS])
y_train = np.array([games[name] for name in steam_train['game']], dtype=np.float32)

x_test = np.array(steam_test[FEATURE_COLUMNS])
y_test = np.array([games[name] for name in steam_test['game']], dtype=np.float32)

model = tf.keras.models.Sequential([
    # Input shape matches the 2-D feature matrix (one row of features per sample).
    tf.keras.layers.Flatten(input_shape=(len(FEATURE_COLUMNS),)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.01),
    # One output unit per game, so every class id has a matching unit.
    tf.keras.layers.Dense(len(games), activation='softmax'),
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(x_train, y_train, epochs=100)
model.evaluate(x_test, y_test)
prediction = model.predict(x_test)
classes_x = np.argmax(prediction, axis=1)

# Translate class ids back to game names: [predicted, expected] per test row.
rows = [
    [invGames[int(predicted)], invGames[int(expected)]]
    for predicted, expected in zip(classes_x, y_test)
]

with open('results.csv', 'w', encoding='UTF-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["predicted", "expected"])
    writer.writerows(rows)
|
|
||||||
|
|
||||||
|
|
23
data.txt
23
data.txt
@ -1,23 +0,0 @@
|
|||||||
151603712,"The Elder Scrolls V Skyrim",purchase,1.0,0
|
|
||||||
151603712,"The Elder Scrolls V Skyrim",play,273.0,0
|
|
||||||
151603712,"Fallout 4",purchase,1.0,0
|
|
||||||
151603712,"Fallout 4",play,87.0,0
|
|
||||||
151603712,"Spore",purchase,1.0,0
|
|
||||||
151603712,"Spore",play,14.9,0
|
|
||||||
151603712,"Fallout New Vegas",purchase,1.0,0
|
|
||||||
151603712,"Fallout New Vegas",play,12.1,0
|
|
||||||
151603712,"Left 4 Dead 2",purchase,1.0,0
|
|
||||||
151603712,"Left 4 Dead 2",play,8.9,0
|
|
||||||
151603712,"HuniePop",purchase,1.0,0
|
|
||||||
151603712,"HuniePop",play,8.5,0
|
|
||||||
151603712,"Path of Exile",purchase,1.0,0
|
|
||||||
151603712,"Path of Exile",play,8.1,0
|
|
||||||
151603712,"Poly Bridge",purchase,1.0,0
|
|
||||||
151603712,"Poly Bridge",play,7.5,0
|
|
||||||
151603712,"Left 4 Dead",purchase,1.0,0
|
|
||||||
151603712,"Left 4 Dead",play,3.3,0
|
|
||||||
151603712,"Team Fortress 2",purchase,1.0,0
|
|
||||||
151603712,"Team Fortress 2",play,2.8,0
|
|
||||||
151603712,"Tomb Raider",purchase,1.0,0
|
|
||||||
151603712,"Tomb Raider",play,2.5,0
|
|
||||||
151603712,"The Banner Saga",purchase,1.0,0
|
|
@ -1,6 +0,0 @@
|
|||||||
|
|
||||||
# Download the Steam data set and write its first $CUTOFF lines to data.csv.
# (Presumably this is dataset_download.sh, run by the Jenkins pipeline - confirm.)

# Download and unpack are best-effort: a failure here does not abort the
# script, so an already present steam-200k.csv can still be used below.
kaggle datasets download -d tamber/steam-video-games
unzip -o steam-video-games.zip

# An empty or unset CUTOFF would make `head -n` fail, so fall back to copying
# the whole file in that case. The variable is quoted to avoid word splitting.
if [ -n "${CUTOFF:-}" ]; then
    head -n "$CUTOFF" steam-200k.csv > data.csv
else
    cp steam-200k.csv data.csv
fi
|
|
||||||
|
|
@ -1 +0,0 @@
|
|||||||
# Append the line count of data.csv to number_of_lines.txt.
wc -l data.csv >> number_of_lines.txt
|
|
79
kagle.py
79
kagle.py
@ -1,79 +0,0 @@
|
|||||||
import os
|
|
||||||
import pandas as pd
|
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
|
|
||||||
# Print descriptive statistics for the Steam data set stored in data.csv and
# split it into train/test/dev subsets.

steam = pd.read_csv('data.csv', usecols=[0, 1, 2, 3], names=['userId', 'game', 'behavior', 'hoursPlayed'])
steam['userId'] = steam.userId.astype(str)

# Row subsets reused by the reports below.
purchases = steam[steam["behavior"] != "play"]
plays = steam[steam["behavior"] != "purchase"]

print("Zbior danych:")
print(steam)

print("Describe:")
print(steam.describe(include='all'), "\n\n")

print("Gracze z najwieksza aktywnoscia:")
print(steam["userId"].value_counts(), "\n\n")

purchases_per_user = purchases["userId"].value_counts()
print("Gracze z najwieksza liczba kupionych gier:")
print(purchases_per_user)
print("Mediana:")
print(purchases_per_user.median(), "\n\n")

plays_per_user = plays["userId"].value_counts()
print("Gracze ktorzy zagrali w najwieksza liczbe gier:")
print(plays_per_user)
print("Mediana:")
print(plays_per_user.median(), "\n\n")

print("Gry:")
print(steam["game"].value_counts(), "\n\n")

# Aggregate only the numeric hoursPlayed column; aggregating the whole frame
# would also try to average/sum the string columns.
mean_hours = plays.groupby("game")[["hoursPlayed"]].mean().sort_values(by="hoursPlayed", ascending=False)
print("Sredni czas grania w grania w dana gre")
print(mean_hours)
print("Mediana:")
print(mean_hours.median(), "\n\n")

purchases_per_game = purchases["game"].value_counts()
print("Najczesciej kupowana gra")
print(purchases_per_game)
print("Mediana:")
print(purchases_per_game.median(), "\n\n")

plays_per_game = plays["game"].value_counts()
print("Gra w ktora zagralo najwiecej graczy")
print(plays_per_game)
print("Mediana:")
print(plays_per_game.median(), "\n\n")

print("Liczba kupionych gier i liczba gier w ktore gracze zagrali")
print(steam["behavior"].value_counts(), "\n\n")

print("Gra z najwieksza liczba godzin dla jednego gracza")
print(plays[["userId", "hoursPlayed", "game"]].sort_values(by="hoursPlayed", ascending=False))
print("Mediana:")
print(plays["hoursPlayed"].median(), "\n\n")

total_hours = plays.groupby("game")[["hoursPlayed"]].sum().sort_values(by="hoursPlayed", ascending=False)
print("Suma rozegranych godzin dla danej gry")
print(total_hours)
print("Mediana:")
print(total_hours.median(), "\n\n")

# Discard games that have 10 or fewer rows.
steam = steam.groupby("game").filter(lambda rows: len(rows) > 10)
# Size of the test and dev sets; train/test/dev proportions are 8:1:1.
size = int(len(steam) / 10)

steam_train, steam_test = train_test_split(steam, test_size=size, random_state=1, stratify=steam["game"])
steam_train, steam_dev = train_test_split(steam_train, test_size=size, random_state=1, stratify=steam_train["game"])

print("Zbior trenujacy")
print(steam_train["game"].value_counts(), "\n")

print("Zbior testujacy")
print(steam_test["game"].value_counts(), "\n")

print("Zbior dev")
print(steam_dev["game"].value_counts(), "\n")
|
|
200000
steam-200k.csv
200000
steam-200k.csv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user