diff --git a/train_evaluate/Dockerfile b/train_evaluate/Dockerfile
index 1dda6ab..c06dfb0 100644
--- a/train_evaluate/Dockerfile
+++ b/train_evaluate/Dockerfile
@@ -17,11 +17,11 @@ RUN pip3 install --user wget
 
 WORKDIR /app
 
-COPY ./../train.py ./
-COPY ./../evaluate.py ./
-COPY ./../sacred1.py ./
-COPY ./../sacred2.py ./
-COPY ./../skrypt.sh ./
-COPY ./../zadanie2.py ./
-COPY ./../zadanie5.py ./
+COPY ./train.py ./
+COPY ./evaluate.py ./
+COPY ./sacred1.py ./
+COPY ./sacred2.py ./
+COPY ./skrypt.sh ./
+COPY ./zadanie2.py ./
+COPY ./zadanie5.py ./
 
diff --git a/train_evaluate/evaluate.py b/train_evaluate/evaluate.py
new file mode 100644
index 0000000..e0369f7
--- /dev/null
+++ b/train_evaluate/evaluate.py
@@ -0,0 +1 @@
+print('test')
\ No newline at end of file
diff --git a/train_evaluate/sacred1.py b/train_evaluate/sacred1.py
new file mode 100644
index 0000000..1140565
--- /dev/null
+++ b/train_evaluate/sacred1.py
@@ -0,0 +1,106 @@
+#! /usr/bin/python3
+"""Sacred experiment: train a small Keras network on vgsales.csv.
+
+The network learns to tell whether a game's publisher is Nintendo from the
+remaining min-max scaled columns. Runs are recorded by a FileStorageObserver
+under ium_s434695/my_runs.
+"""
+from tensorflow.keras.models import Sequential, load_model
+from tensorflow.keras.layers import Dense
+from sklearn.metrics import accuracy_score, classification_report
+import pandas as pd
+from sklearn.model_selection import train_test_split
+import wget
+import numpy as np
+import requests
+from sacred.observers import FileStorageObserver
+from sacred import Experiment
+from datetime import datetime
+import os
+
+ex = Experiment("ium_s434695", interactive=False)
+
+ex.observers.append(FileStorageObserver('ium_s434695/my_runs'))
+
+
+# Default train/test fractions; Sacred injects them into captured functions.
+@ex.config
+def my_config():
+    train_size_param = 0.8
+    test_size_param = 0.2
+
+
+@ex.capture
+def prepare_model(train_size_param, test_size_param, _run):
+    """Download the data, train the network and return its test report.
+
+    Args:
+        train_size_param: fraction of rows used for training.
+        test_size_param: fraction of rows held out for testing.
+        _run: the current Sacred run, injected by ``@ex.capture``.
+
+    Returns:
+        The string produced by ``classification_report`` on the test split.
+
+    Raises:
+        requests.HTTPError: if the CSV download returns an error status.
+    """
+    _run.info["prepare_model_ts"] = str(datetime.now())
+
+    url = 'https://git.wmi.amu.edu.pl/s434695/ium_434695/raw/commit/2301fb86e434734376f73503307a8f3255a75cc6/vgsales.csv'
+    r = requests.get(url, allow_redirects=True)
+    # Stop on an HTTP error instead of saving the error page as CSV.
+    r.raise_for_status()
+
+    # Context manager so the file is flushed and closed before pandas reads it.
+    with open('vgsales.csv', 'wb') as csv_file:
+        csv_file.write(r.content)
+    df = pd.read_csv('vgsales.csv')
+
+    def regression_model():
+        # Uses x_train from the enclosing scope, so call it only after the
+        # train/test split below has run.
+        model = Sequential()
+        model.add(Dense(32,activation = "relu", input_shape = (x_train.shape[1],)))
+        model.add(Dense(64,activation = "relu"))
+        model.add(Dense(1,activation = "relu"))
+
+        model.compile(optimizer = "adam", loss = "mean_squared_error")
+        return model
+
+    # Binary target: 1 when the publisher is Nintendo, 0 otherwise.
+    df['Nintendo'] = df['Publisher'].apply(lambda x: 1 if x=='Nintendo' else 0)
+    df = df.drop(['Rank','Name','Platform','Year','Genre','Publisher'],axis = 1)
+
+    y = df.Nintendo
+
+    # Min-max scale each remaining column.
+    df=((df-df.min())/(df.max()-df.min()))
+
+    x = df.drop(['Nintendo'],axis = 1)
+
+    # Split with the configured fractions (previously hard-coded to 0.2/0.8,
+    # which silently ignored config overrides).
+    x_train, x_test, y_train, y_test = train_test_split(
+        x, y, test_size=test_size_param, train_size=train_size_param,
+        random_state=21)
+
+    model = regression_model()
+    model.fit(x_train, y_train, epochs = 600, verbose = 1)
+
+    # Round the network output to whole numbers to obtain class labels.
+    y_pred = np.around(model.predict(x_test), decimals=0)
+
+    return classification_report(y_test, y_pred)
+
+
+@ex.main
+def my_main(train_size_param, test_size_param):
+    """Entry point of the experiment: print the classification report."""
+    print(prepare_model())
+
+
+r = ex.run()
+# NOTE(review): nothing in this script writes this file - confirm it is
+# produced elsewhere before the run.
+ex.add_artifact("vgsales_model/saved_model/saved_model.pb")
\ No newline at end of file
diff --git a/train_evaluate/sacred2.py b/train_evaluate/sacred2.py
new file mode 100644
index 0000000..36c4d51
--- /dev/null
+++ b/train_evaluate/sacred2.py
@@ -0,0 +1,112 @@
+#! /usr/bin/python3
+"""Sacred experiment: train a small Keras network on vgsales.csv.
+
+The network learns to tell whether a game's publisher is Nintendo from the
+remaining min-max scaled columns. Runs are recorded by a MongoObserver in
+the 'sacred' database.
+"""
+from tensorflow.keras.models import Sequential, load_model
+from tensorflow.keras.layers import Dense
+from sklearn.metrics import accuracy_score, classification_report
+import pandas as pd
+from sklearn.model_selection import train_test_split
+import wget
+import numpy as np
+import requests
+from sacred.observers import FileStorageObserver
+from sacred import Experiment
+from datetime import datetime
+import os
+from sacred.observers import MongoObserver
+
+ex = Experiment("ium_s434695", interactive=False)
+
+# MONGO_URL overrides the connection string; the default is the original value.
+# NOTE(review): the default embeds credentials in source; consider removing it.
+mongo_url = os.environ.get(
+    'MONGO_URL',
+    'mongodb://mongo_user:mongo_password_IUM_2021@172.17.0.1:27017')
+ex.observers.append(MongoObserver(url=mongo_url, db_name='sacred'))
+
+
+# Default train/test fractions; Sacred injects them into captured functions.
+@ex.config
+def my_config():
+    train_size_param = 0.8
+    test_size_param = 0.2
+
+
+@ex.capture
+def prepare_model(train_size_param, test_size_param, _run):
+    """Download the data, train the network and return its test report.
+
+    Args:
+        train_size_param: fraction of rows used for training.
+        test_size_param: fraction of rows held out for testing.
+        _run: the current Sacred run, injected by ``@ex.capture``.
+
+    Returns:
+        The string produced by ``classification_report`` on the test split.
+
+    Raises:
+        requests.HTTPError: if the CSV download returns an error status.
+    """
+    _run.info["prepare_model_ts"] = str(datetime.now())
+
+    url = 'https://git.wmi.amu.edu.pl/s434695/ium_434695/raw/commit/2301fb86e434734376f73503307a8f3255a75cc6/vgsales.csv'
+    r = requests.get(url, allow_redirects=True)
+    # Stop on an HTTP error instead of saving the error page as CSV.
+    r.raise_for_status()
+
+    # Context manager so the file is flushed and closed before pandas reads it.
+    with open('vgsales.csv', 'wb') as csv_file:
+        csv_file.write(r.content)
+    df = pd.read_csv('vgsales.csv')
+
+    def regression_model():
+        # Uses x_train from the enclosing scope, so call it only after the
+        # train/test split below has run.
+        model = Sequential()
+        model.add(Dense(32,activation = "relu", input_shape = (x_train.shape[1],)))
+        model.add(Dense(64,activation = "relu"))
+        model.add(Dense(1,activation = "relu"))
+
+        model.compile(optimizer = "adam", loss = "mean_squared_error")
+        return model
+
+    # Binary target: 1 when the publisher is Nintendo, 0 otherwise.
+    df['Nintendo'] = df['Publisher'].apply(lambda x: 1 if x=='Nintendo' else 0)
+    df = df.drop(['Rank','Name','Platform','Year','Genre','Publisher'],axis = 1)
+
+    y = df.Nintendo
+
+    # Min-max scale each remaining column.
+    df=((df-df.min())/(df.max()-df.min()))
+
+    x = df.drop(['Nintendo'],axis = 1)
+
+    # Split with the configured fractions (previously hard-coded to 0.2/0.8,
+    # which silently ignored config overrides).
+    x_train, x_test, y_train, y_test = train_test_split(
+        x, y, test_size=test_size_param, train_size=train_size_param,
+        random_state=21)
+
+    model = regression_model()
+    model.fit(x_train, y_train, epochs = 600, verbose = 1)
+
+    # Round the network output to whole numbers to obtain class labels.
+    y_pred = np.around(model.predict(x_test), decimals=0)
+
+    return classification_report(y_test, y_pred)
+
+
+@ex.main
+def my_main(train_size_param, test_size_param):
+    """Entry point of the experiment: print the classification report."""
+    print(prepare_model())
+
+
+r = ex.run()
+# NOTE(review): nothing in this script writes this file - confirm it is
+# produced elsewhere before the run.
+ex.add_artifact("vgsales_model/saved_model/saved_model.pb")
\ No newline at end of file
diff --git a/train_evaluate/skrypt.sh b/train_evaluate/skrypt.sh
new file mode 100644
index 0000000..ef29548
--- /dev/null
+++ b/train_evaluate/skrypt.sh
@@ -0,0 +1,20 @@
+# Download the .csv file
+curl -OL https://git.wmi.amu.edu.pl/s434695/ium_434695/raw/branch/master/vgsales.csv
+
+
+# Split the csv file into test/dev/train: keep the header aside, shuffle the rows
+head -n 1 vgsales.csv > header.csv
+tail -n +2 vgsales.csv | shuf > data.shuffled
+
+# First 3320 shuffled rows -> test, next 3320 -> dev, the rest -> train
+head -n 3320 data.shuffled > games.data.test
+head -n 6640 data.shuffled | tail -n 3320 > games.data.dev
+tail -n +6641 data.shuffled > games.data.train
+
+cat header.csv games.data.test > test.csv
+cat header.csv games.data.dev > dev.csv
+cat header.csv games.data.train > train.csv
+
+# Truncate the data to the first $1 shuffled rows (quoted to avoid word splitting)
+head -n "$1" data.shuffled > obcietedane.data
+cat header.csv obcietedane.data > obcietedane.csv
diff --git a/train_evaluate/train.py b/train_evaluate/train.py
new file mode 100644
index 0000000..e6c2479
--- /dev/null
+++ b/train_evaluate/train.py
@@ -0,0 +1,50 @@
+#! /usr/bin/python3
+"""Train a small Keras network on vgsales.csv and save it as 'model1'."""
+from tensorflow.keras.models import Sequential, load_model
+from tensorflow.keras.layers import Dense
+from sklearn.metrics import accuracy_score, classification_report
+import pandas as pd
+from sklearn.model_selection import train_test_split
+import numpy as np
+import requests
+
+url = 'https://git.wmi.amu.edu.pl/s434695/ium_434695/raw/commit/2301fb86e434734376f73503307a8f3255a75cc6/vgsales.csv'
+r = requests.get(url, allow_redirects=True)
+# Stop on an HTTP error instead of saving the error page as CSV.
+r.raise_for_status()
+
+# Context manager so the file is flushed and closed before pandas reads it.
+with open('vgsales.csv', 'wb') as csv_file:
+    csv_file.write(r.content)
+df = pd.read_csv('vgsales.csv')
+
+
+def regression_model():
+    """Build and compile the network (input width taken from global x_train)."""
+    model = Sequential()
+    model.add(Dense(16,activation = "relu", input_shape = (x_train.shape[1],)))
+    model.add(Dense(32,activation = "relu"))
+    model.add(Dense(1,activation = "relu"))
+
+    model.compile(optimizer = "adam", loss = "mean_squared_error")
+    return model
+
+
+# Binary target: 1 when the publisher is Nintendo, 0 otherwise.
+df['Nintendo'] = df['Publisher'].apply(lambda x: 1 if x=='Nintendo' else 0)
+df = df.drop(['Rank','Name','Platform','Year','Genre','Publisher'],axis = 1)
+
+y = df.Nintendo
+
+# Min-max scale each remaining column.
+df=((df-df.min())/(df.max()-df.min()))
+
+x = df.drop(['Nintendo'],axis = 1)
+
+x_train, x_test, y_train, y_test = train_test_split(x,y , test_size=0.2,train_size=0.8, random_state=21)
+
+model = regression_model()
+model.fit(x_train, y_train, epochs = 600, verbose = 1)
+
+y_pred = model.predict(x_test)
+model.save('model1')
\ No newline at end of file
diff --git a/train_evaluate/zadanie2.py b/train_evaluate/zadanie2.py
new file mode 100755
index 0000000..978499e
--- /dev/null
+++ b/train_evaluate/zadanie2.py
@@ -0,0 +1,42 @@
+#! /usr/bin/python3
+"""Exploratory look at vgsales.csv: summaries, plots and a train/test split."""
+
+import requests
+import pandas as pd
+import seaborn as sns
+from sklearn.model_selection import train_test_split
+
+url = 'https://git.wmi.amu.edu.pl/s434695/ium_434695/raw/commit/2301fb86e434734376f73503307a8f3255a75cc6/vgsales.csv'
+r = requests.get(url, allow_redirects=True)
+# Stop on an HTTP error instead of saving the error page as CSV.
+r.raise_for_status()
+
+# Context manager so the file is flushed and closed before pandas reads it.
+with open('vgsales.csv', 'wb') as csv_file:
+    csv_file.write(r.content)
+
+vgsales = pd.read_csv('vgsales.csv')
+
+# NOTE(review): the results of the bare expressions below are discarded when
+# this runs as a script; wrap them in print() if the output is wanted.
+vgsales.describe(include='all')
+
+vgsales["Publisher"].value_counts()
+
+vgsales["Platform"].value_counts()
+
+vgsales["Platform"].value_counts().plot(kind="bar")
+
+vgsales[["Platform","JP_Sales"]].groupby("Platform").mean().plot(kind="bar")
+
+sns.set_theme()
+sns.relplot(data=vgsales, x="JP_Sales", y="NA_Sales", hue="Genre")
+
+# 60% of the rows go to the test part (test_size = 0.6).
+vgsales_train, vgsales_test = train_test_split(vgsales, test_size = 0.6, random_state = 1)
+vgsales_train["Platform"].value_counts()
+
+vgsales_test["Platform"].value_counts()
+
+print(vgsales_train["Platform"])
+
diff --git a/train_evaluate/zadanie5.py b/train_evaluate/zadanie5.py
new file mode 100755
index 0000000..aec3b70
--- /dev/null
+++ b/train_evaluate/zadanie5.py
@@ -0,0 +1,56 @@
+#! /usr/bin/python3
+"""Train a Keras network on vgsales.csv, print metrics, write preds.csv."""
+from tensorflow.keras.models import Sequential, load_model
+from tensorflow.keras.layers import Dense
+from sklearn.metrics import accuracy_score, classification_report
+import pandas as pd
+from sklearn.model_selection import train_test_split
+import numpy as np
+import requests
+
+url = 'https://git.wmi.amu.edu.pl/s434695/ium_434695/raw/commit/2301fb86e434734376f73503307a8f3255a75cc6/vgsales.csv'
+r = requests.get(url, allow_redirects=True)
+# Stop on an HTTP error instead of saving the error page as CSV.
+r.raise_for_status()
+
+# Context manager so the file is flushed and closed before pandas reads it.
+with open('vgsales.csv', 'wb') as csv_file:
+    csv_file.write(r.content)
+df = pd.read_csv('vgsales.csv')
+
+
+def regression_model():
+    """Build and compile the network (input width taken from global x_train)."""
+    model = Sequential()
+    model.add(Dense(16,activation = "relu", input_shape = (x_train.shape[1],)))
+    model.add(Dense(32,activation = "relu"))
+    model.add(Dense(1,activation = "relu"))
+
+    model.compile(optimizer = "adam", loss = "mean_squared_error")
+    return model
+
+
+# Binary target: 1 when the publisher is Nintendo, 0 otherwise.
+df['Nintendo'] = df['Publisher'].apply(lambda x: 1 if x=='Nintendo' else 0)
+df = df.drop(['Rank','Name','Platform','Year','Genre','Publisher'],axis = 1)
+
+y = df.Nintendo
+
+# Min-max scale each remaining column.
+df=((df-df.min())/(df.max()-df.min()))
+
+x = df.drop(['Nintendo'],axis = 1)
+
+x_train, x_test, y_train, y_test = train_test_split(x,y , test_size=0.2,train_size=0.8, random_state=21)
+
+model = regression_model()
+model.fit(x_train, y_train, epochs = 600, verbose = 1)
+
+# Round the network output to whole numbers to obtain class labels.
+y_pred = np.around(model.predict(x_test), decimals=0)
+
+print(accuracy_score(y_test, y_pred))
+
+print(classification_report(y_test,y_pred))
+
+pd.DataFrame(y_pred).to_csv("preds.csv")