diff --git a/Jenkinsfile_train b/Jenkinsfile_train index d6c6d1e..c6ff88d 100644 --- a/Jenkinsfile_train +++ b/Jenkinsfile_train @@ -13,13 +13,17 @@ pipeline { ) } stages { - stage('Script'){ - steps { - copyArtifacts filter: '*', projectName: 's444018-create-dataset' - sh 'python3 ./biblioteka_DL/dllib.py $EPOCHS' - archiveArtifacts artifacts: 'model.pkl', followSymlinks: false - } + stage('Copy artifacts'){ + steps { + copyArtifacts filter: '*', projectName: 's444018-create-dataset' + } } + stage('Train model with sacred') { + steps { + sh 'python3 ./biblioteka_DL/dllib.py $EPOCHS' + archiveArtifacts artifacts: 'model.pkl', s444018_sacred_FileObserver/**/*.*, result.csv, followSymlinks: false + } + } } post { success { diff --git a/biblioteka_DL/dllib.py b/biblioteka_DL/dllib.py index 11f7499..e65f95b 100644 --- a/biblioteka_DL/dllib.py +++ b/biblioteka_DL/dllib.py @@ -6,7 +6,20 @@ import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score +from sklearn.metrics import accuracy_score, mean_squared_error +from sacred.observers import MongoObserver, FileStorageObserver +from sacred import Experiment + + +ex = Experiment(save_git_info=False) +ex.observers.append(MongoObserver(url='mongodb://admin:IUM_2021@172.17.0.1:27017', + db_name='sacred')) + +ex.observers.append(FileStorageObserver('s444018_sacred_FileObserver')) + +@ex.config +def my_config(): + epochs = "1000" def drop_relevant_columns(imbd_data): @@ -75,63 +88,87 @@ class LinearRegressionModel(torch.nn.Module): return y_pred -df = prepare_dataset() -data_train, data_test = train_test_split(df, random_state=1, shuffle=True) +@ex.automain +def my_main(epochs, _run): + # num_epochs = 1000 + # num_epochs = int(sys.argv[1]) -X_train = pd.DataFrame(data_train["Meta_score"], dtype=np.float64) -X_train = X_train.to_numpy() + # number of epochs is parametrized + try: + num_epochs = int(epochs) + except Exception as e: + print(e) + print("Setting default epochs value to 1000.") + num_epochs = 1000 -y_train = pd.DataFrame(data_train["Gross"], dtype=np.float64) -y_train = y_train.to_numpy() + df = prepare_dataset() + data_train, data_test = train_test_split(df, random_state=1, shuffle=True) + X_train = pd.DataFrame(data_train["Meta_score"], dtype=np.float64) + X_train = X_train.to_numpy() + y_train = pd.DataFrame(data_train["Gross"], dtype=np.float64) + y_train = y_train.to_numpy() + X_train = X_train.reshape(-1, 1) + y_train = y_train.reshape(-1, 1) + X_train = torch.from_numpy(X_train.astype(np.float32)).view(-1, 1) + y_train = torch.from_numpy(y_train.astype(np.float32)).view(-1, 1) + input_size = 1 + output_size = 1 + model = nn.Linear(input_size, output_size) + learning_rate = 0.0001 + l = nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) -X_train = X_train.reshape(-1, 1) -y_train = y_train.reshape(-1, 1) + for epoch in range(num_epochs): + # forward feed + y_pred = model(X_train.requires_grad_()) -X_train = torch.from_numpy(X_train.astype(np.float32)).view(-1, 1) -y_train = torch.from_numpy(y_train.astype(np.float32)).view(-1, 1) + # calculate the loss + loss = l(y_pred, y_train) -input_size = 1 -output_size = 1 + # backward propagation: calculate gradients + loss.backward() -model = nn.Linear(input_size, output_size) + # update the weights + optimizer.step() -learning_rate = 0.0001 -l = nn.MSELoss() -optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) + # clear out the gradients from the last step loss.backward() + optimizer.zero_grad() + if epoch % 100 == 0: + print('epoch {}, loss {}'.format(epoch, loss.item())) -# num_epochs = 1000 -num_epochs = int(sys.argv[1]) + X_test = pd.DataFrame(data_test["Meta_score"], dtype=np.float64) + X_test = X_test.to_numpy() + X_test = X_test.reshape(-1, 1) + X_test = torch.from_numpy(X_test.astype(np.float32)).view(-1, 1) -for epoch in range(num_epochs): - # forward feed - y_pred = model(X_train.requires_grad_()) + predictedSet = model(X_test).detach().numpy() - # calculate the loss - loss = l(y_pred, y_train) + gross_test_g = pd.DataFrame(data_test["Gross"], dtype=np.float64) + gross_test_g = gross_test_g.to_numpy() + gross_test_g = gross_test_g.reshape(-1, 1) - # backward propagation: calculate gradients - loss.backward() + pred = pd.DataFrame(predictedSet) + pred.to_csv('result.csv') + # save model + torch.save(model, "model.pkl") - # update the weights - optimizer.step() + predicted = [] + expected = [] - # clear out the gradients from the last step loss.backward() - optimizer.zero_grad() + for i in range(0, len(X_test)): + predicted.append(np.argmax(model(X_test[i]).detach().numpy(), axis=0)) + expected.append(gross_test_g[i]) - if epoch % 100 == 0: - print('epoch {}, loss {}'.format(epoch, loss.item())) + for i in range(0, len(expected)): + expected[i] = expected[i][0] -predicted = model(X_train).detach().numpy() + rmse = mean_squared_error(gross_test_g, pred, squared=False) + mse = mean_squared_error(gross_test_g, pred) -pred = pd.DataFrame(predicted) -pred.to_csv('result.csv') + _run.log_scalar("RMSE", rmse) + _run.log_scalar("MSE", mse) -# save model -torch.save(model, "model.pkl") +# ex.run() +ex.add_artifact("model.pkl") -# plt.scatter(X_train.detach().numpy() , y_train.detach().numpy()) -# plt.plot(X_train.detach().numpy() , predicted , "red") -# plt.xlabel("Meta_score") -# plt.ylabel("Gross") -# plt.show() diff --git a/lab6/Dockerfile b/lab6/Dockerfile new file mode 100644 index 0000000..faa5ab9 --- /dev/null +++ b/lab6/Dockerfile @@ -0,0 +1,31 @@ +FROM ubuntu:latest + +RUN apt-get update +RUN apt-get install -y python3-pip +RUN apt-get install -y unzip + +RUN pip3 install kaggle +RUN pip3 install pandas +RUN pip3 install sklearn +RUN pip3 install numpy + +RUN pip3 install matplotlib +RUN pip3 install torch + +ARG CUTOFF +ARG KAGGLE_USERNAME +ARG KAGGLE_KEY +ENV CUTOFF=${CUTOFF} +ENV KAGGLE_USERNAME=${KAGGLE_USERNAME} +ENV KAGGLE_KEY=${KAGGLE_KEY} + +WORKDIR /app + +COPY lab2/download.sh . +COPY biblioteka_DL/dllib.py . +COPY biblioteka_DL/evaluate.py . +COPY biblioteka_DL/imdb_top_1000.csv . + +RUN chmod +x ./download.sh +RUN ./download.sh +#CMD python3 ./dllib.py diff --git a/lab6/Jenkinsfile b/lab6/Jenkinsfile new file mode 100644 index 0000000..1712583 --- /dev/null +++ b/lab6/Jenkinsfile @@ -0,0 +1,32 @@ +pipeline { + agent { + dockerfile { + additionalBuildArgs "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} --build-arg CUTOFF=${params.CUTOFF} -t docker_image" + } + } + parameters { + string( + defaultValue: 'szymonparafinski', + description: 'Kaggle username', + name: 'KAGGLE_USERNAME', + trim: false + ) + password( + defaultValue: '', + description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials', + name: 'KAGGLE_KEY' + ) + string( + defaultValue: '100', + description: 'Cutoff lines', + name: 'CUTOFF' + ) + } + stages { + stage('Script'){ + steps { + archiveArtifacts artifacts: 'data_test.csv, data_train.csv, data_dev.csv', followSymlinks: false + } + } + } +} \ No newline at end of file diff --git a/lab6/Jenkinsfile_eval b/lab6/Jenkinsfile_eval new file mode 100644 index 0000000..727faf9 --- /dev/null +++ b/lab6/Jenkinsfile_eval @@ -0,0 +1,46 @@ +pipeline { + agent { + docker { + image 'docker_image' + } + } + parameters { + gitParameter branchFilter: 'origin/(.*)', defaultValue: 'master', name: 'BRANCH', type: 'PT_BRANCH' + buildSelector( + defaultSelector: lastSuccessful(), + description: 'Which build to use for copying artifacts', + name: 'BUILD_SELECTOR' + ) + } + stages { + stage('Script'){ + steps { + copyArtifacts filter: '*', projectName: 's444018-create-dataset', selector: buildParameter('BUILD_SELECTOR') + copyArtifacts filter: '*', projectName: 's444018-training/${BRANCH}', selector: buildParameter('BUILD_SELECTOR') + copyArtifacts filter: '*', projectName: 's444018-evaluation/master', selector: buildParameter('BUILD_SELECTOR'), optional: true + sh 'python3 ./biblioteka_DL/evaluate.py' + archiveArtifacts artifacts: 'mae.txt, rmse.txt, mse.txt, evr.txt, metrics.png', followSymlinks: false + script { + MAE = sh ( + script: 'tail -1 mae.txt', + returnStdout: true + ).trim() + } + } + } + } + post { + success { + emailext body: "SUCCESS, MAE = ${MAE}", subject: 's444018-evaluation', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms' + } + failure { + emailext body: "FAILURE, MAE = ${MAE}", subject: 's444018-evaluation', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms' + } + unstable { + emailext body: "UNSTABLE, MAE = ${MAE}", subject: 's444018-evaluation', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms' + } + changed { + emailext body: "CHANGED, MAE = ${MAE}", subject: 's444018-evaluation', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms' + } + } +} diff --git a/lab6/Jenkinsfile_train b/lab6/Jenkinsfile_train new file mode 100644 index 0000000..d6c6d1e --- /dev/null +++ b/lab6/Jenkinsfile_train @@ -0,0 +1,38 @@ +pipeline { + agent { + dockerfile { + additionalBuildArgs "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} --build-arg CUTOFF=${params.CUTOFF} -t docker_image" + } + } + parameters { + string( + defaultValue: '1000', + description: 'Number of epochs', + name: 'EPOCHS', + trim: false + ) + } + stages { + stage('Script'){ + steps { + copyArtifacts filter: '*', projectName: 's444018-create-dataset' + sh 'python3 ./biblioteka_DL/dllib.py $EPOCHS' + archiveArtifacts artifacts: 'model.pkl', followSymlinks: false + } + } + } + post { + success { + emailext body: 'SUCCESS', subject: 's444018-training', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms' + } + failure { + emailext body: 'FAILURE', subject: 's444018-training', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms' + } + unstable { + emailext body: 'UNSTABLE', subject: 's444018-training', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms' + } + changed { + emailext body: 'CHANGED', subject: 's444018-training', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms' + } + } +} diff --git a/lab6/biblioteka_DL/dllib.py b/lab6/biblioteka_DL/dllib.py new file mode 100644 index 0000000..11f7499 --- /dev/null +++ b/lab6/biblioteka_DL/dllib.py @@ -0,0 +1,137 @@ +import sys + +import torch +import torch.nn as nn +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score + + +def drop_relevant_columns(imbd_data): + imbd_data.drop(columns=["Poster_Link"], inplace=True) + imbd_data.drop(columns=["Overview"], inplace=True) + imbd_data.drop(columns=["Certificate"], inplace=True) + return imbd_data + + +def lowercase_columns_names(imbd_data): + imbd_data["Series_Title"] = imbd_data["Series_Title"].str.lower() + imbd_data["Genre"] = imbd_data["Genre"].str.lower() + imbd_data["Director"] = imbd_data["Director"].str.lower() + imbd_data["Star1"] = imbd_data["Star1"].str.lower() + imbd_data["Star2"] = imbd_data["Star2"].str.lower() + imbd_data["Star3"] = imbd_data["Star3"].str.lower() + imbd_data["Star4"] = imbd_data["Star4"].str.lower() + return imbd_data + + +def data_to_numeric(imbd_data): + imbd_data = imbd_data.replace(np.nan, '', regex=True) + imbd_data["Gross"] = imbd_data["Gross"].str.replace(',', '') + imbd_data["Gross"] = pd.to_numeric(imbd_data["Gross"], errors='coerce') + imbd_data["Runtime"] = imbd_data["Runtime"].str.replace(' min', '') + imbd_data["Runtime"] = pd.to_numeric(imbd_data["Runtime"], errors='coerce') + imbd_data["IMDB_Rating"] = pd.to_numeric(imbd_data["IMDB_Rating"], errors='coerce') + imbd_data["Meta_score"] = pd.to_numeric(imbd_data["Meta_score"], errors='coerce') + imbd_data["Released_Year"] = pd.to_numeric(imbd_data["Released_Year"], errors='coerce') + imbd_data = imbd_data.dropna() + imbd_data = imbd_data.reset_index() + imbd_data.drop(columns=["index"], inplace=True) + return imbd_data + + +def create_train_dev_test(imbd_data): + data_train, data_test = train_test_split(imbd_data, test_size=230, random_state=1, shuffle=True) + data_test, data_dev = train_test_split(data_test, test_size=115, random_state=1, shuffle=True) + data_test.to_csv("data_test.csv", encoding="utf-8", index=False) + data_dev.to_csv("data_dev.csv", encoding="utf-8", index=False) + data_train.to_csv("data_train.csv", encoding="utf-8", index=False) + + +def normalize_gross(imbd_data): + imbd_data[["Gross"]] = imbd_data[["Gross"]] / 10000000 + return imbd_data + + +def prepare_dataset(): + df = pd.read_csv('biblioteka_DL/imdb_top_1000.csv') + df = drop_relevant_columns(df) + df_lowercase = lowercase_columns_names(df) + df = data_to_numeric(df_lowercase) + df = normalize_gross(df) + return df + + +class LinearRegressionModel(torch.nn.Module): + + def __init__(self): + super(LinearRegressionModel, self).__init__() + self.linear = torch.nn.Linear(1, 1) # One in and one out + + def forward(self, x): + y_pred = self.linear(x) + return y_pred + + +df = prepare_dataset() +data_train, data_test = train_test_split(df, random_state=1, shuffle=True) + +X_train = pd.DataFrame(data_train["Meta_score"], dtype=np.float64) +X_train = X_train.to_numpy() + +y_train = pd.DataFrame(data_train["Gross"], dtype=np.float64) +y_train = y_train.to_numpy() + +X_train = X_train.reshape(-1, 1) +y_train = y_train.reshape(-1, 1) + +X_train = torch.from_numpy(X_train.astype(np.float32)).view(-1, 1) +y_train = torch.from_numpy(y_train.astype(np.float32)).view(-1, 1) + +input_size = 1 +output_size = 1 + +model = nn.Linear(input_size, output_size) + +learning_rate = 0.0001 +l = nn.MSELoss() +optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) + + +# num_epochs = 1000 +num_epochs = int(sys.argv[1]) + +for epoch in range(num_epochs): + # forward feed + y_pred = model(X_train.requires_grad_()) + + # calculate the loss + loss = l(y_pred, y_train) + + # backward propagation: calculate gradients + loss.backward() + + # update the weights + optimizer.step() + + # clear out the gradients from the last step loss.backward() + optimizer.zero_grad() + + if epoch % 100 == 0: + print('epoch {}, loss {}'.format(epoch, loss.item())) + +predicted = model(X_train).detach().numpy() + +pred = pd.DataFrame(predicted) +pred.to_csv('result.csv') + +# save model +torch.save(model, "model.pkl") + +# plt.scatter(X_train.detach().numpy() , y_train.detach().numpy()) +# plt.plot(X_train.detach().numpy() , predicted , "red") +# plt.xlabel("Meta_score") +# plt.ylabel("Gross") +# plt.show() diff --git a/lab6/biblioteka_DL/evaluate.py b/lab6/biblioteka_DL/evaluate.py new file mode 100644 index 0000000..34b6acd --- /dev/null +++ b/lab6/biblioteka_DL/evaluate.py @@ -0,0 +1,164 @@ +import sys +import torch +import torch.nn as nn +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, precision_recall_fscore_support, explained_variance_score, \ + mean_squared_error, mean_absolute_error + + +def drop_relevant_columns(imbd_data): + imbd_data.drop(columns=["Poster_Link"], inplace=True) + imbd_data.drop(columns=["Overview"], inplace=True) + imbd_data.drop(columns=["Certificate"], inplace=True) + return imbd_data + + +def lowercase_columns_names(imbd_data): + imbd_data["Series_Title"] = imbd_data["Series_Title"].str.lower() + imbd_data["Genre"] = imbd_data["Genre"].str.lower() + imbd_data["Director"] = imbd_data["Director"].str.lower() + imbd_data["Star1"] = imbd_data["Star1"].str.lower() + imbd_data["Star2"] = imbd_data["Star2"].str.lower() + imbd_data["Star3"] = imbd_data["Star3"].str.lower() + imbd_data["Star4"] = imbd_data["Star4"].str.lower() + return imbd_data + + +def data_to_numeric(imbd_data): + imbd_data = imbd_data.replace(np.nan, '', regex=True) + imbd_data["Gross"] = imbd_data["Gross"].str.replace(',', '') + imbd_data["Gross"] = pd.to_numeric(imbd_data["Gross"], errors='coerce') + imbd_data["Runtime"] = imbd_data["Runtime"].str.replace(' min', '') + imbd_data["Runtime"] = pd.to_numeric(imbd_data["Runtime"], errors='coerce') + imbd_data["IMDB_Rating"] = pd.to_numeric(imbd_data["IMDB_Rating"], errors='coerce') + imbd_data["Meta_score"] = pd.to_numeric(imbd_data["Meta_score"], errors='coerce') + imbd_data["Released_Year"] = pd.to_numeric(imbd_data["Released_Year"], errors='coerce') + imbd_data = imbd_data.dropna() + imbd_data = imbd_data.reset_index() + imbd_data.drop(columns=["index"], inplace=True) + return imbd_data + + +def create_train_dev_test(imbd_data): + data_train, data_test = train_test_split(imbd_data, test_size=230, random_state=1, shuffle=True) + data_test, data_dev = train_test_split(data_test, test_size=115, random_state=1, shuffle=True) + data_test.to_csv("data_test.csv", encoding="utf-8", index=False) + data_dev.to_csv("data_dev.csv", encoding="utf-8", index=False) + data_train.to_csv("data_train.csv", encoding="utf-8", index=False) + + +def normalize_gross(imbd_data): + imbd_data[["Gross"]] = imbd_data[["Gross"]] / 10000000 + return imbd_data + + +def prepare_dataset(): + df = pd.read_csv('biblioteka_DL/imdb_top_1000.csv') + df = drop_relevant_columns(df) + df_lowercase = lowercase_columns_names(df) + df = data_to_numeric(df_lowercase) + df = normalize_gross(df) + return df + + +class LinearRegressionModel(torch.nn.Module): + + def __init__(self): + super(LinearRegressionModel, self).__init__() + self.linear = torch.nn.Linear(1, 1) # One in and one out + + def forward(self, x): + y_pred = self.linear(x) + return y_pred + + +df = prepare_dataset() +data_train, data_test = train_test_split(df, random_state=1, shuffle=True) + +X_train = pd.DataFrame(data_train["Meta_score"], dtype=np.float64) +X_train = X_train.to_numpy() + +y_train = pd.DataFrame(data_train["Gross"], dtype=np.float64) +y_train = y_train.to_numpy() + +X_train = X_train.reshape(-1, 1) +y_train = y_train.reshape(-1, 1) + +X_train = torch.from_numpy(X_train.astype(np.float32)).view(-1, 1) +y_train = torch.from_numpy(y_train.astype(np.float32)).view(-1, 1) + +input_size = 1 +output_size = 1 + +model = torch.load("model.pkl") + +X_test = pd.DataFrame(data_test["Meta_score"], dtype=np.float64) +X_test = X_test.to_numpy() +X_test = X_test.reshape(-1, 1) +X_test = torch.from_numpy(X_test.astype(np.float32)).view(-1, 1) + +predicted = model(X_test).detach().numpy() + +gross_test_g = pd.DataFrame(data_test["Gross"], dtype=np.float64) +gross_test_g = gross_test_g.to_numpy() +gross_test_g = gross_test_g.reshape(-1, 1) + +pred = pd.DataFrame(predicted) + +predicted = [] +expected = [] + +for i in range(0, len(X_test)): + predicted.append(np.argmax(model(X_test[i]).detach().numpy(), axis=0)) + expected.append(gross_test_g[i]) + +for i in range(0, len(expected)): + expected[i] = expected[i][0] + +rmse = mean_squared_error(gross_test_g, pred, squared=False) +mse = mean_squared_error(gross_test_g, pred) +evr = explained_variance_score(gross_test_g, pred) +mae = mean_absolute_error(gross_test_g, pred) + +res = f"Explained variance regression score: {evr}, RMSE: {rmse}, MSE: {mse}, MAE: {mae}" + +with open('mae.txt', 'a+') as f: + f.write(str(mae) + '\n') + +with open('rmse.txt', 'a+') as f: + f.write(str(rmse) + '\n') + +with open('mse.txt', 'a+') as f: + f.write(str(mse) + '\n') + +with open('evr.txt', 'a+') as f: + f.write(str(evr) + '\n') + +with open('mae.txt') as f: + mae_val = [float(line) for line in f if line] + builds = list(range(1, len(mae_val) + 1)) + +with open('rmse.txt') as f: + rmse_val = [float(line) for line in f if line] + +with open('mse.txt') as f: + mse_val = [float(line) for line in f if line] + +with open('evr.txt') as f: + evr_val = [float(line) for line in f if line] + + +ax = plt.gca() +ax.set_title('Build') + +mae_line = ax.plot(mae_val, color='blue', label="MAE") +rmse_line = ax.plot(rmse_val, color='green', label="RMSE") +mse_line = ax.plot(mse_val, color='red', label="MSE") +evr_line = ax.plot(evr_val, color='orange', label="EVR") +ax.legend(bbox_to_anchor=(0., 1.01, 1.0, .1), loc=3, + ncol=2, mode="expand", borderaxespad=0.) +plt.show() +plt.savefig('metrics.png')