move sol to lab7

This commit is contained in:
Szymon Parafiński 2022-05-16 01:13:58 +02:00
parent 47d9776529
commit fa51d4a87a
7 changed files with 516 additions and 0 deletions

33
lab7/Dockerfile Normal file
View File

@ -0,0 +1,33 @@
# Image shared by the lab7 Jenkins jobs: Python 3 with the ML stack, plus the
# dataset fetched at build time by download.sh.
FROM ubuntu:latest

# Refresh the package index and install in the same layer, so a cached
# "apt-get update" layer is never combined with a later install request.
RUN apt-get update \
    && apt-get install -y python3-pip unzip \
    && rm -rf /var/lib/apt/lists/*

# "sklearn" on PyPI is a deprecated placeholder project; the library that
# provides "import sklearn" is published as "scikit-learn".
RUN pip3 install --no-cache-dir \
    kaggle \
    pandas \
    scikit-learn \
    numpy \
    matplotlib \
    torch \
    sacred \
    pymongo

ARG CUTOFF
ARG KAGGLE_USERNAME
ARG KAGGLE_KEY
# NOTE(review): copying the build args into ENV keeps the Kaggle key in the
# final image and its history. Kept as-is because download.sh presumably reads
# these variables; consider a BuildKit secret mount instead.
ENV CUTOFF=${CUTOFF}
ENV KAGGLE_USERNAME=${KAGGLE_USERNAME}
ENV KAGGLE_KEY=${KAGGLE_KEY}

WORKDIR /app
COPY lab2/download.sh .
COPY biblioteka_DL/dllib.py .
COPY biblioteka_DL/evaluate.py .
COPY biblioteka_DL/imdb_top_1000.csv .

# Fetch the dataset while building the image.
RUN chmod +x ./download.sh && ./download.sh
#CMD python3 ./dllib.py

32
lab7/Jenkinsfile vendored Normal file
View File

@ -0,0 +1,32 @@
// Dataset job: builds the agent image from the repository Dockerfile
// (tagged docker_image) and archives the three dataset CSV files.
pipeline {
    agent {
        dockerfile {
            // Job parameters are forwarded to "docker build" as build args.
            additionalBuildArgs "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} --build-arg CUTOFF=${params.CUTOFF} -t docker_image"
        }
    }

    parameters {
        string(
            name: 'KAGGLE_USERNAME',
            description: 'Kaggle username',
            defaultValue: 'szymonparafinski',
            trim: false
        )
        password(
            name: 'KAGGLE_KEY',
            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            defaultValue: ''
        )
        string(
            name: 'CUTOFF',
            description: 'Cutoff lines',
            defaultValue: '100'
        )
    }

    stages {
        stage('Script') {
            steps {
                // Publish the dataset splits for the downstream jobs.
                archiveArtifacts artifacts: 'data_test.csv, data_train.csv, data_dev.csv', followSymlinks: false
            }
        }
    }
}

46
lab7/Jenkinsfile_eval Normal file
View File

@ -0,0 +1,46 @@
// Evaluation job: pulls the dataset, the trained model and (optionally) the
// previous metric history, runs evaluate.py, archives the metrics and mails
// the latest MAE.
pipeline {
    agent {
        docker {
            image 'docker_image'
        }
    }

    parameters {
        gitParameter branchFilter: 'origin/(.*)', defaultValue: 'master', name: 'BRANCH', type: 'PT_BRANCH'
        buildSelector(
            defaultSelector: lastSuccessful(),
            description: 'Which build to use for copying artifacts',
            name: 'BUILD_SELECTOR'
        )
    }

    stages {
        stage('Script') {
            steps {
                copyArtifacts filter: '*', projectName: 's444018-create-dataset', selector: buildParameter('BUILD_SELECTOR')
                copyArtifacts filter: '*', projectName: 's444018-training/${BRANCH}', selector: buildParameter('BUILD_SELECTOR')
                // Earlier metric files are optional: the first evaluation has none to copy.
                copyArtifacts filter: '*', projectName: 's444018-evaluation/master', selector: buildParameter('BUILD_SELECTOR'), optional: true
                sh 'python3 ./biblioteka_DL/evaluate.py'
                archiveArtifacts artifacts: 'mae.txt, rmse.txt, mse.txt, evr.txt, metrics.png', followSymlinks: false
                script {
                    // Stored in env so the post section can read it even when an
                    // earlier step failed and this assignment never ran.
                    env.MAE = sh(
                        script: 'tail -1 mae.txt',
                        returnStdout: true
                    ).trim()
                }
            }
        }
    }

    post {
        // env.MAE is null when the build failed before the metric was read;
        // fall back to 'n/a' instead of throwing inside the notification.
        success {
            emailext body: "SUCCESS, MAE = ${env.MAE ?: 'n/a'}", subject: 's444018-evaluation', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
        }
        failure {
            emailext body: "FAILURE, MAE = ${env.MAE ?: 'n/a'}", subject: 's444018-evaluation', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
        }
        unstable {
            emailext body: "UNSTABLE, MAE = ${env.MAE ?: 'n/a'}", subject: 's444018-evaluation', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
        }
        changed {
            emailext body: "CHANGED, MAE = ${env.MAE ?: 'n/a'}", subject: 's444018-evaluation', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
        }
    }
}

24
lab7/Jenkinsfile_stats Normal file
View File

@ -0,0 +1,24 @@
// Statistics job: copies the dataset artifacts, runs lab2/stats.sh and
// archives the resulting stats.txt.
pipeline {
    agent {
        docker {
            image 'docker_image'
        }
    }

    parameters {
        buildSelector(
            name: 'BUILD_SELECTOR',
            description: 'Which build to use for copying artifacts',
            defaultSelector: lastSuccessful()
        )
    }

    stages {
        stage("Script") {
            steps {
                copyArtifacts fingerprintArtifacts: true, projectName: 's444018-create-dataset', selector: buildParameter('BUILD_SELECTOR')
                // Make the script executable before running it.
                sh 'chmod +x ./lab2/stats.sh'
                sh "./lab2/stats.sh"
                archiveArtifacts 'stats.txt'
            }
        }
    }
}

42
lab7/Jenkinsfile_train Normal file
View File

@ -0,0 +1,42 @@
// Training job: builds the agent image, copies the dataset artifacts, trains
// the model through Sacred and archives the model, observer files and predictions.
pipeline {
    agent {
        dockerfile {
            // NOTE(review): KAGGLE_USERNAME, KAGGLE_KEY and CUTOFF are read from
            // params but this job only declares EPOCHS - confirm where these
            // values are expected to come from.
            additionalBuildArgs "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} --build-arg CUTOFF=${params.CUTOFF} -t docker_image"
        }
    }

    parameters {
        string(
            name: 'EPOCHS',
            description: 'Number of epochs',
            defaultValue: '1000',
            trim: false
        )
    }

    stages {
        stage('Copy artifacts') {
            steps {
                copyArtifacts filter: '*', projectName: 's444018-create-dataset'
            }
        }
        stage('Train model with sacred') {
            steps {
                // $EPOCHS is expanded by the shell and passed as a Sacred config override.
                sh 'python3 ./biblioteka_DL/dllib.py with "epochs=$EPOCHS"'
                archiveArtifacts artifacts: 'model.pkl, s444018_sacred_FileObserver/**/*.*, result.csv', followSymlinks: false
            }
        }
    }

    post {
        // One notification per build-result condition.
        success {
            emailext body: 'SUCCESS', subject: 's444018-training', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
        }
        failure {
            emailext body: 'FAILURE', subject: 's444018-training', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
        }
        unstable {
            emailext body: 'UNSTABLE', subject: 's444018-training', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
        }
        changed {
            emailext body: 'CHANGED', subject: 's444018-training', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
        }
    }
}

175
lab7/biblioteka_DL/dllib.py Normal file
View File

@ -0,0 +1,175 @@
import sys
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sacred.observers import MongoObserver, FileStorageObserver
from sacred import Experiment
# Sacred experiment; git metadata collection is switched off.
ex = Experiment(save_git_info=False)

# Each run is recorded twice: in the "sacred" MongoDB database and in a local
# directory of observer files.
mongo_observer = MongoObserver(
    url='mongodb://admin:IUM_2021@172.17.0.1:27017',
    db_name='sacred',
)
file_observer = FileStorageObserver('s444018_sacred_FileObserver')
ex.observers.append(mongo_observer)
ex.observers.append(file_observer)
# Sacred configuration: declares the `epochs` entry with the default "1000".
# The value is a string here; my_main converts it with int() before training.
@ex.config
def my_config():
    epochs = "1000"
def drop_relevant_columns(imbd_data):
    """Remove the poster link, overview and certificate columns in place.

    Returns the same DataFrame object that was passed in.
    """
    for column in ("Poster_Link", "Overview", "Certificate"):
        imbd_data.drop(columns=[column], inplace=True)
    return imbd_data
def lowercase_columns_names(imbd_data):
    """Lower-case the values of the title, genre, director and star columns.

    The DataFrame is modified in place and returned.
    """
    text_columns = (
        "Series_Title",
        "Genre",
        "Director",
        "Star1",
        "Star2",
        "Star3",
        "Star4",
    )
    for column in text_columns:
        imbd_data[column] = imbd_data[column].str.lower()
    return imbd_data
def data_to_numeric(imbd_data):
    """Convert the numeric-looking columns to numbers and drop incomplete rows.

    Missing values are first replaced by empty strings, thousands separators
    are removed from "Gross" and the " min" suffix from "Runtime". Values that
    cannot be parsed become NaN, rows containing NaN are dropped and the index
    is renumbered. A new DataFrame is returned.
    """
    frame = imbd_data.replace(np.nan, '', regex=True)

    # Strip the formatting that would stop the values from parsing as numbers.
    frame["Gross"] = frame["Gross"].str.replace(',', '')
    frame["Runtime"] = frame["Runtime"].str.replace(' min', '')

    numeric_columns = ("Gross", "Runtime", "IMDB_Rating", "Meta_score", "Released_Year")
    for column in numeric_columns:
        frame[column] = pd.to_numeric(frame[column], errors='coerce')

    frame = frame.dropna().reset_index()
    frame.drop(columns=["index"], inplace=True)
    return frame
def create_train_dev_test(imbd_data):
    """Split the data into train/test/dev parts and write each to a CSV file.

    230 rows are held out of the training part; 115 of those become the dev
    part and the rest the test part. Both splits use random_state=1.
    """
    train_part, held_out = train_test_split(
        imbd_data, test_size=230, random_state=1, shuffle=True
    )
    test_part, dev_part = train_test_split(
        held_out, test_size=115, random_state=1, shuffle=True
    )
    outputs = (
        ("data_test.csv", test_part),
        ("data_dev.csv", dev_part),
        ("data_train.csv", train_part),
    )
    for filename, part in outputs:
        part.to_csv(filename, encoding="utf-8", index=False)
def normalize_gross(imbd_data):
    """Divide the "Gross" column by 10,000,000 in place and return the frame."""
    scale = 10000000
    imbd_data[["Gross"]] = imbd_data[["Gross"]].div(scale)
    return imbd_data
def prepare_dataset():
    """Load the IMDB CSV and run it through the full cleaning pipeline.

    Steps, in order: drop unused columns, lower-case the text columns, convert
    numeric columns (dropping incomplete rows) and rescale "Gross".
    """
    raw = pd.read_csv('biblioteka_DL/imdb_top_1000.csv')
    cleaned = lowercase_columns_names(drop_relevant_columns(raw))
    return normalize_gross(data_to_numeric(cleaned))
class LinearRegressionModel(torch.nn.Module):
    """Linear regression with a single input feature and a single output."""

    def __init__(self):
        super().__init__()
        # One in and one out
        self.linear = torch.nn.Linear(1, 1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the prediction."""
        return self.linear(x)
def _column_as_tensor(frame, column):
    """Return ``frame[column]`` as a float32 tensor with one value per row."""
    return torch.tensor(frame[column].to_numpy(), dtype=torch.float32).view(-1, 1)


@ex.automain
def my_main(epochs, _run):
    """Train a linear model predicting "Gross" from "Meta_score" and log metrics.

    ``epochs`` is the Sacred config entry with the number of training epochs;
    when it cannot be converted to an integer, 1000 is used. ``_run`` is the
    Sacred run object used for logging.

    Side effects: writes the test-set predictions to result.csv, saves the
    model to model.pkl, logs RMSE and MSE as scalars and attaches the model
    file to the run as an artifact.
    """
    try:
        num_epochs = int(epochs)
    except (TypeError, ValueError) as e:
        print(e)
        print("Setting default epochs value to 1000.")
        num_epochs = 1000

    df = prepare_dataset()
    data_train, data_test = train_test_split(df, random_state=1, shuffle=True)

    X_train = _column_as_tensor(data_train, "Meta_score")
    y_train = _column_as_tensor(data_train, "Gross")

    model = nn.Linear(1, 1)
    learning_rate = 0.0001
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        # Gradients accumulate across backward() calls, so reset them first.
        optimizer.zero_grad()
        loss = criterion(model(X_train), y_train)
        loss.backward()
        optimizer.step()
        if epoch % 100 == 0:
            print('epoch {}, loss {}'.format(epoch, loss.item()))

    X_test = _column_as_tensor(data_test, "Meta_score")
    with torch.no_grad():
        predicted_set = model(X_test).numpy()
    gross_test_g = data_test["Gross"].to_numpy(dtype=np.float64).reshape(-1, 1)

    pred = pd.DataFrame(predicted_set)
    pred.to_csv('result.csv')

    # save model
    torch.save(model, "model.pkl")

    mse = mean_squared_error(gross_test_g, pred)
    # The `squared=False` switch is gone from current scikit-learn; with a
    # single output column the RMSE is simply the square root of the MSE.
    rmse = float(np.sqrt(mse))

    _run.log_scalar("RMSE", rmse)
    _run.log_scalar("MSE", mse)
    _run.info['epochs'] = epochs

    ex.add_artifact("model.pkl")

View File

@ -0,0 +1,164 @@
import sys
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, explained_variance_score, \
mean_squared_error, mean_absolute_error
def drop_relevant_columns(imbd_data):
    """Remove the poster link, overview and certificate columns in place.

    Returns the same DataFrame object that was passed in.
    """
    for column in ("Poster_Link", "Overview", "Certificate"):
        imbd_data.drop(columns=[column], inplace=True)
    return imbd_data
def lowercase_columns_names(imbd_data):
    """Lower-case the values of the title, genre, director and star columns.

    The DataFrame is modified in place and returned.
    """
    text_columns = (
        "Series_Title",
        "Genre",
        "Director",
        "Star1",
        "Star2",
        "Star3",
        "Star4",
    )
    for column in text_columns:
        imbd_data[column] = imbd_data[column].str.lower()
    return imbd_data
def data_to_numeric(imbd_data):
    """Convert the numeric-looking columns to numbers and drop incomplete rows.

    Missing values are first replaced by empty strings, thousands separators
    are removed from "Gross" and the " min" suffix from "Runtime". Values that
    cannot be parsed become NaN, rows containing NaN are dropped and the index
    is renumbered. A new DataFrame is returned.
    """
    frame = imbd_data.replace(np.nan, '', regex=True)

    # Strip the formatting that would stop the values from parsing as numbers.
    frame["Gross"] = frame["Gross"].str.replace(',', '')
    frame["Runtime"] = frame["Runtime"].str.replace(' min', '')

    numeric_columns = ("Gross", "Runtime", "IMDB_Rating", "Meta_score", "Released_Year")
    for column in numeric_columns:
        frame[column] = pd.to_numeric(frame[column], errors='coerce')

    frame = frame.dropna().reset_index()
    frame.drop(columns=["index"], inplace=True)
    return frame
def create_train_dev_test(imbd_data):
    """Split the data into train/test/dev parts and write each to a CSV file.

    230 rows are held out of the training part; 115 of those become the dev
    part and the rest the test part. Both splits use random_state=1.
    """
    train_part, held_out = train_test_split(
        imbd_data, test_size=230, random_state=1, shuffle=True
    )
    test_part, dev_part = train_test_split(
        held_out, test_size=115, random_state=1, shuffle=True
    )
    outputs = (
        ("data_test.csv", test_part),
        ("data_dev.csv", dev_part),
        ("data_train.csv", train_part),
    )
    for filename, part in outputs:
        part.to_csv(filename, encoding="utf-8", index=False)
def normalize_gross(imbd_data):
    """Divide the "Gross" column by 10,000,000 in place and return the frame."""
    scale = 10000000
    imbd_data[["Gross"]] = imbd_data[["Gross"]].div(scale)
    return imbd_data
def prepare_dataset():
    """Load the IMDB CSV and run it through the full cleaning pipeline.

    Steps, in order: drop unused columns, lower-case the text columns, convert
    numeric columns (dropping incomplete rows) and rescale "Gross".
    """
    raw = pd.read_csv('biblioteka_DL/imdb_top_1000.csv')
    cleaned = lowercase_columns_names(drop_relevant_columns(raw))
    return normalize_gross(data_to_numeric(cleaned))
class LinearRegressionModel(torch.nn.Module):
    """Linear regression with a single input feature and a single output."""

    def __init__(self):
        super().__init__()
        # One in and one out
        self.linear = torch.nn.Linear(1, 1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the prediction."""
        return self.linear(x)
# Evaluation script: loads the trained model, scores it on the held-out split,
# appends the metrics to per-metric history files and plots the history.


def _append_metric(path, value):
    """Append ``value`` as a new line to the metric history file at ``path``."""
    with open(path, 'a+') as f:
        f.write(str(value) + '\n')


def _read_metric_history(path):
    """Return the values stored in ``path``, one float per non-blank line."""
    with open(path) as f:
        # A line read from a file still carries its newline, so it must be
        # stripped before testing for emptiness.
        return [float(line) for line in f if line.strip()]


df = prepare_dataset()
# Same split parameters as the training script, so the test part matches.
data_train, data_test = train_test_split(df, random_state=1, shuffle=True)

# model.pkl holds a whole pickled module, not just a state dict. Recent
# PyTorch defaults to weights_only=True, which rejects such files; older
# releases do not accept the keyword at all.
# NOTE(review): unpickling executes code - only load files from the trusted
# training job.
try:
    model = torch.load("model.pkl", weights_only=False)
except TypeError:
    model = torch.load("model.pkl")

X_test = torch.tensor(data_test["Meta_score"].to_numpy(), dtype=torch.float32).view(-1, 1)
with torch.no_grad():
    predicted = model(X_test).numpy()
gross_test_g = data_test["Gross"].to_numpy(dtype=np.float64).reshape(-1, 1)
pred = pd.DataFrame(predicted)

mse = mean_squared_error(gross_test_g, pred)
# The `squared=False` switch is gone from current scikit-learn; with a single
# output column the RMSE is simply the square root of the MSE.
rmse = float(np.sqrt(mse))
evr = explained_variance_score(gross_test_g, pred)
mae = mean_absolute_error(gross_test_g, pred)

_append_metric('mae.txt', mae)
_append_metric('rmse.txt', rmse)
_append_metric('mse.txt', mse)
_append_metric('evr.txt', evr)

mae_val = _read_metric_history('mae.txt')
rmse_val = _read_metric_history('rmse.txt')
mse_val = _read_metric_history('mse.txt')
evr_val = _read_metric_history('evr.txt')

ax = plt.gca()
ax.set_title('Build')
series = (
    (mae_val, 'blue', "MAE"),
    (rmse_val, 'green', "RMSE"),
    (mse_val, 'red', "MSE"),
    (evr_val, 'orange', "EVR"),
)
for values, color, label in series:
    # Plot against 1-based build numbers rather than 0-based list positions.
    ax.plot(range(1, len(values) + 1), values, color=color, label=label)
ax.legend(bbox_to_anchor=(0., 1.01, 1.0, .1), loc=3,
          ncol=2, mode="expand", borderaxespad=0.)
# Save before show(): interactive backends discard the figure once the window
# is closed, which would leave metrics.png blank.
plt.savefig('metrics.png')
plt.show()