dvc
All checks were successful
s444417-training/pipeline/head This commit looks good
s444417-evaluation/pipeline/head This commit looks good

This commit is contained in:
s444417 2022-05-31 10:14:04 +02:00
parent 25fbe052c3
commit 84bb8ee94e
11 changed files with 211 additions and 4 deletions

.dvc/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
/config.local
/tmp
/cache

.dvc/config Normal file

@@ -0,0 +1,6 @@
[core]
    remote = ium_ssh_remote
['remote "ium_ssh_remote"']
    url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl
['remote "my_local_remote"']
    url = /dvcstore

.dvcignore Normal file

@@ -0,0 +1,4 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore
kaggle.json

.gitignore vendored

@@ -225,4 +225,5 @@ Participants_Data_HPP/
my_runs
saved_model
mlruns
mlruns
/Participants_Data_HPP

Dockerfile

@@ -15,11 +15,12 @@ RUN python3 -m pip install kaggle
RUN python3 -m pip install pandas
RUN pip3 install matplotlib
RUN pip3 install sacred
RUN pip3 install sacred
RUN pip3 install pymongo
RUN pip3 install mlflow
RUN ln -s ~/.local/bin/kaggle /usr/bin/kaggle
# RUN ln -s ~/.local/bin/kaggle /usr/bin/kaggle
RUN python3 -m pip install dvc 'dvc[ssh]' paramiko
RUN useradd -r -u 111 jenkins
WORKDIR /app
COPY . .

Participants_Data_HPP.dvc Normal file

@@ -0,0 +1,5 @@
outs:
- md5: 40550846a6d8eec3f49155996444fc15.dir
  size: 5182261
  nfiles: 4
  path: Participants_Data_HPP
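
The pointer file above records only metadata (the directory-level md5, total size, and file count); the data itself lives in the DVC cache or remote. A minimal sketch of reading that metadata, assuming PyYAML is installed and the pointer is saved as Participants_Data_HPP.dvc at the repo root:

import yaml

# assumption: the pointer file shown above is stored as Participants_Data_HPP.dvc
with open("Participants_Data_HPP.dvc") as f:
    meta = yaml.safe_load(f)

out = meta["outs"][0]
print(out["md5"], out["size"], out["nfiles"], out["path"])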

README.md

@@ -31,6 +31,6 @@ Task 1
Task 2
1. the file lab8/trainScript.py uses MLflow and contains an input_example,
2. the model is fetched from the artifacts in lab8/Jenkinsfile.artifact, the prediction is made with the script lab8/predictArtifact.py, and the prediction output is shown in the console of the [project](https://tzietkiewicz.vm.wmi.amu.edu.pl:8080/job/s444417-predict-s449288/)
2. the model is fetched from the artifacts in lab8/Jenkinsfile.artifact, the prediction is made with the script lab8/predictArtifact.py, and the prediction output is shown in the console of the [project](https://tzietkiewicz.vs444m.wmi.amu.edu.pl:8080/job/s444417-predict-s449288/)
3. registered model, e.g. http://tzietkiewicz.vm.wmi.amu.edu.pl/#/experiments/17/runs/811420769d2642b8be694693c75b3587/artifactPath/linear-model; the model is registered in the file lab8/trainScript.py
4. [project](https://tzietkiewicz.vm.wmi.amu.edu.pl:8080/job/s444417-predict-s449288-from-registry/) prediction is performed with the script lab8/predictMlflow.py and printed in the build console (see the sketch below),
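
A rough illustration of points 3-4 (not the actual contents of lab8/predictMlflow.py): loading a model from the MLflow registry and predicting could look like the sketch below, where the tracking URI, registered model name, version, and input file are assumptions.

import mlflow
import pandas as pd

# assumptions: the tracking URI, registered model name, and version are illustrative only
mlflow.set_tracking_uri("http://tzietkiewicz.vm.wmi.amu.edu.pl")
model = mlflow.pyfunc.load_model("models:/linear-model/1")

# the input columns must match the signature / input_example the model was logged with
sample = pd.read_csv("input.csv")
print(model.predict(sample))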

dvc.yaml Normal file

@@ -0,0 +1,4 @@
stages:
  prepare:
    cmd: \ -p startscript1.sh \ -d .\lab10\task1python.py -d Participants_Data_HPP
      \ python .\lab10\trainScript.py Participants_Data_HPP

lab10/Jenkinsfile vendored Normal file

@@ -0,0 +1,18 @@
pipeline {
    agent {
        dockerfile true
    }
    stages {
        stage("Check out from version control") {
            steps {
                checkout scm
            }
        }
        stage("DVC") {
            steps {
                withCredentials([sshUserPrivateKey(credentialsId: '48ac7004-216e-4260-abba-1fe5db753e18', keyFileVariable: 'IUM_SFTP_KEY')]) {
                    sh 'dvc remote modify --local ium_ssh_remote keyfile $IUM_SFTP_KEY'
                    sh "dvc pull"
                }
            }
        }
    }
}

lab10/task1python.py Normal file

@@ -0,0 +1,68 @@
import os
import sys
import pandas as pd
cwd = os.path.abspath(os.path.dirname(sys.argv[0]))
# paths
filePathTest = cwd + "./Participants_Data_HPP/Train.csv"
filePathTrain = cwd + "./Participants_Data_HPP/Test.csv"
dataTest = pd.read_csv(filePathTest)
dataTrain = pd.read_csv(filePathTrain)
number_lines = len(dataTest.index)
row_size = number_lines // 2
# start looping through data writing it to a new file for each set
# no of csv files with row size
k = 2
size = row_size
# split test data to test and dev
for i in range(k):
    df = dataTest[size * i:size * (i + 1)]
    name = ""
    if i == 0:
        name = "Dev"
    else:
        name = "Test"
    df.to_csv(cwd + './Participants_Data_HPP/' + name + '.csv', index=False)
#df_1 = pd.read_csv("../Participants_Data_HPP/Dev.csv")
#df_2 = pd.read_csv("../Participants_Data_HPP/Test.csv")
#df_2 = pd.read_csv("../Participants_Data_HPP/Train.csv")
dataPath = cwd + './Participants_Data_HPP/Train.csv'
# data information
data = pd.read_csv(dataPath)
description = data.describe(include="all")
corr = data.corr()
# select the most significant features
data = data[['TARGET(PRICE_IN_LACS)', 'SQUARE_FT', 'BHK_NO.', 'RESALE']]
# normalize the price and flat-area columns using the min-max technique
columnName1 = 'TARGET(PRICE_IN_LACS)'
columnName2 = 'SQUARE_FT'
column1Min = data[columnName1].min()
column1Max = data[columnName1].max()
column2Min = data[columnName2].min()
column2Max = data[columnName2].max()
data[columnName1] = (data[columnName1] - column1Min) / (column1Max - column1Min)
data[columnName2] = (data[columnName2] - column2Min) / (column2Max - column2Min)
print(description)
print(corr)
print(data.describe(include="all"))
print(data.head())
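
For comparison, a sketch of the same 50/50 dev/test split written with scikit-learn's train_test_split (an alternative to the manual slicing above; assumes scikit-learn is available and roughly mirrors the script's input path):

import os
import sys

import pandas as pd
from sklearn.model_selection import train_test_split

cwd = os.path.abspath(os.path.dirname(sys.argv[0]))
data = pd.read_csv(os.path.join(cwd, "Participants_Data_HPP", "Train.csv"))

# split the data 50/50 into a dev set and a test set, without shuffling, as the loop above does
dev, test = train_test_split(data, test_size=0.5, shuffle=False)
dev.to_csv(os.path.join(cwd, "Participants_Data_HPP", "Dev.csv"), index=False)
test.to_csv(os.path.join(cwd, "Participants_Data_HPP", "Test.csv"), index=False)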

lab10/trainScript.py Normal file

@@ -0,0 +1,97 @@
import os
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
#train params
numberOfEpoch = sys.argv[1]
cwd = os.path.abspath(os.path.dirname(sys.argv[0]))
pathTrain = cwd + "./Participants_Data_HPP/Train.csv"
pathTest = cwd + "./Participants_Data_HPP/Test.csv"
features = ["UNDER_CONSTRUCTION", "RERA", "BHK_NO.", "SQUARE_FT", "READY_TO_MOVE", "RESALE", "LONGITUDE", "LATITUDE", "TARGET(PRICE_IN_LACS)"]
# get dataset
house_price_train = pd.read_csv(pathTrain)[features]
# get test dataset
house_price_test = pd.read_csv(pathTest)[features]
house_price_features = house_price_train.copy()
# pop column
house_price_labels = house_price_features.pop('TARGET(PRICE_IN_LACS)')
# process data
normalize = layers.Normalization()
normalize.adapt(house_price_features)
feature_test_sample = house_price_test.sample(10)
labels_test_sample = feature_test_sample.pop('TARGET(PRICE_IN_LACS)')
house_price_test_features = house_price_test.copy()
# pop column
house_price_test_expected = house_price_test_features.pop('TARGET(PRICE_IN_LACS)')
# to np.array
# house_price_test = np.array(house_price_test)
# house_price_test_expected = np.array(house_price_test_expected)
house_price_features = np.array(house_price_features)
# checkpoints
# checkpoint_path = "training_1/cp.ckpt"
# checkpoint_dir = os.path.dirname(checkpoint_path)
# Create a callback that saves the model's weights
# cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, verbose=1)
# model keras.Sequential
# one output tensor
modelPath = 'saved_model/MyModel_tf'
try:
    linear_model = tf.keras.models.load_model(modelPath)
    print("open existing model")
except Exception as ex:
    print(ex)
    linear_model = tf.keras.Sequential([
        normalize,
        layers.Dense(1)
    ])
    linear_model.compile(loss = tf.losses.MeanSquaredError(),
                         optimizer = tf.optimizers.Adam(1))
    print("creating new model")
# train model
history = linear_model.fit(
    house_price_features,
    house_price_labels,
    epochs=int(numberOfEpoch),
    validation_split=0.33,
    verbose=1)
    # callbacks=[cp_callback])
# save model
linear_model.save(modelPath, save_format='tf')
test_results = {}
test_results['linear_model'] = linear_model.evaluate(
    house_price_test_features, house_price_test_expected, verbose=0)
def flatten(t):
    return [item for sublist in t for item in sublist]
pred = np.array(linear_model.predict(feature_test_sample))
flatten_pred = flatten(pred)
# print("predictions: " + str(flatten_pred))
# print("expected: " + str(np.array(labels_test_sample)))
with open(cwd + "/../result.txt", "w+") as resultFile:
    resultFile.write("predictions: " + str(flatten_pred) + '\n')
    resultFile.write("expected: " + str(labels_test_sample.to_numpy()))