dvc
All checks were successful
s444417-training/pipeline/head This commit looks good
s444417-evaluation/pipeline/head This commit looks good

s444417 2022-05-31 10:14:04 +02:00
parent 25fbe052c3
commit 84bb8ee94e
11 changed files with 211 additions and 4 deletions

.dvc/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
/config.local
/tmp
/cache

.dvc/config Normal file

@@ -0,0 +1,6 @@
[core]
remote = ium_ssh_remote
['remote "ium_ssh_remote"']
url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl
['remote "my_local_remote"']
url = /dvcstore
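With the remote configured, DVC-tracked data can also be read programmatically. A minimal sketch using DVC's Python API (assumes dvc[ssh] is installed and the SSH remote is reachable; the path is the dataset tracked in this commit):

import dvc.api

# stream a tracked file from the configured remote
with dvc.api.open("Participants_Data_HPP/Train.csv", remote="ium_ssh_remote") as f:
    print(f.readline())  # header row of the dataset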

.dvcignore Normal file

@@ -0,0 +1,4 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore
kaggle.json

.gitignore vendored

@@ -225,4 +225,5 @@ Participants_Data_HPP/
 my_runs
 saved_model
 mlruns
+/Participants_Data_HPP

Dockerfile

@@ -15,11 +15,12 @@ RUN python3 -m pip install kaggle
 RUN python3 -m pip install pandas
 RUN pip3 install matplotlib
 RUN pip3 install sacred
-RUN pip3 install sacred
 RUN pip3 install pymongo
 RUN pip3 install mlflow
-RUN ln -s ~/.local/bin/kaggle /usr/bin/kaggle
+# RUN ln -s ~/.local/bin/kaggle /usr/bin/kaggle
+RUN python3 -m pip install dvc 'dvc[ssh]' paramiko
+RUN useradd -r -u 111 jenkins
 WORKDIR /app
 COPY . .

Participants_Data_HPP.dvc Normal file

@@ -0,0 +1,5 @@
outs:
- md5: 40550846a6d8eec3f49155996444fc15.dir
size: 5182261
nfiles: 4
path: Participants_Data_HPP
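The md5 value is how DVC addresses this data in its cache: individual files are hashed by content, and the .dir suffix marks a hash computed over a manifest of the directory's files. A minimal sketch of the per-file case:

import hashlib

# MD5 of a file's content, read in chunks; this is the hash DVC stores
# for individual tracked files (directories get a manifest-level hash).
def file_md5(path, chunk_size=1 << 20):
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()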

README.md

@@ -31,6 +31,6 @@ Task 1
Task 2
1. file lab8/trainScript.py, uses MLflow, includes input_example,
2. the model is fetched from artifacts in lab8/Jenkinsfile.artifact, the prediction is made by the script lab8/predictArtifact.py, and the prediction output appears in the console of the [project](https://tzietkiewicz.vm.wmi.amu.edu.pl:8080/job/s444417-predict-s449288/)
3. registered model, e.g. http://tzietkiewicz.vm.wmi.amu.edu.pl/#/experiments/17/runs/811420769d2642b8be694693c75b3587/artifactPath/linear-model; the model is registered in the file lab8/trainScript.py
4. [project](https://tzietkiewicz.vm.wmi.amu.edu.pl:8080/job/s444417-predict-s449288-from-registry/): the prediction is performed by the script lab8/predictMlflow.py and printed in the build console,

dvc.yaml Normal file

@@ -0,0 +1,4 @@
stages:
  prepare:
    cmd: python lab10/task1python.py
    deps:
      - lab10/task1python.py
      - Participants_Data_HPP

lab10/Jenkinsfile vendored Normal file

@@ -0,0 +1,18 @@
pipeline {
    agent {
        dockerfile true
    }
    stages {
        stage("Check out from version control") {
            steps {
                checkout scm
            }
        }
        stage("DVC") {
            steps {
                withCredentials([sshUserPrivateKey(credentialsId: '48ac7004-216e-4260-abba-1fe5db753e18', keyFileVariable: 'IUM_SFTP_KEY')]) {
                    sh 'dvc remote modify --local ium_ssh_remote keyfile $IUM_SFTP_KEY'
                    sh 'dvc pull'
                }
            }
        }
    }
}

lab10/task1python.py Normal file

@@ -0,0 +1,68 @@
import os
import sys
import pandas as pd
cwd = os.path.abspath(os.path.dirname(sys.argv[0]))
# paths: the DVC-tracked dataset sits at the repository root, one level above lab10/
dataDir = os.path.join(cwd, "..", "Participants_Data_HPP")
filePathTest = os.path.join(dataDir, "Test.csv")
filePathTrain = os.path.join(dataDir, "Train.csv")
dataTest = pd.read_csv(filePathTest)
dataTrain = pd.read_csv(filePathTrain)
number_lines = len(dataTest.index)
row_size = number_lines // 2
# start looping through data writing it to a new file for each set
# no of csv files with row size
k = 2
size = row_size
# split test data to test and dev
for i in range(k):
    df = dataTest[size * i:size * (i + 1)]
    # first half becomes the dev set, second half the test set
    name = "Dev" if i == 0 else "Test"
    df.to_csv(os.path.join(dataDir, name + '.csv'), index=False)
#df_1 = pd.read_csv("../Participants_Data_HPP/Dev.csv")
#df_2 = pd.read_csv("../Participants_Data_HPP/Test.csv")
#df_3 = pd.read_csv("../Participants_Data_HPP/Train.csv")
dataPath = os.path.join(dataDir, 'Train.csv')
# data information
data = pd.read_csv(dataPath)
description = data.describe(include="all")
corr = data.corr(numeric_only=True)  # non-numeric columns (e.g. ADDRESS) are skipped
# select the most significant columns
data = data[['TARGET(PRICE_IN_LACS)', 'SQUARE_FT', 'BHK_NO.', 'RESALE']]
# normalize the price and flat-area columns using the min-max technique
columnName1 = 'TARGET(PRICE_IN_LACS)'
columnName2 = 'SQUARE_FT'
column1Min = data[columnName1].min()
column1Max = data[columnName1].max()
column2Min = data[columnName2].min()
column2Max = data[columnName2].max()
data[columnName1] = (data[columnName1] - column1Min) / (column1Max - column1Min)
data[columnName2] = (data[columnName2] - column2Min) / (column2Max - column2Min)
print(description)
print(corr)
print(data.describe(include="all"))
print(data.head())
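Both normalizations above apply the same min-max formula, x' = (x - min) / (max - min). Factored into a helper, the transformation looks like this (a sketch, not part of the script):

import pandas as pd

# rescale a column to [0, 1] via (x - min) / (max - min)
def min_max_scale(series: pd.Series) -> pd.Series:
    lo, hi = series.min(), series.max()
    return (series - lo) / (hi - lo)

# equivalent to the two assignments above:
# data['TARGET(PRICE_IN_LACS)'] = min_max_scale(data['TARGET(PRICE_IN_LACS)'])
# data['SQUARE_FT'] = min_max_scale(data['SQUARE_FT'])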

lab10/trainScript.py Normal file

@@ -0,0 +1,97 @@
import os
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
# training parameters
numberOfEpoch = sys.argv[1]
cwd = os.path.abspath(os.path.dirname(sys.argv[0]))
# paths: the DVC-tracked dataset sits at the repository root, one level above lab10/
dataDir = os.path.join(cwd, "..", "Participants_Data_HPP")
pathTrain = os.path.join(dataDir, "Train.csv")
pathTest = os.path.join(dataDir, "Test.csv")
features = ["UNDER_CONSTRUCTION", "RERA", "BHK_NO.", "SQUARE_FT", "READY_TO_MOVE", "RESALE", "LONGITUDE", "LATITUDE", "TARGET(PRICE_IN_LACS)"]
# get dataset
house_price_train = pd.read_csv(pathTrain)[features]
# get test dataset
house_price_test = pd.read_csv(pathTest)[features]
house_price_features = house_price_train.copy()
# pop column
house_price_labels = house_price_features.pop('TARGET(PRICE_IN_LACS)')
# process data
normalize = layers.Normalization()
normalize.adapt(house_price_features)
feature_test_sample = house_price_test.sample(10)
labels_test_sample = feature_test_sample.pop('TARGET(PRICE_IN_LACS)')
house_price_test_features = house_price_test.copy()
# pop column
house_price_test_expected = house_price_test_features.pop('TARGET(PRICE_IN_LACS)')
# to np.array
# house_price_test = np.array(house_price_test)
# house_price_test_expected = np.array(house_price_test_expected)
house_price_features = np.array(house_price_features)
# checkpoints
# checkpoint_path = "training_1/cp.ckpt"
# checkpoint_dir = os.path.dirname(checkpoint_path)
# Create a callback that saves the model's weights
# cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, verbose=1)
# model keras.Sequential
# one output tensor
modelPath = 'saved_model/MyModel_tf'
try:
    # reuse a previously saved model if one exists
    linear_model = tf.keras.models.load_model(modelPath)
    print("open existing model")
except Exception as ex:
    print(ex)
    linear_model = tf.keras.Sequential([
        normalize,
        layers.Dense(1)
    ])
    linear_model.compile(loss=tf.losses.MeanSquaredError(),
                         optimizer=tf.optimizers.Adam(1))
    print("creating new model")
# train model
history = linear_model.fit(
    house_price_features,
    house_price_labels,
    epochs=int(numberOfEpoch),
    validation_split=0.33,
    verbose=1)
    # callbacks=[cp_callback])
# save model
linear_model.save(modelPath, save_format='tf')
test_results = {}
test_results['linear_model'] = linear_model.evaluate(
    house_price_test_features, house_price_test_expected, verbose=0)

def flatten(t):
    return [item for sublist in t for item in sublist]
pred = np.array(linear_model.predict(feature_test_sample))
flatten_pred = flatten(pred)
# print("predictions: " + str(flatten_pred))
# print("expected: " + str(np.array(labels_test_sample)))
with open(cwd + "/../result.txt", "w+") as resultFile:
    resultFile.write("predictions: " + str(flatten_pred) + '\n')
    resultFile.write("expected: " + str(labels_test_sample.to_numpy()))