dvc
All checks were successful
s444417-training/pipeline/head This commit looks good
s444417-evaluation/pipeline/head This commit looks good

This commit is contained in:
s444417 2022-05-31 10:14:04 +02:00
parent 25fbe052c3
commit 84bb8ee94e
11 changed files with 211 additions and 4 deletions

.dvc/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
/config.local
/tmp
/cache

.dvc/config Normal file

@@ -0,0 +1,6 @@
[core]
    remote = ium_ssh_remote
['remote "ium_ssh_remote"']
    url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl
['remote "my_local_remote"']
    url = /dvcstore

.dvcignore Normal file

@@ -0,0 +1,4 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore
kaggle.json

.gitignore vendored

@@ -225,4 +225,5 @@ Participants_Data_HPP/
my_runs
saved_model
mlruns
mlruns
/Participants_Data_HPP

Dockerfile

@@ -15,11 +15,12 @@ RUN python3 -m pip install kaggle
RUN python3 -m pip install pandas
RUN pip3 install matplotlib
RUN pip3 install sacred
RUN pip3 install sacred
RUN pip3 install pymongo
RUN pip3 install mlflow
RUN ln -s ~/.local/bin/kaggle /usr/bin/kaggle
# RUN ln -s ~/.local/bin/kaggle /usr/bin/kaggle
RUN python3 -m pip install dvc 'dvc[ssh]' paramiko
RUN useradd -r -u 111 jenkins
WORKDIR /app
COPY . .

Participants_Data_HPP.dvc Normal file

@@ -0,0 +1,5 @@
outs:
- md5: 40550846a6d8eec3f49155996444fc15.dir
  size: 5182261
  nfiles: 4
  path: Participants_Data_HPP
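
The pointer file above records only metadata (the directory-level md5, total size, and file count); the data itself lives in the DVC cache or remote. A minimal sketch of reading that metadata, assuming PyYAML is installed and the pointer is saved as Participants_Data_HPP.dvc at the repo root:

import yaml

# assumption: the pointer file shown above is stored as Participants_Data_HPP.dvc
with open("Participants_Data_HPP.dvc") as f:
    meta = yaml.safe_load(f)

out = meta["outs"][0]
print(out["md5"], out["size"], out["nfiles"], out["path"])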

README.md

@@ -31,6 +31,6 @@ Task 1
Task 2
1. the file lab8/trainScript.py uses MLflow and contains an input_example,
2. the model is fetched from the artifacts in lab8/Jenkinsfile.artifact, the prediction is made with the script lab8/predictArtifact.py, and the prediction output is shown in the console of the [project](https://tzietkiewicz.vm.wmi.amu.edu.pl:8080/job/s444417-predict-s449288/)
2. the model is fetched from the artifacts in lab8/Jenkinsfile.artifact, the prediction is made with the script lab8/predictArtifact.py, and the prediction output is shown in the console of the [project](https://tzietkiewicz.vs444m.wmi.amu.edu.pl:8080/job/s444417-predict-s449288/)
3. registered model, e.g. http://tzietkiewicz.vm.wmi.amu.edu.pl/#/experiments/17/runs/811420769d2642b8be694693c75b3587/artifactPath/linear-model; the model is registered in the file lab8/trainScript.py
4. [project](https://tzietkiewicz.vm.wmi.amu.edu.pl:8080/job/s444417-predict-s449288-from-registry/) prediction is performed with the script lab8/predictMlflow.py and printed in the build console (see the sketch below),
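
A rough illustration of points 3-4 (not the actual contents of lab8/predictMlflow.py): loading a model from the MLflow registry and predicting could look like the sketch below, where the tracking URI, registered model name, version, and input file are assumptions.

import mlflow
import pandas as pd

# assumptions: the tracking URI, registered model name, and version are illustrative only
mlflow.set_tracking_uri("http://tzietkiewicz.vm.wmi.amu.edu.pl")
model = mlflow.pyfunc.load_model("models:/linear-model/1")

# the input columns must match the signature / input_example the model was logged with
sample = pd.read_csv("input.csv")
print(model.predict(sample))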

dvc.yaml Normal file

@@ -0,0 +1,4 @@
stages:
  prepare:
    cmd: \ -p startscript1.sh \ -d .\lab10\task1python.py -d Participants_Data_HPP
      \ python .\lab10\trainScript.py Participants_Data_HPP

lab10/Jenkinsfile vendored Normal file

@@ -0,0 +1,18 @@
pipeline {
    agent {
        dockerfile true
    }
    stages {
        stage("Check out from version control") {
            steps {
                checkout scm
            }
        }
        stage("DVC") {
            steps {
                withCredentials([sshUserPrivateKey(credentialsId: '48ac7004-216e-4260-abba-1fe5db753e18', keyFileVariable: 'IUM_SFTP_KEY')]) {
                    sh 'dvc remote modify --local ium_ssh_remote keyfile $IUM_SFTP_KEY'
                    sh "dvc pull"
                }
            }
        }
    }
}

lab10/task1python.py Normal file

@@ -0,0 +1,68 @@
import os
import sys
import pandas as pd
cwd = os.path.abspath(os.path.dirname(sys.argv[0]))
# paths
filePathTest = cwd + "./Participants_Data_HPP/Train.csv"
filePathTrain = cwd + "./Participants_Data_HPP/Test.csv"
dataTest = pd.read_csv(filePathTest)
dataTrain = pd.read_csv(filePathTrain)
number_lines = len(dataTest.index)
row_size = number_lines // 2
# start looping through data writing it to a new file for each set
# no of csv files with row size
k = 2
size = row_size
# split test data to test and dev
for i in range(k):
    df = dataTest[size * i:size * (i + 1)]
    name = ""
    if i == 0:
        name = "Dev"
    else:
        name = "Test"
    df.to_csv(cwd + './Participants_Data_HPP/' + name + '.csv', index=False)
#df_1 = pd.read_csv("../Participants_Data_HPP/Dev.csv")
#df_2 = pd.read_csv("../Participants_Data_HPP/Test.csv")
#df_2 = pd.read_csv("../Participants_Data_HPP/Train.csv")
dataPath = cwd + './Participants_Data_HPP/Train.csv'
# data information
data = pd.read_csv(dataPath)
description = data.describe(include="all")
corr = data.corr()
# select the most significant features
data = data[['TARGET(PRICE_IN_LACS)', 'SQUARE_FT', 'BHK_NO.', 'RESALE']]
# normalize the price and flat-area columns using the min-max technique
columnName1 = 'TARGET(PRICE_IN_LACS)'
columnName2 = 'SQUARE_FT'
column1Min = data[columnName1].min()
column1Max = data[columnName1].max()
column2Min = data[columnName2].min()
column2Max = data[columnName2].max()
data[columnName1] = (data[columnName1] - column1Min) / (column1Max - column1Min)
data[columnName2] = (data[columnName2] - column2Min) / (column2Max - column2Min)
print(description)
print(corr)
print(data.describe(include="all"))
print(data.head())
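
For comparison, a sketch of the same 50/50 dev/test split written with scikit-learn's train_test_split (an alternative to the manual slicing above; assumes scikit-learn is available and roughly mirrors the script's input path):

import os
import sys

import pandas as pd
from sklearn.model_selection import train_test_split

cwd = os.path.abspath(os.path.dirname(sys.argv[0]))
data = pd.read_csv(os.path.join(cwd, "Participants_Data_HPP", "Train.csv"))

# split the data 50/50 into a dev set and a test set, without shuffling, as the loop above does
dev, test = train_test_split(data, test_size=0.5, shuffle=False)
dev.to_csv(os.path.join(cwd, "Participants_Data_HPP", "Dev.csv"), index=False)
test.to_csv(os.path.join(cwd, "Participants_Data_HPP", "Test.csv"), index=False)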

lab10/trainScript.py Normal file

@@ -0,0 +1,97 @@
import os
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
#train params
numberOfEpoch = sys.argv[1]
cwd = os.path.abspath(os.path.dirname(sys.argv[0]))
pathTrain = cwd + "./Participants_Data_HPP/Train.csv"
pathTest = cwd + "./Participants_Data_HPP/Test.csv"
features = ["UNDER_CONSTRUCTION", "RERA", "BHK_NO.", "SQUARE_FT", "READY_TO_MOVE", "RESALE", "LONGITUDE", "LATITUDE", "TARGET(PRICE_IN_LACS)"]
# get dataset
house_price_train = pd.read_csv(pathTrain)[features]
# get test dataset
house_price_test = pd.read_csv(pathTest)[features]
house_price_features = house_price_train.copy()
# pop column
house_price_labels = house_price_features.pop('TARGET(PRICE_IN_LACS)')
# process data
normalize = layers.Normalization()
normalize.adapt(house_price_features)
feature_test_sample = house_price_test.sample(10)
labels_test_sample = feature_test_sample.pop('TARGET(PRICE_IN_LACS)')
house_price_test_features = house_price_test.copy()
# pop column
house_price_test_expected = house_price_test_features.pop('TARGET(PRICE_IN_LACS)')
# to np.array
# house_price_test = np.array(house_price_test)
# house_price_test_expected = np.array(house_price_test_expected)
house_price_features = np.array(house_price_features)
# checkpoints
# checkpoint_path = "training_1/cp.ckpt"
# checkpoint_dir = os.path.dirname(checkpoint_path)
# Create a callback that saves the model's weights
# cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, verbose=1)
# model keras.Sequential
# one output tensor
modelPath = 'saved_model/MyModel_tf'
try:
    linear_model = tf.keras.models.load_model(modelPath)
    print("open existing model")
except Exception as ex:
    print(ex)
    linear_model = tf.keras.Sequential([
        normalize,
        layers.Dense(1)
    ])
    linear_model.compile(loss = tf.losses.MeanSquaredError(),
                         optimizer = tf.optimizers.Adam(1))
    print("creating new model")
# train model
history = linear_model.fit(
    house_price_features,
    house_price_labels,
    epochs=int(numberOfEpoch),
    validation_split=0.33,
    verbose=1)
    # callbacks=[cp_callback])
# save model
linear_model.save(modelPath, save_format='tf')
test_results = {}
test_results['linear_model'] = linear_model.evaluate(
    house_price_test_features, house_price_test_expected, verbose=0)
def flatten(t):
    return [item for sublist in t for item in sublist]
pred = np.array(linear_model.predict(feature_test_sample))
flatten_pred = flatten(pred)
# print("predictions: " + str(flatten_pred))
# print("expected: " + str(np.array(labels_test_sample)))
with open(cwd + "/../result.txt", "w+") as resultFile:
    resultFile.write("predictions: " + str(flatten_pred) + '\n')
    resultFile.write("expected: " + str(labels_test_sample.to_numpy()))