From 3db952a567c736c08f5573fe5ab3cb6c264657a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cezary=20Ga=C5=82=C4=85zkiewicz?= <cgalazkiewicz@gmail.com>
Date: Mon, 6 Jun 2022 00:28:02 +0200
Subject: [PATCH] Zad 10. DVC

---
 .gitignore                        |  2 ++
 evaluate.py                       | 41 +++++++++++++++++++++++++++
 process_dataset.py                | 46 +------------------------------
 steel_industry_data_test.csv.dvc  |  4 +++
 steel_industry_data_train.csv.dvc |  4 +++
 5 files changed, 52 insertions(+), 45 deletions(-)
 create mode 100644 evaluate.py
 create mode 100644 steel_industry_data_test.csv.dvc
 create mode 100644 steel_industry_data_train.csv.dvc

diff --git a/.gitignore b/.gitignore
index 8781153..26d8820 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 /Steel_industry_data.csv
+/steel_industry_data_train.csv
+/steel_industry_data_test.csv
diff --git a/evaluate.py b/evaluate.py
new file mode 100644
index 0000000..857eb78
--- /dev/null
+++ b/evaluate.py
@@ -0,0 +1,41 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn import metrics
+import matplotlib.pyplot as plt
+import tensorflow as tf
+import math
+
+from tensorflow import keras
+from process_dataset import process_data_and_get_x_y
+
+
+def show_result(x, y):
+    """Plot y against x as red markers and overlay a degree-1 polyfit line."""
+    plt.title('Usage kWh Model', fontsize=15, color='g', pad=12)
+    plt.plot(x, y, 'o', color='r')
+    slope, intercept = np.polyfit(x, y, 1)
+    plt.plot(x, slope * x + intercept, color='darkblue')
+    plt.xlabel('Actual')
+    plt.ylabel('Predicted')
+    plt.show()
+
+
+# NOTE(review): importing process_dataset appears to run its top-level training code; consider a __main__ guard there.
+model = keras.models.load_model('steel_industry_model')
+
+# DVC tracks the lowercase file name; match it so the read works on case-sensitive filesystems.
+energy_data_test = pd.read_csv('steel_industry_data_test.csv')
+energy_data_test, x_test, y_test = process_data_and_get_x_y(energy_data_test)
+
+y_predicted = model.predict(x_test)
+test_results = {'usage_model': model.evaluate(x_test, y_test, verbose=0)}
+
+# Compute the mean squared error once; the RMSE line reuses it.
+mse = metrics.mean_squared_error(y_test, y_predicted)
+print('Mean Absolute Error : ', metrics.mean_absolute_error(y_test, y_predicted))
+print('Mean Squared Error : ', mse)
+print('Root Mean Squared Error : ', math.sqrt(mse))
+print(test_results['usage_model'])
+
+show_result(y_test, y_predicted)
diff --git a/process_dataset.py b/process_dataset.py
index 3869c43..3d8d413 100644
--- a/process_dataset.py
+++ b/process_dataset.py
@@ -34,28 +34,14 @@ def plot_loss(history):
     plt.show()
 
 
-def show_result(x, y):
-    plt.title('One variable Model', fontsize=15, color='g', pad=12)
-    plt.plot(x, y, 'o', color='r')
-
-    m, b = np.polyfit(x, y, 1)
-    plt.plot(x, m * x + b, color='darkblue')
-    plt.xlabel('Actual')
-    plt.ylabel('Predicted')
-    plt.show()
-
-
 energy_data_train = pd.read_csv('Steel_industry_data_train.csv')
-energy_data_test = pd.read_csv('Steel_industry_data_test.csv')
 
-energy_data_test, x_test, y_test = process_data_and_get_x_y(energy_data_test)
 energy_data_train, x_train, y_train = process_data_and_get_x_y(energy_data_train)
 
 #x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=1)
 # x_test, x_dev, y_test, y_dev = train_test_split(x_test, y_test, test_size=0.5, random_state=1)
 
 # stats
-print(energy_data_test.describe(include='all'))
 print(x_train.describe(include='all'))
 #print(np.array(x_train).reshape(-1, 1))
 
@@ -66,10 +52,6 @@ print(normalizer.mean.numpy())
 # powinno być niezmienione
 print(np.array(x_train[:1]))
 
-#usage = np.array(x_train)
-#usage_normalizer = keras.layers.Normalization(input_shape=[14, ], axis=1)
-#usage_normalizer.adapt(usage)
-
 usage_model = tf.keras.Sequential([
     normalizer,
     keras.layers.Dense(units=10, activation='relu'),
@@ -97,30 +79,4 @@ print(hist.tail())
 
 plot_loss(history)
 
-y_predicted = usage_model.predict(x_test)
-test_results = {}
-test_results['usage_model'] = usage_model.evaluate(
-    x_test,
-    y_test, verbose=0)
-
-print('Mean Absolute Error : ', metrics.mean_absolute_error(y_test, y_predicted))
-print('Mean Squared Error : ', metrics.mean_squared_error(y_test, y_predicted))
-print('Root Mean Squared Error : ', math.sqrt(metrics.mean_squared_error(y_test, y_predicted)))
-
-print(test_results['usage_model'])
-
-show_result(y_test, y_predicted)
-
-#print('Training set size:')
-#print(x_train.shape)
-#print(y_train.shape)
-#print('Testing set size:')
-#print(x_test.shape)
-#print(y_test.shape)
-# print('Dev set size:')
-# print(x_dev.shape)
-# print(y_dev.shape)
-
-# print(train_data.describe(include='all'))
-# print(test_data.describe(include='all'))
-# print(dev_data.describe(include='all'))
+usage_model.save('steel_industry_model')
diff --git a/steel_industry_data_test.csv.dvc b/steel_industry_data_test.csv.dvc
new file mode 100644
index 0000000..9fff34d
--- /dev/null
+++ b/steel_industry_data_test.csv.dvc
@@ -0,0 +1,4 @@
+outs:
+- md5: ba702b5ad2647abad7f297449a6ca273
+  size: 252454
+  path: steel_industry_data_test.csv
diff --git a/steel_industry_data_train.csv.dvc b/steel_industry_data_train.csv.dvc
new file mode 100644
index 0000000..80c842f
--- /dev/null
+++ b/steel_industry_data_train.csv.dvc
@@ -0,0 +1,4 @@
+outs:
+- md5: b9a05e4bc7ecf47bc3fb5ca7d92fd9fa
+  size: 2021682
+  path: steel_industry_data_train.csv