"""Train and evaluate a small Keras regression model on steel-industry data.

The script reads pre-split train/test CSV files, one-hot encodes the
categorical columns, fits a normalisation + dense network that predicts
``Usage_kWh`` from the remaining columns, and prints/plots the results.
"""

import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
# Kept for compatibility: the data now arrives pre-split, so this script
# no longer calls train_test_split itself.
from sklearn.model_selection import train_test_split  # noqa: F401
from tensorflow import keras

# Column names assigned to every loaded CSV, in file order.
# Original author's note: the CSVs are expected to come without the NSM column.
COLUMN_NAMES = [
    "date",
    "Usage_kWh",
    "Lagging_Current_Reactive.Power_kVarh",
    "Leading_Current_Reactive_Power_kVarh",
    "CO2(tCO2)",
    "Lagging_Current_Power_Factor",
    "Leading_Current_Power_Factor",
    "WeekStatus",
    "Day_of_week",
    "Load_Type",
]


def process_data_and_get_x_y(data):
    """Rename columns, one-hot encode, and split a frame into features/target.

    Args:
        data: DataFrame whose columns correspond, in order, to
            ``COLUMN_NAMES``. The caller's object is left untouched.

    Returns:
        A ``(data, x, y)`` tuple: the processed frame indexed by ``date``,
        the feature frame (every column except ``Usage_kWh``) and the
        ``Usage_kWh`` target series.

    Raises:
        ValueError: If ``data`` does not have ``len(COLUMN_NAMES)`` columns.
    """
    if data.shape[1] != len(COLUMN_NAMES):
        raise ValueError(
            f"expected {len(COLUMN_NAMES)} columns, got {data.shape[1]}"
        )

    # Work on a copy so that renaming the columns does not leak back into
    # the DataFrame owned by the caller.
    data = data.copy()
    data.columns = COLUMN_NAMES
    data = data.set_index('date')

    # dtype=float keeps the indicator columns numeric. The pandas >= 2.0
    # default is bool, which combined with the float columns turns the
    # resulting NumPy array into dtype=object and breaks TensorFlow.
    data = pd.get_dummies(data, drop_first=True, dtype=float)

    x = data.drop('Usage_kWh', axis=1)
    y = data['Usage_kWh']
    return data, x, y


def plot_loss(history):
    """Plot training and validation loss per epoch.

    Args:
        history: Object whose ``history`` mapping contains ``'loss'`` and
            ``'val_loss'`` sequences (as returned by ``model.fit`` when a
            validation split is used).
    """
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.ylim([0, 10])
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)
    plt.show()


def show_result(x, y):
    """Scatter-plot predictions against actual values with a fitted line.

    Args:
        x: Actual target values (any array-like).
        y: Predicted values; may be a column vector of shape ``(n, 1)``.
    """
    # Flatten both inputs: with a 2-D ``y`` np.polyfit returns one
    # coefficient array per column instead of two scalars.
    actual = np.asarray(x, dtype=float).ravel()
    predicted = np.asarray(y, dtype=float).ravel()

    plt.title('One variable Model', fontsize=15, color='g', pad=12)
    plt.plot(actual, predicted, 'o', color='r')

    # Least-squares straight line through the (actual, predicted) points.
    m, b = np.polyfit(actual, predicted, 1)
    plt.plot(actual, m * actual + b, color='darkblue')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.show()


energy_data_train = pd.read_csv('Steel_industry_data_train.csv')
energy_data_test = pd.read_csv('Steel_industry_data_test.csv')

energy_data_test, x_test, y_test = process_data_and_get_x_y(energy_data_test)
energy_data_train, x_train, y_train = process_data_and_get_x_y(energy_data_train)

# The two sets are one-hot encoded independently, so a category missing from
# either file would yield different (or differently ordered) feature columns.
# Fail loudly rather than feed misaligned features to the model.
if list(x_train.columns) != list(x_test.columns):
    raise ValueError(
        "train and test feature columns differ: "
        f"{list(x_train.columns)} vs {list(x_test.columns)}"
    )

# stats
print(energy_data_test.describe(include='all'))
print(x_train.describe(include='all'))

normalizer = tf.keras.layers.Normalization(axis=1)
normalizer.adapt(np.array(x_train))
print(normalizer.mean.numpy())

# Should be unchanged: the first raw (un-normalised) training row.
print(np.array(x_train[:1]))

usage_model = tf.keras.Sequential([
    normalizer,
    keras.layers.Dense(units=10, activation='relu'),
    keras.layers.Dense(units=1),
])

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
usage_model.summary()

usage_model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
    loss='mean_absolute_error')

history = usage_model.fit(
    x_train,
    y_train,
    epochs=100,
    # Suppress logging.
    verbose=0,
    # Calculate validation results on 20% of the training data.
    validation_split=0.2)

hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
print(hist.tail())

plot_loss(history)

y_predicted = usage_model.predict(x_test)
test_results = {}
test_results['usage_model'] = usage_model.evaluate(
    x_test,
    y_test, verbose=0)

mean_abs_error = metrics.mean_absolute_error(y_test, y_predicted)
mean_sq_error = metrics.mean_squared_error(y_test, y_predicted)
print('Mean Absolute Error : ', mean_abs_error)
print('Mean Squared Error : ', mean_sq_error)
print('Root Mean Squared Error : ', math.sqrt(mean_sq_error))

print(test_results['usage_model'])

show_result(y_test, y_predicted)