From 00e260e7659520ed57dcbeb33c01987ca6072a55 Mon Sep 17 00:00:00 2001
From: s444439
Date: Wed, 10 May 2023 22:50:54 +0200
Subject: [PATCH] fix(wip): add Jenkins training pipeline, prediction script,
 and train/dev/test split

---
 Jenkinsfile_training |  58 ++++++++++++++
 predictions.py       |  12 +++
 script.py            | 185 +++++++++++++++++++++++++------------------
 3 files changed, 176 insertions(+), 79 deletions(-)
 create mode 100644 Jenkinsfile_training
 create mode 100644 predictions.py

diff --git a/Jenkinsfile_training b/Jenkinsfile_training
new file mode 100644
index 0000000..4cdea5b
--- /dev/null
+++ b/Jenkinsfile_training
@@ -0,0 +1,58 @@
+pipeline {
+    parameters {
+        string(
+            defaultValue: '64',
+            description: 'Batch size used in gradient descent',
+            name: 'BATCHSIZE',
+            trim: true
+        )
+        string(
+            defaultValue: '5',
+            description: 'Number of training epochs',
+            name: 'EPOCHS',
+            trim: true
+        )
+        gitParameter branchFilter: 'origin/(.*)', defaultValue: 'main', name: 'BRANCH', type: 'PT_BRANCH'
+        buildSelector(
+            defaultSelector: lastSuccessful(),
+            description: 'Which build to use for copying artifacts',
+            name: 'BUILD_SELECTOR'
+        )
+    }
+
+    agent {
+        docker {
+            image 's444439-create-dataset'
+        }
+    }
+
+    stages {
+        stage('Train model') {
+            steps {
+                sh "python neutral_network.py -e ${params.EPOCHS} -b ${params.BATCHSIZE}"
+            }
+        }
+    }
+
+    environment {
+        NOTIFICATION_ADDRESS = 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
+    }
+
+    post {
+        success {
+            emailext body: 'SUCCESS', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
+        }
+
+        failure {
+            emailext body: 'FAILURE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
+        }
+
+        unstable {
+            emailext body: 'UNSTABLE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
+        }
+
+        changed {
+            emailext body: 'CHANGED', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
+        }
+    }
+}
\ No newline at end of file
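Note: the Train model stage shells out to neutral_network.py, which is not part of this patch, and Jenkins string parameters arrive as plain text on the command line. A minimal sketch of the argument handling such a script would need (the script name and the -e/-b flags come from the sh step above; everything else here is assumed):

    import argparse

    # Parse the flags the Jenkins stage passes on the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--epochs', type=int, default=5,
                        help='number of training epochs')
    parser.add_argument('-b', '--batch-size', type=int, default=64,
                        help='batch size used in gradient descent')
    args = parser.parse_args()
    print(args.epochs, args.batch_size)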
diff --git a/predictions.py b/predictions.py
new file mode 100644
index 0000000..8c8e433
--- /dev/null
+++ b/predictions.py
@@ -0,0 +1,12 @@
+import tensorflow
+import pandas as pd
+from keras.utils import to_categorical
+
+model = tensorflow.keras.models.load_model('model.h5')
+# Encode features and labels the same way as during training in script.py.
+X_test_data = pd.get_dummies(pd.read_csv("X_test.csv"))
+Y_test_data = to_categorical(pd.read_csv("Y_test.csv")["education-num"], num_classes=17)
+
+model.evaluate(X_test_data, Y_test_data)
+predictions = model.predict(X_test_data)
+pd.DataFrame(predictions).to_csv('predictions.csv', index=False)
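Note: pd.get_dummies only creates columns for the categories present in the frame it is given, so X_test.csv can encode to a different column set than the training data did, and the loaded model will then reject the input shape. One way to align the two, sketched under the assumption that the training columns are re-derived from X_train.csv (this patch does not persist them anywhere):

    import pandas as pd

    # Learn the column layout the model was trained on.
    train_columns = pd.get_dummies(pd.read_csv('X_train.csv')).columns
    # Encode the test features, then force the training layout, filling
    # categories unseen at test time with zeros.
    X_test_encoded = pd.get_dummies(pd.read_csv('X_test.csv'))
    X_test_encoded = X_test_encoded.reindex(columns=train_columns, fill_value=0)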
diff --git a/script.py b/script.py
index 0da8754..46f4da0 100644
--- a/script.py
+++ b/script.py
@@ -1,120 +1,147 @@
 import os
 import urllib.request
+from os.path import exists
+
+from keras.layers import Dense
+from keras.models import Sequential
 import pandas as pd
 import numpy as np
+from keras.utils import to_categorical
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
 
 
 def download_file():
-    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
-    filename = "adult.data"
-    urllib.request.urlretrieve(url, filename)
-    csv_file = convert_data_to_csv()
-    return csv_file
+    # Skip the download when the converted file is already present.
+    if not exists('adult.csv'):
+        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
+        filename = "adult.data"
+        urllib.request.urlretrieve(url, filename)
+        convert_data_to_csv()
 
 
 def convert_data_to_csv():
-    data_file = "adult.data"
-    csv_file = "adult.csv"
-    df = pd.read_csv(data_file, header=None)
-    df.to_csv(csv_file, index=False)
-    # delete_data_file()
-    return csv_file
-
-
-def delete_data_file():
-    filename = "adult.data"
-    os.remove(filename)
+    data_file = "adult.data"
+    csv_file = "adult.csv"
+    df = pd.read_csv(data_file, header=None)
+    df.to_csv(csv_file, index=False)
+    # The raw download is no longer needed once the CSV is written.
+    os.remove(data_file)
 
 
 def add_subsets_to_csv_file(data):
-    data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
-                    "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country",
-                    "income"]
+    data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
+                    "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country",
+                    "income"]
 
-    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
-    if len(train_data) > len(test_data):
-        train_data, dev_data = train_test_split(train_data, test_size=0.25, random_state=42)
-    else:
-        dev_data = pd.DataFrame()
+    X_train_data, X_dev_data, X_test_data = train_dev_test(data)
 
-    train_data.to_csv("adult_train.csv", index=False)
-    dev_data.to_csv("adult_dev.csv", index=False)
-    test_data.to_csv("adult_test.csv", index=False)
-
-    print("Data set: ", data.shape)
-    print("Train Data set: ", train_data.shape)
-    print("Dev Data set: ", dev_data.shape)
-    print("Test Data set: ", test_data.shape)
-    return data
+    print("Data set: ", data.shape)
+    print("Train Data set: ", X_train_data.shape)
+    print("Dev Data set: ", X_dev_data.shape)
+    print("Test Data set: ", X_test_data.shape)
+    return data
 
 
 def check_if_data_set_has_division_into_subsets(file_name):
-    data = pd.read_csv(file_name)
+    data = pd.read_csv(file_name)
 
-    if "train" not in data.columns or "dev" not in data.columns or "test" not in data.columns:
-        data_set = add_subsets_to_csv_file(data)
-        data_set.to_csv(file_name, index=False)
+    # The presence of the split files, not CSV columns, marks an existing division.
+    if not (exists('X_train.csv') and exists('X_dev.csv') and exists('X_test.csv')):
+        data_set = add_subsets_to_csv_file(data)
+        data_set.to_csv(file_name, index=False)
 
 
 def get_statistics(data):
-    train_data = pd.read_csv("adult_train.csv", dtype={"income": "category"})
-    dev_data = pd.read_csv("adult_dev.csv", dtype={"income": "category"})
-    test_data = pd.read_csv("adult_test.csv", dtype={"income": "category"})
+    train_data = pd.read_csv("X_train.csv", dtype={"income": "category"})
+    dev_data = pd.read_csv("X_dev.csv", dtype={"income": "category"})
+    test_data = pd.read_csv("X_test.csv", dtype={"income": "category"})
 
-    print("Wielkość zbioru: ", len(data))
-    print("Wielkość zbioru treningowego: ", len(train_data))
-    print("Wielkość zbioru walidacyjnego: ", len(dev_data))
-    print("Wielkość zbioru testowego: ", len(test_data))
-    print("Średnia wartość wieku: ", np.mean(data["age"]))
-    print("Minimalna wartość wieku: ", np.min(data["age"]))
-    print("Maksymalna wartość wieku: ", np.max(data["age"]))
-    print("Odchylenie standardowe wartości wieku: ", np.std(data["age"]))
-    print("Mediana wartości wieku: ", np.median(data["age"]))
+    print("Dataset size: ", len(data))
+    print("Training set size: ", len(train_data))
+    print("Validation set size: ", len(dev_data))
+    print("Test set size: ", len(test_data))
+    print("Mean age: ", np.mean(data["age"]))
+    print("Minimum age: ", np.min(data["age"]))
+    print("Maximum age: ", np.max(data["age"]))
+    print("Age standard deviation: ", np.std(data["age"]))
+    print("Median age: ", np.median(data["age"]))
 
-    print("Rozkład częstości klas: ")
-    freq_dist_all = data['income'].value_counts()
-    print('Rozkład częstości etykiet klas na całym zbiorze danych:')
-    print(freq_dist_all)
+    freq_dist_all = data['income'].value_counts()
+    print('Class label distribution over the whole dataset:')
+    print(freq_dist_all)
 
-    freq_dist_train = train_data['income'].value_counts()
-    print('Rozkład częstości etykiet klas na zbiorze treningowym:')
-    print(freq_dist_train)
+    freq_dist_train = train_data['income'].value_counts()
+    print('Class label distribution on the training set:')
+    print(freq_dist_train)
 
-    freq_dist_test = test_data['income'].value_counts()
-    print('Rozkład częstości etykiet klas na zbiorze testowym:')
-    print(freq_dist_test)
+    freq_dist_test = test_data['income'].value_counts()
+    print('Class label distribution on the test set:')
+    print(freq_dist_test)
 
-    freq_dist_dev = dev_data['income'].value_counts()
-    print('Rozkład częstości etykiet klas na zbiorze walidacyjnym:')
-    print(freq_dist_dev)
+    freq_dist_dev = dev_data['income'].value_counts()
+    print('Class label distribution on the validation set:')
+    print(freq_dist_dev)
 
 
 def normalization(data):
-    numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
-    numeric_data = data[numeric_features]
+    # 'education-num' is excluded here: it is the prediction target now.
+    numeric_features = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
+    numeric_data = data[numeric_features]
 
-    scaler = StandardScaler()
-    normalized_data = scaler.fit_transform(numeric_data)
+    scaler = StandardScaler()
+    normalized_data = scaler.fit_transform(numeric_data)
 
-    data[numeric_features] = normalized_data
+    data[numeric_features] = normalized_data
 
-    print(data.head())
+    print(data.head())
 
 
 def clean(data):
-    data.replace('?', np.nan, inplace=True)
-    data.dropna(inplace=True)
-    data.drop_duplicates(inplace=True)
-    data[['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']] = data[
-        ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']].apply(pd.to_numeric)
+    data.replace('?', np.nan, inplace=True)
+    data.dropna(inplace=True)
+    data.drop_duplicates(inplace=True)
+    data[['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']] = data[
+        ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']].apply(pd.to_numeric)
+
+
+def train_dev_test(data):
+    X = data.copy()
+    # Pop the target from the copy so the saved features do not leak the label.
+    y = pd.DataFrame(X.pop('education-num'))
+    # 70% train; the remaining 30% is split 70/30 into dev (21%) and test (9%).
+    X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
+    X_dev, X_test, Y_dev, Y_test = train_test_split(X_temp, Y_temp, test_size=0.3, random_state=1)
+
+    X_train.to_csv('X_train.csv', index=False)
+    X_dev.to_csv('X_dev.csv', index=False)
+    X_test.to_csv('X_test.csv', index=False)
+    Y_train.to_csv('Y_train.csv', index=False)
+    Y_dev.to_csv('Y_dev.csv', index=False)
+    Y_test.to_csv('Y_test.csv', index=False)
+    return X_train, X_dev, X_test
+
+
+def create_model():
+    X = pd.read_csv('X_train.csv')
+    y = pd.read_csv('Y_train.csv')["education-num"]
+    # One-hot encode the categorical features.
+    X_train_encoded = pd.get_dummies(X)
+    y_train_cat = to_categorical(y, num_classes=17)
+    model = Sequential()
+    model.add(Dense(64, activation='relu', input_dim=X_train_encoded.shape[1]))
+    model.add(Dense(17, activation='softmax'))
+    model.compile(optimizer='adam',
+                  loss='categorical_crossentropy',
+                  metrics=['accuracy'])
+    model.fit(X_train_encoded, y_train_cat, epochs=10, batch_size=32, validation_data=(X_train_encoded, y_train_cat))
+    model.save('model.h5')
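Note on the output layer width: education-num takes the values 1..16 and to_categorical sizes its matrix as max label + 1, so class 0 exists as an always-empty column and the softmax layer needs 17 units. A standalone check (labels made up for illustration):

    import numpy as np
    from keras.utils import to_categorical

    y = np.array([1, 5, 16])          # education-num style labels
    print(to_categorical(y).shape)    # (3, 17): columns 0..16, column 0 unused

Passing num_classes=17 explicitly, as the training and prediction code does, keeps the width stable even when some label value is absent from a given split.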
 if __name__ == '__main__':
-    csv_file_name = download_file()
-    # check_if_data_set_has_division_into_subsets(csv_file_name)
-    # data = pd.read_csv(csv_file_name, dtype={"income": "category"})
-    # get_statistics(data)
-    # normalization(data)
-    # clean(data)
+    download_file()
+    csv_file_name = 'adult.csv'
+    check_if_data_set_has_division_into_subsets(csv_file_name)
+    data = pd.read_csv(csv_file_name, dtype={"income": "category"})
+    clean(data)
+    get_statistics(data)
+    normalization(data)
+    create_model()
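Note: normalization() fits its StandardScaler on the full dataset, and its result is never written back to disk, so the model actually trains on unscaled features. If scaled features are ever fed to the network, the scaler should be fit on the training split only and reused for dev and test, otherwise test statistics leak into training. A sketch, reusing the numeric column list from normalization():

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    numeric = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
    X_train = pd.read_csv('X_train.csv')
    X_dev = pd.read_csv('X_dev.csv')

    scaler = StandardScaler().fit(X_train[numeric])  # statistics from train only
    X_train[numeric] = scaler.transform(X_train[numeric])
    X_dev[numeric] = scaler.transform(X_dev[numeric])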