s444439 2023-05-10 22:50:54 +02:00
parent 6f524c5903
commit 00e260e765
3 changed files with 176 additions and 79 deletions

Jenkinsfile_training Normal file

@@ -0,0 +1,58 @@
pipeline {
    parameters {
        string(
            defaultValue: '64',
            description: 'Batch size used in gradient',
            name: 'BATCHSIZE',
            trim: true
        )
        string(
            defaultValue: '5',
            description: 'Number of iterations',
            name: 'EPOCHS',
            trim: true
        )
        gitParameter branchFilter: 'origin/(.*)', defaultValue: 'main', name: 'BRANCH', type: 'PT_BRANCH'
        buildSelector(
            defaultSelector: lastSuccessful(),
            description: 'Which build to use for copying artifacts',
            name: 'BUILD_SELECTOR'
        )
    }
    agent {
        docker {
            image 's444439-create-dataset'
        }
    }
    stages {
        stage('Train model') {
            steps {
                sh "python neutral_network.py -e ${params.EPOCHS} -b ${params.BATCHSIZE}"
            }
        }
    }
    environment {
        NOTIFICATION_ADDRESS = 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
    }
    post {
        success {
            emailext body: 'SUCCESS', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
        failure {
            emailext body: 'FAILURE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
        unstable {
            emailext body: 'UNSTABLE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
        changed {
            emailext body: 'CHANGED', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
    }
}
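The training stage hands the two pipeline parameters to the script as -e and -b flags. A minimal sketch of the argument parsing that neutral_network.py would need in order to accept that invocation (the script itself is not shown in this commit, so the long-form flag names and defaults here are assumptions based on the sh step and parameter descriptions above):

# Hypothetical CLI sketch matching: python neutral_network.py -e <EPOCHS> -b <BATCHSIZE>
import argparse

parser = argparse.ArgumentParser(description='Train the model')
parser.add_argument('-e', '--epochs', type=int, default=5,
                    help='number of training iterations')
parser.add_argument('-b', '--batch-size', type=int, default=64,
                    help='batch size used in gradient updates')
args = parser.parse_args()
# The parsed values would then be forwarded to
# model.fit(..., epochs=args.epochs, batch_size=args.batch_size).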

predictions.py Normal file

@@ -0,0 +1,12 @@
import tensorflow
import pandas as pd

# Load the model trained by create_model() and the held-out test split.
model = tensorflow.keras.models.load_model('model.h5')
X_test_data = pd.read_csv("X_test.csv").astype(float)
Y_test_data = pd.read_csv("Y_test.csv").astype(float)

model.evaluate(X_test_data, Y_test_data)

# model.predict returns a NumPy array, which has no to_csv method;
# wrap it in a DataFrame before writing it out.
predictions = model.predict(X_test_data)
pd.DataFrame(predictions).to_csv('predictions.csv', index=False)
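predict returns one softmax row per test example (17 class probabilities, matching the output layer in create_model). If discrete labels are wanted instead of raw probabilities, an argmax over the class axis recovers them; a sketch, not part of this commit (the output file name is made up):

import numpy as np
import pandas as pd

# predictions has shape (n_samples, 17); argmax picks the most probable class.
predicted_labels = np.argmax(predictions, axis=1)
pd.DataFrame({'education-num': predicted_labels}).to_csv('predicted_labels.csv', index=False)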


@@ -1,17 +1,24 @@
 import os
 import urllib.request
+from os.path import exists
+
+import pandas
+from keras.layers import Dense
+from keras.models import Sequential
 import pandas as pd
 import numpy as np
+from keras.utils import to_categorical
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler


 def download_file():
-    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
-    filename = "adult.data"
-    urllib.request.urlretrieve(url, filename)
-    csv_file = convert_data_to_csv()
-    return csv_file
+    file_exist = exists('/adult.csv')
+    if not file_exist:
+        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
+        filename = "adult.data"
+        urllib.request.urlretrieve(url, filename)
+        convert_data_to_csv()


 def convert_data_to_csv():
@@ -19,11 +26,6 @@ def convert_data_to_csv():
     csv_file = "adult.csv"
     df = pd.read_csv(data_file, header=None)
     df.to_csv(csv_file, index=False)
-    # delete_data_file()
-    return csv_file
-
-
-
 def delete_data_file():
     filename = "adult.data"
     os.remove(filename)
@@ -33,20 +35,12 @@ def add_subsets_to_csv_file(data):
                "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country",
                "income"]
-    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
-    if len(train_data) > len(test_data):
-        train_data, dev_data = train_test_split(train_data, test_size=0.25, random_state=42)
-    else:
-        dev_data = pd.DataFrame()
-
-    train_data.to_csv("adult_train.csv", index=False)
-    dev_data.to_csv("adult_dev.csv", index=False)
-    test_data.to_csv("adult_test.csv", index=False)
+    X_train_data, X_dev_data, X_test_data = train_dev_test(data)

     print("Data set: ", data.shape)
-    print("Train Data set: ", train_data.shape)
-    print("Dev Data set: ", dev_data.shape)
-    print("Test Data set: ", test_data.shape)
+    print("Train Data set: ", X_train_data.shape)
+    print("Dev Data set: ", X_dev_data.shape)
+    print("Test Data set: ", X_test_data.shape)
     return data
@@ -59,9 +53,9 @@ def check_if_data_set_has_division_into_subsets(file_name):

 def get_statistics(data):
-    train_data = pd.read_csv("adult_train.csv", dtype={"income": "category"})
-    dev_data = pd.read_csv("adult_dev.csv", dtype={"income": "category"})
-    test_data = pd.read_csv("adult_test.csv", dtype={"income": "category"})
+    train_data = pd.read_csv("X_train.csv", dtype={"income": "category"})
+    dev_data = pd.read_csv("X_dev.csv", dtype={"income": "category"})
+    test_data = pd.read_csv("X_test.csv", dtype={"income": "category"})

     print("Dataset size: ", len(data))
     print("Training set size: ", len(train_data))
@@ -92,7 +86,7 @@ def get_statistics(data):

 def normalization(data):
-    numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
+    numeric_features = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
     numeric_data = data[numeric_features]
     scaler = StandardScaler()
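For reference, StandardScaler rescales each numeric column to zero mean and unit variance, i.e. x → (x − mean) / std. A tiny self-contained example of the transform this function applies to the remaining numeric features:

import numpy as np
from sklearn.preprocessing import StandardScaler

hours = np.array([[40.0], [50.0], [60.0]])  # e.g. an hours-per-week column
scaled = StandardScaler().fit_transform(hours)
# mean = 50, population std ≈ 8.165, so this prints roughly
# [[-1.2247], [0.], [1.2247]]
print(scaled)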
@@ -107,14 +101,47 @@ def clean(data):
     data.replace('?', np.nan, inplace=True)
     data.dropna(inplace=True)
     data.drop_duplicates(inplace=True)
-    data[['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']] = data[
-        ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']].apply(pd.to_numeric)
+    data[['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']] = data[
+        ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']].apply(pd.to_numeric)
+
+
+def train_dev_test(data):
+    X = data.copy()
+    y = pandas.DataFrame(data.pop('education-num'))
+    X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
+    X_dev, X_test, Y_dev, Y_test = train_test_split(X_temp, Y_temp, test_size=0.3, random_state=1)
+    X_train.to_csv('X_train.csv', index=False)
+    X_dev.to_csv('X_dev.csv', index=False)
+    X_test.to_csv('X_test.csv', index=False)
+    Y_test.to_csv('Y_test.csv', index=False)
+    Y_train.to_csv('Y_train.csv', index=False)
+    Y_dev.to_csv('Y_dev.csv', index=False)
+    return X_train, X_dev, X_test
+
+
+def create_model():
+    data = pd.read_csv('X_train.csv')
+    X = data.copy()
+    y = data["education-num"]
+    X_train_encoded = pd.get_dummies(X)
+    y_train_cat = to_categorical(y)
+    model = Sequential()
+    model.add(Dense(64, activation='relu', input_dim=X_train_encoded.shape[1]))
+    model.add(Dense(17, activation='softmax'))
+    model.compile(optimizer='adam',
+                  loss='categorical_crossentropy',
+                  metrics=['accuracy'])
+    model.fit(X_train_encoded, y_train_cat, epochs=10, batch_size=32, validation_data=(X_train_encoded, y_train_cat))
+    model.save('model.h5')


 if __name__ == '__main__':
-    csv_file_name = download_file()
-    # check_if_data_set_has_division_into_subsets(csv_file_name)
-    # data = pd.read_csv(csv_file_name, dtype={"income": "category"})
-    # get_statistics(data)
-    # normalization(data)
-    # clean(data)
+    download_file()
+    csv_file_name = 'adult.csv'
+    check_if_data_set_has_division_into_subsets('adult.csv')
+    data = pd.read_csv(csv_file_name, dtype={"income": "category"})
+    get_statistics(data)
+    normalization(data)
+    clean(data)
+    create_model()
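The two chained train_test_split calls in train_dev_test yield roughly a 70/21/9 train/dev/test split: 30% of the data is held out first, and that holdout is then split 70/30 into dev and test. A quick sanity check with dummy data:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.DataFrame({'x': np.arange(1000)})
train, temp = train_test_split(data, test_size=0.3, random_state=1)
dev, test = train_test_split(temp, test_size=0.3, random_state=1)
print(len(train), len(dev), len(test))  # 700 210 90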