This commit is contained in:
s444439 2023-05-10 22:50:54 +02:00
parent 6f524c5903
commit 00e260e765
3 changed files with 176 additions and 79 deletions

58
Jenkinsfile_training Normal file
View File

@ -0,0 +1,58 @@
pipeline {
parameters {
string(
defaultValue: '64',
description: 'Batch size used in gradient',
name: 'BATCHSIZE',
trim: true
)
string(
defaultValue: '5',
description: 'Number of iterations',
name: 'EPOCHS',
trim: true
)
gitParameter branchFilter: 'origin/(.*)', defaultValue: 'main', name: 'BRANCH', type: 'PT_BRANCH'
buildSelector(
defaultSelector: lastSuccessful(),
description: 'Which build to use for copying artifacts',
name: 'BUILD_SELECTOR'
)
}
agent {
docker {
image 's444439-create-dataset'
}
}
stages {
stage('Train model') {
steps {
sh "python neutral_network.py -e ${params.EPOCHS} -b ${params.BATCHSIZE}"
}
}
}
environment {
NOTIFICATION_ADDRESS = 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
}
post {
success {
emailext body: 'SUCCESS', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
}
failure {
emailext body: 'FAILURE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
}
unstable {
emailext body: 'UNSTABLE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
}
changed {
emailext body: 'CHANGED', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
}
}
}

12
predictions.py Normal file
View File

@ -0,0 +1,12 @@
import tensorflow
import pandas as pd
model = tensorflow.keras.models.load_model('model.h5')
X_test_data = pd.read_csv("X_test.csv").astype(float)
Y_test_data = pd.read_csv("Y_test.csv").astype(float)
model.evaluate(X_test_data, Y_test_data)
predictions = model.predict(X_test_data)
predictions.to_csv('predictions.csv', index=False)

185
script.py
View File

@ -1,120 +1,147 @@
import os
import urllib.request
from os.path import exists
import pandas
from keras.layers import Dense
from keras.models import Sequential
import pandas as pd
import numpy as np
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def download_file():
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
filename = "adult.data"
urllib.request.urlretrieve(url, filename)
csv_file = convert_data_to_csv()
return csv_file
file_exist = exists('/adult.csv')
if not file_exist:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
filename = "adult.data"
urllib.request.urlretrieve(url, filename)
convert_data_to_csv()
def convert_data_to_csv():
data_file = "adult.data"
csv_file = "adult.csv"
df = pd.read_csv(data_file, header=None)
df.to_csv(csv_file, index=False)
# delete_data_file()
return csv_file
def delete_data_file():
filename = "adult.data"
os.remove(filename)
data_file = "adult.data"
csv_file = "adult.csv"
df = pd.read_csv(data_file, header=None)
df.to_csv(csv_file, index=False)
filename = "adult.data"
os.remove(filename)
def add_subsets_to_csv_file(data):
data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
"relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country",
"income"]
data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
"relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country",
"income"]
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
if len(train_data) > len(test_data):
train_data, dev_data = train_test_split(train_data, test_size=0.25, random_state=42)
else:
dev_data = pd.DataFrame()
X_train_data, X_dev_data, X_test_data = train_dev_test(data)
train_data.to_csv("adult_train.csv", index=False)
dev_data.to_csv("adult_dev.csv", index=False)
test_data.to_csv("adult_test.csv", index=False)
print("Data set: ", data.shape)
print("Train Data set: ", train_data.shape)
print("Dev Data set: ", dev_data.shape)
print("Test Data set: ", test_data.shape)
return data
print("Data set: ", data.shape)
print("Train Data set: ", X_train_data.shape)
print("Dev Data set: ", X_dev_data.shape)
print("Test Data set: ", X_test_data.shape)
return data
def check_if_data_set_has_division_into_subsets(file_name):
data = pd.read_csv(file_name)
data = pd.read_csv(file_name)
if "train" not in data.columns or "dev" not in data.columns or "test" not in data.columns:
data_set = add_subsets_to_csv_file(data)
data_set.to_csv(file_name, index=False)
if "train" not in data.columns or "dev" not in data.columns or "test" not in data.columns:
data_set = add_subsets_to_csv_file(data)
data_set.to_csv(file_name, index=False)
def get_statistics(data):
train_data = pd.read_csv("adult_train.csv", dtype={"income": "category"})
dev_data = pd.read_csv("adult_dev.csv", dtype={"income": "category"})
test_data = pd.read_csv("adult_test.csv", dtype={"income": "category"})
train_data = pd.read_csv("X_train.csv", dtype={"income": "category"})
dev_data = pd.read_csv("X_dev.csv", dtype={"income": "category"})
test_data = pd.read_csv("X_test.csv", dtype={"income": "category"})
print("Wielkość zbioru: ", len(data))
print("Wielkość zbioru treningowego: ", len(train_data))
print("Wielkość zbioru walidacyjnego: ", len(dev_data))
print("Wielkość zbioru testowego: ", len(test_data))
print("Średnia wartość wieku: ", np.mean(data["age"]))
print("Minimalna wartość wieku: ", np.min(data["age"]))
print("Maksymalna wartość wieku: ", np.max(data["age"]))
print("Odchylenie standardowe wartości wieku: ", np.std(data["age"]))
print("Mediana wartości wieku: ", np.median(data["age"]))
print("Wielkość zbioru: ", len(data))
print("Wielkość zbioru treningowego: ", len(train_data))
print("Wielkość zbioru walidacyjnego: ", len(dev_data))
print("Wielkość zbioru testowego: ", len(test_data))
print("Średnia wartość wieku: ", np.mean(data["age"]))
print("Minimalna wartość wieku: ", np.min(data["age"]))
print("Maksymalna wartość wieku: ", np.max(data["age"]))
print("Odchylenie standardowe wartości wieku: ", np.std(data["age"]))
print("Mediana wartości wieku: ", np.median(data["age"]))
print("Rozkład częstości klas: ")
freq_dist_all = data['income'].value_counts()
print('Rozkład częstości etykiet klas na całym zbiorze danych:')
print(freq_dist_all)
print("Rozkład częstości klas: ")
freq_dist_all = data['income'].value_counts()
print('Rozkład częstości etykiet klas na całym zbiorze danych:')
print(freq_dist_all)
freq_dist_train = train_data['income'].value_counts()
print('Rozkład częstości etykiet klas na zbiorze treningowym:')
print(freq_dist_train)
freq_dist_train = train_data['income'].value_counts()
print('Rozkład częstości etykiet klas na zbiorze treningowym:')
print(freq_dist_train)
freq_dist_test = test_data['income'].value_counts()
print('Rozkład częstości etykiet klas na zbiorze testowym:')
print(freq_dist_test)
freq_dist_test = test_data['income'].value_counts()
print('Rozkład częstości etykiet klas na zbiorze testowym:')
print(freq_dist_test)
freq_dist_dev = dev_data['income'].value_counts()
print('Rozkład częstości etykiet klas na zbiorze walidacyjnym:')
print(freq_dist_dev)
freq_dist_dev = dev_data['income'].value_counts()
print('Rozkład częstości etykiet klas na zbiorze walidacyjnym:')
print(freq_dist_dev)
def normalization(data):
numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
numeric_data = data[numeric_features]
numeric_features = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
numeric_data = data[numeric_features]
scaler = StandardScaler()
normalized_data = scaler.fit_transform(numeric_data)
scaler = StandardScaler()
normalized_data = scaler.fit_transform(numeric_data)
data[numeric_features] = normalized_data
data[numeric_features] = normalized_data
print(data.head())
print(data.head())
def clean(data):
data.replace('?', np.nan, inplace=True)
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)
data[['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']] = data[
['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']].apply(pd.to_numeric)
data.replace('?', np.nan, inplace=True)
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)
data[['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']] = data[
['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']].apply(pd.to_numeric)
def train_dev_test(data):
X = data.copy()
y = pandas.DataFrame(data.pop('education-num'))
X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
X_dev, X_test, Y_dev, Y_test = train_test_split(X_temp, Y_temp, test_size=0.3, random_state=1)
X_train.to_csv('X_train.csv', index=False)
X_dev.to_csv('X_dev.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
Y_test.to_csv('Y_test.csv', index=False)
Y_train.to_csv('Y_train.csv', index=False)
Y_dev.to_csv('Y_dev.csv', index=False)
return X_train, X_dev, X_test
def create_model():
data = pd.read_csv('X_train.csv')
X = data.copy()
y = data["education-num"]
X_train_encoded = pd.get_dummies(X)
y_train_cat = to_categorical(y)
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train_encoded.shape[1]))
model.add(Dense(17, activation='softmax'))
model.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'])
model.fit(X_train_encoded, y_train_cat, epochs=10, batch_size=32, validation_data=(X_train_encoded, y_train_cat))
model.save('model.h5')
if __name__ == '__main__':
csv_file_name = download_file()
# check_if_data_set_has_division_into_subsets(csv_file_name)
# data = pd.read_csv(csv_file_name, dtype={"income": "category"})
# get_statistics(data)
# normalization(data)
# clean(data)
download_file()
csv_file_name = 'adult.csv'
check_if_data_set_has_division_into_subsets('adult.csv')
data = pd.read_csv(csv_file_name, dtype={"income": "category"})
get_statistics(data)
normalization(data)
clean(data)
create_model()