s444439 2023-05-10 22:50:54 +02:00
parent 6f524c5903
commit 00e260e765
3 changed files with 176 additions and 79 deletions

Jenkinsfile_training (new file, +58 lines)

@@ -0,0 +1,58 @@
pipeline {
    parameters {
        string(
            defaultValue: '64',
            description: 'Batch size used in gradient descent',
            name: 'BATCHSIZE',
            trim: true
        )
        string(
            defaultValue: '5',
            description: 'Number of training epochs',
            name: 'EPOCHS',
            trim: true
        )
        gitParameter branchFilter: 'origin/(.*)', defaultValue: 'main', name: 'BRANCH', type: 'PT_BRANCH'
        buildSelector(
            defaultSelector: lastSuccessful(),
            description: 'Which build to use for copying artifacts',
            name: 'BUILD_SELECTOR'
        )
    }
    agent {
        docker {
            image 's444439-create-dataset'
        }
    }
    environment {
        NOTIFICATION_ADDRESS = 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
    }
    stages {
        stage('Train model') {
            steps {
                sh "python neutral_network.py -e ${params.EPOCHS} -b ${params.BATCHSIZE}"
            }
        }
    }
    post {
        success {
            emailext body: 'SUCCESS', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
        failure {
            emailext body: 'FAILURE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
        unstable {
            emailext body: 'UNSTABLE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
        changed {
            emailext body: 'CHANGED', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
    }
}
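The training stage shells out to neutral_network.py with -e/-b flags, but that script is not part of this commit. Below is a minimal sketch of the entry point the Jenkinsfile appears to assume, reusing the training setup from create_model() in script.py; the file name, flags, and defaults come from the sh step above, everything else is an assumption.

# Hypothetical sketch of neutral_network.py; the real script is not in this commit.
import argparse

import pandas as pd
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical


def main():
    parser = argparse.ArgumentParser(description='Train the education-num classifier')
    parser.add_argument('-e', '--epochs', type=int, default=5)
    parser.add_argument('-b', '--batch-size', type=int, default=64)
    args = parser.parse_args()

    # Same data layout as script.py: one-hot encoded features, labels in Y_train.csv.
    X = pd.get_dummies(pd.read_csv('X_train.csv'))
    y = to_categorical(pd.read_csv('Y_train.csv'), num_classes=17)

    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=X.shape[1]))
    model.add(Dense(17, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X, y, epochs=args.epochs, batch_size=args.batch_size)
    model.save('model.h5')


if __name__ == '__main__':
    main()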

predictions.py (new file, +12 lines)

@@ -0,0 +1,12 @@
import tensorflow
import pandas as pd
from keras.utils import to_categorical

model = tensorflow.keras.models.load_model('model.h5')
# Encode the test features the same way as in training: one-hot encode them
# and align the dummy columns with those derived from the training set.
train_columns = pd.get_dummies(pd.read_csv("X_train.csv")).columns
X_test_data = pd.get_dummies(pd.read_csv("X_test.csv")).reindex(columns=train_columns, fill_value=0)
# categorical_crossentropy expects one-hot labels, not raw class values.
Y_test_data = to_categorical(pd.read_csv("Y_test.csv"), num_classes=17)
model.evaluate(X_test_data, Y_test_data)
# model.predict returns a NumPy array, which has no to_csv method.
predictions = model.predict(X_test_data)
pd.DataFrame(predictions).to_csv('predictions.csv', index=False)
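Deriving train_columns from X_train.csv couples evaluation to the training CSV being present on disk. A more self-contained alternative, sketched here under the assumption of a hypothetical columns.json artifact written at training time (not part of this commit):

# Hypothetical: persist the training-time dummy columns next to model.h5
# so that prediction no longer needs X_train.csv.
import json
import pandas as pd

X_train_encoded = pd.get_dummies(pd.read_csv('X_train.csv'))
with open('columns.json', 'w') as f:
    json.dump(list(X_train_encoded.columns), f)

# At prediction time:
with open('columns.json') as f:
    train_columns = json.load(f)
X_test_data = pd.get_dummies(pd.read_csv('X_test.csv')).reindex(columns=train_columns, fill_value=0)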

script.py (185 lines changed)

@@ -1,120 +1,147 @@
import os
import urllib.request
from os.path import exists

import numpy as np
import pandas as pd
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def download_file():
    # Download the UCI Adult data set only if the converted CSV is missing.
    if not exists('adult.csv'):
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
        filename = "adult.data"
        urllib.request.urlretrieve(url, filename)
        convert_data_to_csv()
def convert_data_to_csv():
    data_file = "adult.data"
    csv_file = "adult.csv"
    df = pd.read_csv(data_file, header=None)
    df.to_csv(csv_file, index=False)
    # Remove the raw download once the CSV has been written.
    os.remove(data_file)
def add_subsets_to_csv_file(data):
    data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
                    "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country",
                    "income"]
    X_train_data, X_dev_data, X_test_data = train_dev_test(data)
    print("Data set: ", data.shape)
    print("Train Data set: ", X_train_data.shape)
    print("Dev Data set: ", X_dev_data.shape)
    print("Test Data set: ", X_test_data.shape)
    return data
def check_if_data_set_has_division_into_subsets(file_name):
    data = pd.read_csv(file_name)
    # "train"/"dev"/"test" never appear as columns in a fresh download, so in
    # practice this recreates the split on every run.
    if "train" not in data.columns or "dev" not in data.columns or "test" not in data.columns:
        data_set = add_subsets_to_csv_file(data)
        data_set.to_csv(file_name, index=False)
def get_statistics(data):
    train_data = pd.read_csv("X_train.csv", dtype={"income": "category"})
    dev_data = pd.read_csv("X_dev.csv", dtype={"income": "category"})
    test_data = pd.read_csv("X_test.csv", dtype={"income": "category"})
    print("Data set size: ", len(data))
    print("Training set size: ", len(train_data))
    print("Validation set size: ", len(dev_data))
    print("Test set size: ", len(test_data))
    print("Mean age: ", np.mean(data["age"]))
    print("Minimum age: ", np.min(data["age"]))
    print("Maximum age: ", np.max(data["age"]))
    print("Age standard deviation: ", np.std(data["age"]))
    print("Median age: ", np.median(data["age"]))
    print("Class frequency distribution:")
    freq_dist_all = data['income'].value_counts()
    print('Class label frequencies over the whole data set:')
    print(freq_dist_all)
    freq_dist_train = train_data['income'].value_counts()
    print('Class label frequencies over the training set:')
    print(freq_dist_train)
    freq_dist_test = test_data['income'].value_counts()
    print('Class label frequencies over the test set:')
    print(freq_dist_test)
    freq_dist_dev = dev_data['income'].value_counts()
    print('Class label frequencies over the validation set:')
    print(freq_dist_dev)
def normalization(data):
    numeric_features = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
    numeric_data = data[numeric_features]
    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(numeric_data)
    data[numeric_features] = normalized_data
    print(data.head())
def clean(data):
    data.replace('?', np.nan, inplace=True)
    data.dropna(inplace=True)
    data.drop_duplicates(inplace=True)
    data[['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']] = data[
        ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']].apply(pd.to_numeric)
def train_dev_test(data):
    # Split the target (education-num) away from the features so that the
    # X_*.csv files do not leak the label into the model's inputs.
    X = data.copy()
    y = pd.DataFrame(X.pop('education-num'))
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
    X_dev, X_test, Y_dev, Y_test = train_test_split(X_temp, Y_temp, test_size=0.3, random_state=1)
    X_train.to_csv('X_train.csv', index=False)
    X_dev.to_csv('X_dev.csv', index=False)
    X_test.to_csv('X_test.csv', index=False)
    Y_train.to_csv('Y_train.csv', index=False)
    Y_dev.to_csv('Y_dev.csv', index=False)
    Y_test.to_csv('Y_test.csv', index=False)
    return X_train, X_dev, X_test
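# Note on the nested split: the first train_test_split keeps 70% for training;
# splitting the remaining 30% again with test_size=0.3 yields 0.7 * 0.3 = 21%
# dev and 0.3 * 0.3 = 9% test of the full data set.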
def create_model():
    # Features and labels are written out separately by train_dev_test().
    X_train_encoded = pd.get_dummies(pd.read_csv('X_train.csv'))
    y_train_cat = to_categorical(pd.read_csv('Y_train.csv'), num_classes=17)
    # Encode the dev split the same way and align its columns so it can be
    # used as validation data instead of re-validating on the training set.
    X_dev_encoded = pd.get_dummies(pd.read_csv('X_dev.csv')).reindex(
        columns=X_train_encoded.columns, fill_value=0)
    y_dev_cat = to_categorical(pd.read_csv('Y_dev.csv'), num_classes=17)
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=X_train_encoded.shape[1]))
    model.add(Dense(17, activation='softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(X_train_encoded, y_train_cat, epochs=10, batch_size=32,
              validation_data=(X_dev_encoded, y_dev_cat))
    model.save('model.h5')
if __name__ == '__main__':
    download_file()
    csv_file_name = 'adult.csv'
    check_if_data_set_has_division_into_subsets(csv_file_name)
    data = pd.read_csv(csv_file_name, dtype={"income": "category"})
    get_statistics(data)
    normalization(data)
    clean(data)
    create_model()
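Assuming the s444439-create-dataset image provides Python with tensorflow/keras, pandas, numpy, and scikit-learn, the flow can also be reproduced locally: script.py downloads and splits the data and saves model.h5, after which predictions.py evaluates the model on the held-out test split and writes predictions.csv.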