fix(wip)
This commit is contained in:
parent
6f524c5903
commit
00e260e765
58
Jenkinsfile_training
Normal file
58
Jenkinsfile_training
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
// Declarative training pipeline: parameterized batch size / epoch count,
// runs the training script inside the dataset-creation Docker image and
// mails the build outcome to the Teams channel address.
pipeline {
    parameters {
        // Mini-batch size forwarded to the training script.
        string(
            defaultValue: '64',
            description: 'Batch size used in gradient',
            name: 'BATCHSIZE',
            trim: true
        )
        // Number of training epochs forwarded to the training script.
        string(
            defaultValue: '5',
            description: 'Number of iterations',
            name: 'EPOCHS',
            trim: true
        )
        // Branch chooser (Git Parameter plugin).
        gitParameter branchFilter: 'origin/(.*)', defaultValue: 'main', name: 'BRANCH', type: 'PT_BRANCH'
        // NOTE(review): BUILD_SELECTOR is declared but no copyArtifacts step
        // consumes it in this pipeline — confirm whether it is still needed.
        buildSelector(
            defaultSelector: lastSuccessful(),
            description: 'Which build to use for copying artifacts',
            name: 'BUILD_SELECTOR'
        )
    }

    agent {
        docker {
            image 's444439-create-dataset'
        }
    }

    stages {
        stage('Train model') {
            steps {
                // NOTE(review): script name is spelled "neutral_network.py" —
                // verify the file is not actually "neural_network.py".
                sh "python neutral_network.py -e ${params.EPOCHS} -b ${params.BATCHSIZE}"
            }
        }
    }

    environment {
        NOTIFICATION_ADDRESS = 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
    }

    // One notification per terminal build state.
    post {
        success {
            emailext body: 'SUCCESS', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
        failure {
            emailext body: 'FAILURE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
        unstable {
            emailext body: 'UNSTABLE', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
        changed {
            emailext body: 'CHANGED', subject: "${env.JOB_NAME}", to: "${env.NOTIFICATION_ADDRESS}"
        }
    }
}
|
12
predictions.py
Normal file
12
predictions.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
import tensorflow
import pandas as pd

# Evaluate the trained model on the held-out test split and persist its
# predictions to CSV.
model = tensorflow.keras.models.load_model('model.h5')
X_test_data = pd.read_csv("X_test.csv").astype(float)
Y_test_data = pd.read_csv("Y_test.csv").astype(float)

model.evaluate(X_test_data, Y_test_data)

predictions = model.predict(X_test_data)

# FIX: model.predict returns a NumPy array, which has no .to_csv —
# wrap it in a DataFrame before writing.
pd.DataFrame(predictions).to_csv('predictions.csv', index=False)
|
89
script.py
89
script.py
@ -1,17 +1,24 @@
|
|||||||
import os
|
import os
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
from os.path import exists
|
||||||
|
|
||||||
|
import pandas
|
||||||
|
from keras.layers import Dense
|
||||||
|
from keras.models import Sequential
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from keras.utils import to_categorical
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
from sklearn.preprocessing import StandardScaler
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
|
||||||
|
|
||||||
def download_file():
    """Download the UCI Adult data set (if not already present) and convert it to CSV.

    Fetches ``adult.data`` into the working directory and delegates the
    CSV conversion to :func:`convert_data_to_csv`.
    """
    # FIX: the original checked exists('/adult.csv') — an absolute path at the
    # filesystem root — so the guard never matched the 'adult.csv' produced in
    # the working directory and the file was re-downloaded on every run.
    file_exist = exists('adult.csv')
    if not file_exist:
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
        filename = "adult.data"
        urllib.request.urlretrieve(url, filename)
        convert_data_to_csv()
|
|
||||||
|
|
||||||
|
|
||||||
def convert_data_to_csv():
|
def convert_data_to_csv():
|
||||||
@ -19,11 +26,6 @@ def convert_data_to_csv():
|
|||||||
csv_file = "adult.csv"
|
csv_file = "adult.csv"
|
||||||
df = pd.read_csv(data_file, header=None)
|
df = pd.read_csv(data_file, header=None)
|
||||||
df.to_csv(csv_file, index=False)
|
df.to_csv(csv_file, index=False)
|
||||||
# delete_data_file()
|
|
||||||
return csv_file
|
|
||||||
|
|
||||||
|
|
||||||
def delete_data_file():
    """Remove the raw downloaded ``adult.data`` file from the working directory."""
    os.remove("adult.data")
|
||||||
|
|
||||||
@ -33,20 +35,12 @@ def add_subsets_to_csv_file(data):
|
|||||||
"relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country",
|
"relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country",
|
||||||
"income"]
|
"income"]
|
||||||
|
|
||||||
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
|
X_train_data, X_dev_data, X_test_data = train_dev_test(data)
|
||||||
if len(train_data) > len(test_data):
|
|
||||||
train_data, dev_data = train_test_split(train_data, test_size=0.25, random_state=42)
|
|
||||||
else:
|
|
||||||
dev_data = pd.DataFrame()
|
|
||||||
|
|
||||||
train_data.to_csv("adult_train.csv", index=False)
|
|
||||||
dev_data.to_csv("adult_dev.csv", index=False)
|
|
||||||
test_data.to_csv("adult_test.csv", index=False)
|
|
||||||
|
|
||||||
print("Data set: ", data.shape)
|
print("Data set: ", data.shape)
|
||||||
print("Train Data set: ", train_data.shape)
|
print("Train Data set: ", X_train_data.shape)
|
||||||
print("Dev Data set: ", dev_data.shape)
|
print("Dev Data set: ", X_dev_data.shape)
|
||||||
print("Test Data set: ", test_data.shape)
|
print("Test Data set: ", X_test_data.shape)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
@ -59,9 +53,9 @@ def check_if_data_set_has_division_into_subsets(file_name):
|
|||||||
|
|
||||||
|
|
||||||
def get_statistics(data):
|
def get_statistics(data):
|
||||||
train_data = pd.read_csv("adult_train.csv", dtype={"income": "category"})
|
train_data = pd.read_csv("X_train.csv", dtype={"income": "category"})
|
||||||
dev_data = pd.read_csv("adult_dev.csv", dtype={"income": "category"})
|
dev_data = pd.read_csv("X_dev.csv", dtype={"income": "category"})
|
||||||
test_data = pd.read_csv("adult_test.csv", dtype={"income": "category"})
|
test_data = pd.read_csv("X_test.csv", dtype={"income": "category"})
|
||||||
|
|
||||||
print("Wielkość zbioru: ", len(data))
|
print("Wielkość zbioru: ", len(data))
|
||||||
print("Wielkość zbioru treningowego: ", len(train_data))
|
print("Wielkość zbioru treningowego: ", len(train_data))
|
||||||
@ -92,7 +86,7 @@ def get_statistics(data):
|
|||||||
|
|
||||||
|
|
||||||
def normalization(data):
|
def normalization(data):
|
||||||
numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
|
numeric_features = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
|
||||||
numeric_data = data[numeric_features]
|
numeric_data = data[numeric_features]
|
||||||
|
|
||||||
scaler = StandardScaler()
|
scaler = StandardScaler()
|
||||||
@ -107,14 +101,47 @@ def clean(data):
|
|||||||
data.replace('?', np.nan, inplace=True)
|
data.replace('?', np.nan, inplace=True)
|
||||||
data.dropna(inplace=True)
|
data.dropna(inplace=True)
|
||||||
data.drop_duplicates(inplace=True)
|
data.drop_duplicates(inplace=True)
|
||||||
data[['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']] = data[
|
data[['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']] = data[
|
||||||
['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']].apply(pd.to_numeric)
|
['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']].apply(pd.to_numeric)
|
||||||
|
|
||||||
|
|
||||||
|
def train_dev_test(data):
    """Split *data* into train/dev/test partitions and persist them as CSV files.

    Pops the 'education-num' column of *data* (mutating the caller's frame)
    to form the target, performs a 70/30 split followed by a 70/30 split of
    the remainder (random_state=1 for reproducibility), writes six CSV files
    (X_/Y_ for train, dev, test) and returns the three feature frames.

    NOTE(review): the feature copy is taken BEFORE the target is popped, so
    the X_ frames still contain 'education-num'; create_model reads it back
    from X_train.csv, so this apparent leakage is load-bearing — confirm
    before changing.
    """
    features = data.copy()
    target = pandas.DataFrame(data.pop('education-num'))

    f_train, f_rest, t_train, t_rest = train_test_split(features, target, test_size=0.3, random_state=1)
    f_dev, f_test, t_dev, t_test = train_test_split(f_rest, t_rest, test_size=0.3, random_state=1)

    # Persist every split under its conventional file name.
    splits = {
        'X_train': f_train,
        'X_dev': f_dev,
        'X_test': f_test,
        'Y_test': t_test,
        'Y_train': t_train,
        'Y_dev': t_dev,
    }
    for stem, frame in splits.items():
        frame.to_csv(stem + '.csv', index=False)

    return f_train, f_dev, f_test
|
||||||
|
|
||||||
|
|
||||||
|
def create_model():
    """Train a small dense classifier on X_train.csv and save it as model.h5.

    Reads the training split back from disk, one-hot encodes the features,
    categorically encodes the 'education-num' target and fits a two-layer
    network for 10 epochs.
    """
    data = pd.read_csv('X_train.csv')
    # NOTE(review): X is the full frame, so the target column is also among
    # the one-hot encoded features — matches what train_dev_test writes, but
    # looks like target leakage; confirm intent.
    X = data.copy()
    y = data["education-num"]
    X_train_encoded = pd.get_dummies(X)
    y_train_cat = to_categorical(y)

    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=X_train_encoded.shape[1]))
    # FIX: the output width was hard-coded to 17; derive it from the encoded
    # target instead so a different class count cannot crash model.fit
    # (to_categorical on the full data set yields 17 columns, so behavior on
    # the current data is unchanged).
    model.add(Dense(y_train_cat.shape[1], activation='softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    # NOTE(review): validation_data is the training data itself, so the
    # reported val_* metrics do not measure generalization — confirm.
    model.fit(X_train_encoded, y_train_cat, epochs=10, batch_size=32, validation_data=(X_train_encoded, y_train_cat))
    model.save('model.h5')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Full pipeline: fetch the data set, sanity-check the splits, report
    # statistics, preprocess, and train the model.
    download_file()
    csv_file_name = 'adult.csv'
    check_if_data_set_has_division_into_subsets(csv_file_name)

    data = pd.read_csv(csv_file_name, dtype={"income": "category"})
    get_statistics(data)
    normalization(data)
    clean(data)

    create_model()
|
||||||
|
Loading…
Reference in New Issue
Block a user