diff --git a/Dockerfile b/Dockerfile index bd1d292..2c3e91d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,4 +3,6 @@ FROM python:latest RUN apt-get update && apt-get install -y RUN pip install pandas +RUN pip install tensorflow +RUN pip install matplotlib RUN pip install scikit-learn \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index fad5aa4..43c980e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,13 +9,13 @@ pipeline { ) } stages { - stage('clear') { + stage('Clear_Before') { steps { sh 'rm -rf *' } } - stage('Build') { + stage('Clone_and_Build') { steps { sh 'git clone https://git.wmi.amu.edu.pl/s444439/ium_z444439' sh 'curl -O https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' @@ -38,10 +38,18 @@ pipeline { sh 'ls -a' sh 'python ./ium_z444439/create-dataset.py' echo 'process finish' - archiveArtifacts 'adult_test.csv' - archiveArtifacts 'adult_dev.csv' - archiveArtifacts 'adult_train.csv' + archiveArtifacts 'X_test.csv' + archiveArtifacts 'X_dev.csv' + archiveArtifacts 'X_train.csv' + archiveArtifacts 'Y_test.csv' + archiveArtifacts 'Y_dev.csv' + archiveArtifacts 'Y_train.csv' } } + stage('Clear_After') { + steps { + sh 'rm -rf *' + } + } } } \ No newline at end of file diff --git a/Jenkinsfile_stats b/Jenkinsfile_stats index 0be73d8..1f590bc 100644 --- a/Jenkinsfile_stats +++ b/Jenkinsfile_stats @@ -33,9 +33,9 @@ pipeline { sh 'ls -a' sh 'python ./ium_z444439/stats.py' echo 'process finish' - archiveArtifacts 'adult_test_stats.csv' - archiveArtifacts 'adult_dev_stats.csv' - archiveArtifacts 'adult_train_stats.csv' + archiveArtifacts 'X_test_stats.csv' + archiveArtifacts 'X_dev_stats.csv' + archiveArtifacts 'X_train_stats.csv' } } stage('Goodbye!') { diff --git a/create-dataset.py b/create-dataset.py index a233a6d..049ac50 100644 --- a/create-dataset.py +++ b/create-dataset.py @@ -8,12 +8,15 @@ adults = adults.dropna() adults = adults.sample(CUTOFF) -adult_X, adult_Y = adults, adults -adult_X_train, adult_X_temp, adult_Y_train, adult_Y_temp = train_test_split(adult_X, adult_Y, test_size=0.3, - random_state=1) -adult_X_dev, adult_X_test, adult_Y_dev, adult_Y_test = train_test_split(adult_X_temp, adult_Y_temp, test_size=0.3, - random_state=1) +X = adults.copy() +Y = pd.DataFrame(adults.pop('age')) -adult_X_train.to_csv('adult_train.csv', index=False) -adult_X_dev.to_csv('adult_dev.csv', index=False) -adult_X_test.to_csv('adult_test.csv', index=False) +X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=1) +X_dev, X_test, Y_dev, Y_test = train_test_split(X_temp, Y_temp, test_size=0.3, random_state=1) + +X_train.to_csv('X_train.csv', index=False) +X_dev.to_csv('X_dev.csv', index=False) +X_test.to_csv('X_test.csv', index=False) +Y_test.to_csv('Y_test.csv', index=False) +Y_train.to_csv('Y_train.csv', index=False) +Y_dev.to_csv('Y_dev.csv', index=False) diff --git a/script.py b/script.py index e7fdad8..9e7f21a 100644 --- a/script.py +++ b/script.py @@ -2,12 +2,8 @@ import os import urllib.request from os.path import exists -import pandas -from keras.layers import Dense -from keras.models import Sequential import pandas as pd import numpy as np -from keras.utils import to_categorical from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler @@ -117,22 +113,6 @@ def train_dev_test(data): return train_data, dev_data, test_data -def create_model(): - data = pd.read_csv('adult_train.csv') - X = data.copy() - y = data["education-num"] - X_train_encoded = pd.get_dummies(X) - y_train_cat = to_categorical(y) - model = Sequential() - model.add(Dense(64, activation='relu', input_dim=X_train_encoded.shape[1])) - model.add(Dense(17, activation='sigmoid')) - model.compile(optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy']) - model.fit(X_train_encoded, y_train_cat, epochs=10, batch_size=32, validation_data=(X_train_encoded, y_train_cat)) - model.save('model.joblib') - - if __name__ == '__main__': download_file() csv_file_name = 'adult.csv' @@ -141,4 +121,3 @@ if __name__ == '__main__': get_statistics(data) normalization(data) clean(data) - create_model() diff --git a/stats.py b/stats.py index ebaf5c6..a5f004c 100644 --- a/stats.py +++ b/stats.py @@ -1,9 +1,8 @@ import pandas -adult_dev = pandas.read_csv('adult_dev.csv', engine='python', encoding='ISO-8859-1', sep=',') -adult_train = pandas.read_csv('adult_train.csv', engine='python', encoding='ISO-8859-1', sep=',') - -adult_test = pandas.read_csv('adult_test.csv', engine='python', encoding='ISO-8859-1', sep=',') +adult_dev = pandas.read_csv('X_dev.csv', engine='python', encoding='ISO-8859-1', sep=',') +adult_train = pandas.read_csv('X_train.csv', engine='python', encoding='ISO-8859-1', sep=',') +adult_test = pandas.read_csv('X_test.csv', engine='python', encoding='ISO-8859-1', sep=',') adult_dev.describe(include='all').to_csv('adult_dev_stats.csv', index=True) adult_train.describe(include='all').to_csv('adult_train_stats.csv', index=True) diff --git a/train.py b/train.py new file mode 100644 index 0000000..79987eb --- /dev/null +++ b/train.py @@ -0,0 +1,24 @@ +import pandas as pd +import tensorflow +from keras.applications.densenet import layers + +train_data_x = pd.read_csv('./X_train.csv') + +adults_train = train_data_x.copy() +adults_predict = train_data_x.pop('age') +normalize = layers.Normalization() +normalize.adapt(adults_train) + +adult_model = tensorflow.keras.Sequential([ + normalize, + layers.Dense(64), + layers.Dense(1) +]) + +adult_model.compile( + loss=tensorflow.keras.losses.MeanSquaredError(), + optimizer=tensorflow.keras.optimizers.Adam()) + +adult_model.fit(adults_train, adults_predict, epochs=500) + +adult_model.save('model') \ No newline at end of file