diff --git a/lab5/Jenkinsfile b/lab5/Jenkinsfile
index 83c02a8..517d927 100644
--- a/lab5/Jenkinsfile
+++ b/lab5/Jenkinsfile
@@ -12,15 +12,15 @@ pipeline {
     stages {
         stage('Stage 1') {
             steps {
-                sh 'chmod u+x ./process_dataset.py ./simple_regression.py'
+                sh 'chmod u+x ./lab5/process_dataset.py ./lab5/simple_regression.py'
                 echo 'Processing dataset...'
-                sh 'python3 process_dataset.py'
+                sh 'python3 lab5/process_dataset.py'
                 echo 'Dataset processed'
                 echo 'Conducting simple regression model test'
-                sh 'python3 simple_regression.py $EPOCHS_NUM'
+                sh 'python3 lab5/simple_regression.py $EPOCHS_NUM'
                 echo 'Model predictions saved'
-                sh 'head lego_reg_results.csv'
+                sh 'head lab5/lego_reg_results.csv'
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/lab5/Jenkinsfile1 b/lab5/Jenkinsfile1
index 3cacd17..fa2952d 100644
--- a/lab5/Jenkinsfile1
+++ b/lab5/Jenkinsfile1
@@ -5,12 +5,12 @@ pipeline {
     stages {
         stage('Stage 1') {
             steps {
-                sh 'chmod u+x ./process_dataset.py ./simple_regression.py'
+                sh 'chmod u+x ./lab5/process_dataset.py ./lab5/simple_regression.py'
                 echo 'Processing dataset...'
-                sh 'python3 process_dataset.py'
+                sh 'python3 lab5/process_dataset.py'
                 echo 'Dataset processed'
                 echo 'Conducting simple regression model test'
-                sh 'python3 simple_regression.py'
+                sh 'python3 lab5/simple_regression.py'
                 echo 'Model predictions saved'
             }
         }
diff --git a/lab5/process_dataset.py b/lab5/process_dataset.py
new file mode 100644
index 0000000..2f54b7b
--- /dev/null
+++ b/lab5/process_dataset.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+# load the data, dropping rows with empty fields along the way
+lego = pd.read_csv('lego_sets.csv', encoding='utf-8').dropna()
+
+# list_price can be rounded to two decimal places
+lego['list_price'] = lego['list_price'].round(2)
+
+# num_reviews, piece_count and prod_id can be integer values
+lego['num_reviews'] = lego['num_reviews'].apply(np.int64)
+lego['piece_count'] = lego['piece_count'].apply(np.int64)
+lego['prod_id'] = lego['prod_id'].apply(np.int64)
+
+# quick look at the data and its statistics
+print(lego)
+print(lego.describe(include='all'))
+
+# first split: carve out the training set (80% of the data)
+lego_train, lego_rem = train_test_split(lego, train_size=0.8)
+
+# second split: divide the remainder evenly into validation and test sets (80/10/10 overall)
+lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5)
+
+# save the results
+lego.to_csv('lego_sets_clean.csv', index=False, header=True)
+lego_train.to_csv('lego_sets_clean_train.csv', index=False, header=True)
+lego_valid.to_csv('lego_sets_clean_valid.csv', index=False, header=True)
+lego_test.to_csv('lego_sets_clean_test.csv', index=False, header=True)
\ No newline at end of file
diff --git a/lab5/simple_regression.py b/lab5/simple_regression.py
new file mode 100644
index 0000000..2f2c77e
--- /dev/null
+++ b/lab5/simple_regression.py
@@ -0,0 +1,73 @@
+import tensorflow as tf
+from keras import layers
+from keras.models import save_model
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import sys
+
+# Read an example training argument (the number of epochs) from the command line
+EPOCHS_NUM = int(sys.argv[1])
+
+# Load the data
+data_train = pd.read_csv('lego_sets_clean_train.csv')
+data_test = pd.read_csv('lego_sets_clean_test.csv')
+
+# Extract the columns for predicting a set's price from its piece count
+train_piece_counts = np.array(data_train['piece_count'])
+train_prices = np.array(data_train['list_price'])
+test_piece_counts = np.array(data_test['piece_count'])
+test_prices = np.array(data_test['list_price'])
+
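+# Note: the Normalization layer below standardizes each input as
+# (x - mean) / sqrt(var), with the statistics estimated from the training
+# data by adapt(); e.g. a 500-piece set reaches the Dense layer roughly as
+#   (500 - train_piece_counts.mean()) / train_piece_counts.std()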
+# Normalization
+normalizer = layers.Normalization(input_shape=[1, ], axis=None)
+normalizer.adapt(train_piece_counts)
+
+# Model initialization
+model = tf.keras.Sequential([
+    normalizer,
+    layers.Dense(units=1)
+])
+
+# Compilation
+model.compile(
+    optimizer=tf.optimizers.Adam(learning_rate=0.1),
+    loss='mean_absolute_error'
+)
+
+# Training
+history = model.fit(
+    train_piece_counts,
+    train_prices,
+    epochs=EPOCHS_NUM,
+    verbose=0,
+    validation_split=0.2
+)
+
+# Run predictions on the test set
+y_pred = model.predict(test_piece_counts)
+
+# Save the predictions to a file
+results = pd.DataFrame({'test_set_piece_count': test_piece_counts.tolist(), 'predicted_price': [round(a[0], 2) for a in y_pred.tolist()]})
+results.to_csv('lego_reg_results.csv', index=False, header=True)
+
+# Save the model to a file
+model.save('lego_reg_model')
+
+# Optional statistics and plots
+'''
+hist = pd.DataFrame(history.history)
+hist['epoch'] = history.epoch
+print(hist.tail())
+
+plt.scatter(train_piece_counts, train_prices, label='Data')
+plt.plot(test_piece_counts, y_pred, color='k', label='Predictions')
+plt.xlabel('pieces')
+plt.ylabel('price')
+plt.legend()
+plt.show()
+'''
diff --git a/Jenkinsfile_dataset_for_LAB6 b/lab6/Jenkinsfile_dataset_for_LAB6
similarity index 66%
rename from Jenkinsfile_dataset_for_LAB6
rename to lab6/Jenkinsfile_dataset_for_LAB6
index 1b4962d..9a28f30 100644
--- a/Jenkinsfile_dataset_for_LAB6
+++ b/lab6/Jenkinsfile_dataset_for_LAB6
@@ -5,16 +5,16 @@ pipeline {
     stages {
         stage('Stage 1') {
             steps {
-                sh 'chmod u+x ./process_dataset.py'
+                sh 'chmod u+x ./lab6/process_dataset.py'
                 echo 'Processing dataset...'
-                sh 'python3 process_dataset.py'
+                sh 'python3 lab6/process_dataset.py'
                 echo 'Dataset processed'
                 echo 'Archiving clean train and test datasets...'
-                archiveArtifacts artifacts: 'lego_sets_clean_train.csv, lego_sets_clean_test.csv'
+                archiveArtifacts artifacts: 'lab6/lego_sets_clean_train.csv, lab6/lego_sets_clean_test.csv'
                 echo 'Datasets archived'
                 echo 'Launching the s449288-training job...'
                 build job: 's449288-training/master/'
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/lab6/Jenkinsfile_eval b/lab6/Jenkinsfile_eval
index 31b1525..56f638f 100644
--- a/lab6/Jenkinsfile_eval
+++ b/lab6/Jenkinsfile_eval
@@ -16,28 +16,28 @@ pipeline {
         stage('Stage 1') {
             steps {
                 git branch: "${params.BRANCH}", url: 'https://git.wmi.amu.edu.pl/s449288/ium_s449288.git'
-                sh 'chmod u+x ./evaluate.py'
+                sh 'chmod u+x ./lab6/evaluate.py'
                 echo 'Copying datasets from the create-dataset job...'
-                copyArtifacts filter: 'lego_sets_clean_test.csv', projectName: 's449288-create-dataset'
+                copyArtifacts filter: 'lab6/lego_sets_clean_test.csv', projectName: 's449288-create-dataset'
                 echo 'Datasets copied'
                 echo 'Copying model from the training job...'
-                copyArtifacts filter: 'lego_reg_model.tar.gz', projectName: "s449288-training/${BRANCH}/", selector: buildParameter('BUILD_SELECTOR')
+                copyArtifacts filter: 'lab6/lego_reg_model.tar.gz', projectName: "s449288-training/${BRANCH}/", selector: buildParameter('BUILD_SELECTOR')
                 echo 'Model copied'
-                sh 'tar xvzf lego_reg_model.tar.gz'
+                sh 'tar xvzf lab6/lego_reg_model.tar.gz'
                 echo 'Optional copying of the metrics file from previous build...'
-                copyArtifacts filter: 'eval_results.txt', projectName: 's449288-evaluation/master/', optional: true
+                copyArtifacts filter: 'lab6/eval_results.txt', projectName: 's449288-evaluation/master/', optional: true
                 echo 'Metrics file copied if it did not exist'
                 echo 'Evaluating model...'
-                sh 'python3 evaluate.py'
+                sh 'python3 lab6/evaluate.py'
                 echo 'Model evaluated. Metrics saved. Plot saved.'
-                sh 'head eval_results.txt'
-                sh 'file error_plot.jpg'
+                sh 'head lab6/eval_results.txt'
+                sh 'file lab6/error_plot.jpg'
                 echo 'Archiving metrics file...'
-                archiveArtifacts 'eval_results.txt'
+                archiveArtifacts 'lab6/eval_results.txt'
                 echo 'File archived'
                 script {
                     LAST_MAE = sh (
-                        script: 'tail -1 eval_results.txt',
+                        script: 'tail -1 lab6/eval_results.txt',
                         returnStdout: true
                     ).trim()
                 }
@@ -58,4 +58,4 @@ pipeline {
             emailext body: "CHANGED - ${LAST_MAE} MAE", subject: 's449288-evaluation build status', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
         }
     }
-}
\ No newline at end of file
+}
diff --git a/lab6/Jenkinsfile_train b/lab6/Jenkinsfile_train
index 53db015..dc3597a 100644
--- a/lab6/Jenkinsfile_train
+++ b/lab6/Jenkinsfile_train
@@ -14,18 +14,18 @@ pipeline {
     stages {
         stage('Stage 1') {
            steps {
-                sh 'chmod u+x ./simple_regression.py'
+                sh 'chmod u+x ./lab6/simple_regression.py'
                 echo 'Copying datasets from create-dataset...'
                 copyArtifacts filter: '*', projectName: 's449288-create-dataset'
                 echo 'Datasets copied'
                 echo 'Conducting simple regression model test'
-                sh 'python3 simple_regression.py $EPOCHS_NUM'
+                sh 'python3 lab6/simple_regression.py $EPOCHS_NUM'
                 echo 'Model and predictions saved'
-                sh 'head lego_reg_results.csv'
-                sh 'ls -lh lego_reg_model'
+                sh 'head lab6/lego_reg_results.csv'
+                sh 'ls -lh lab6/lego_reg_model'
                 echo 'Archiving model...'
-                sh 'tar -czf lego_reg_model.tar.gz lego_reg_model/'
-                archiveArtifacts 'lego_reg_model.tar.gz'
+                sh 'tar -czf lab6/lego_reg_model.tar.gz lab6/lego_reg_model/'
+                archiveArtifacts 'lab6/lego_reg_model.tar.gz'
                 echo 'Model archived'
                 echo 'Launching the s449288-evaluation job...'
                 build job: 's449288-evaluation/master/'
@@ -46,4 +46,4 @@ pipeline {
             emailext body: 'CHANGED', subject: 's449288-training build status', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
         }
     }
-}
\ No newline at end of file
+}
diff --git a/lab6/evaluate.py b/lab6/evaluate.py
new file mode 100644
index 0000000..0f83c2d
--- /dev/null
+++ b/lab6/evaluate.py
@@ -0,0 +1,40 @@
+import tensorflow as tf
+from tensorflow import keras
+from matplotlib import pyplot as plt
+from matplotlib.ticker import MaxNLocator
+import numpy as np
+import pandas as pd
+
+# Load the model from disk
+model = keras.models.load_model('lego_reg_model')
+
+# Load the test set
+data_test = pd.read_csv('lego_sets_clean_test.csv')
+test_piece_counts = np.array(data_test['piece_count'])
+test_prices = np.array(data_test['list_price'])
+
+# Simple evaluation (mean absolute error)
+test_results = model.evaluate(
+    test_piece_counts,
+    test_prices, verbose=0)
+
+# Append the numeric metric value to a file
+with open('eval_results.txt', 'a+') as f:
+    f.write(str(test_results) + '\n')
+
+# Generate a plot of the metric history and save it to a file
+with open('eval_results.txt') as f:
+    scores = [float(line) for line in f if line.strip()]
+    builds = list(range(1, len(scores) + 1))
+
+    plt.plot(builds, scores)
+    plt.xlabel('Build number')
+    plt.xticks(range(1, len(scores) + 1))
+    plt.ylabel('Mean absolute error')
+    plt.title('Model error by build')
+    plt.savefig('error_plot.jpg')
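+    # Note: plt.show() below needs an attached display; on a headless Jenkins
+    # agent matplotlib typically falls back to the non-interactive Agg backend,
+    # where show() is effectively a no-op and savefig() above persists the plot.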
+    plt.show()
+
\ No newline at end of file
diff --git a/lab6/process_dataset.py b/lab6/process_dataset.py
new file mode 100644
index 0000000..2f54b7b
--- /dev/null
+++ b/lab6/process_dataset.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+# load the data, dropping rows with empty fields along the way
+lego = pd.read_csv('lego_sets.csv', encoding='utf-8').dropna()
+
+# list_price can be rounded to two decimal places
+lego['list_price'] = lego['list_price'].round(2)
+
+# num_reviews, piece_count and prod_id can be integer values
+lego['num_reviews'] = lego['num_reviews'].apply(np.int64)
+lego['piece_count'] = lego['piece_count'].apply(np.int64)
+lego['prod_id'] = lego['prod_id'].apply(np.int64)
+
+# quick look at the data and its statistics
+print(lego)
+print(lego.describe(include='all'))
+
+# first split: carve out the training set (80% of the data)
+lego_train, lego_rem = train_test_split(lego, train_size=0.8)
+
+# second split: divide the remainder evenly into validation and test sets (80/10/10 overall)
+lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5)
+
+# save the results
+lego.to_csv('lego_sets_clean.csv', index=False, header=True)
+lego_train.to_csv('lego_sets_clean_train.csv', index=False, header=True)
+lego_valid.to_csv('lego_sets_clean_valid.csv', index=False, header=True)
+lego_test.to_csv('lego_sets_clean_test.csv', index=False, header=True)
\ No newline at end of file
diff --git a/lab6/simple_regression.py b/lab6/simple_regression.py
new file mode 100644
index 0000000..2f2c77e
--- /dev/null
+++ b/lab6/simple_regression.py
@@ -0,0 +1,73 @@
+import tensorflow as tf
+from keras import layers
+from keras.models import save_model
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import sys
+
+# Read an example training argument (the number of epochs) from the command line
+EPOCHS_NUM = int(sys.argv[1])
+
+# Load the data
+data_train = pd.read_csv('lego_sets_clean_train.csv')
+data_test = pd.read_csv('lego_sets_clean_test.csv')
+
+# Extract the columns for predicting a set's price from its piece count
+train_piece_counts = np.array(data_train['piece_count'])
+train_prices = np.array(data_train['list_price'])
+test_piece_counts = np.array(data_test['piece_count'])
+test_prices = np.array(data_test['list_price'])
+
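+# Note: the Normalization layer below standardizes each input as
+# (x - mean) / sqrt(var), with the statistics estimated from the training
+# data by adapt(); e.g. a 500-piece set reaches the Dense layer roughly as
+#   (500 - train_piece_counts.mean()) / train_piece_counts.std()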
+# Normalization
+normalizer = layers.Normalization(input_shape=[1, ], axis=None)
+normalizer.adapt(train_piece_counts)
+
+# Model initialization
+model = tf.keras.Sequential([
+    normalizer,
+    layers.Dense(units=1)
+])
+
+# Compilation
+model.compile(
+    optimizer=tf.optimizers.Adam(learning_rate=0.1),
+    loss='mean_absolute_error'
+)
+
+# Training
+history = model.fit(
+    train_piece_counts,
+    train_prices,
+    epochs=EPOCHS_NUM,
+    verbose=0,
+    validation_split=0.2
+)
+
+# Run predictions on the test set
+y_pred = model.predict(test_piece_counts)
+
+# Save the predictions to a file
+results = pd.DataFrame({'test_set_piece_count': test_piece_counts.tolist(), 'predicted_price': [round(a[0], 2) for a in y_pred.tolist()]})
+results.to_csv('lego_reg_results.csv', index=False, header=True)
+
+# Save the model to a file
+model.save('lego_reg_model')
+
+# Optional statistics and plots
+'''
+hist = pd.DataFrame(history.history)
+hist['epoch'] = history.epoch
+print(hist.tail())
+
+plt.scatter(train_piece_counts, train_prices, label='Data')
+plt.plot(test_piece_counts, y_pred, color='k', label='Predictions')
+plt.xlabel('pieces')
+plt.ylabel('price')
+plt.legend()
+plt.show()
+'''
diff --git a/lab7/Jenkinsfile_eval b/lab7/Jenkinsfile_eval
index c76801f..533ecc1 100644
--- a/lab7/Jenkinsfile_eval
+++ b/lab7/Jenkinsfile_eval
@@ -16,28 +16,28 @@ pipeline {
         stage('Stage 1') {
             steps {
                 git branch: "${params.BRANCH}", url: 'https://git.wmi.amu.edu.pl/s449288/ium_s449288.git'
-                sh 'chmod u+x ./evaluate.py'
+                sh 'chmod u+x ./lab7/evaluate.py'
                 echo 'Copying datasets from the create-dataset job...'
-                copyArtifacts filter: 'lego_sets_clean_test.csv', projectName: 's449288-create-dataset'
+                copyArtifacts filter: 'lab7/lego_sets_clean_test.csv', projectName: 's449288-create-dataset'
                 echo 'Datasets copied'
                 echo 'Copying model from the training job...'
-                copyArtifacts filter: 'lego_reg_model.tar.gz', projectName: "s449288-training/${BRANCH}/", selector: buildParameter('BUILD_SELECTOR')
+                copyArtifacts filter: 'lab7/lego_reg_model.tar.gz', projectName: "s449288-training/${BRANCH}/", selector: buildParameter('BUILD_SELECTOR')
                 echo 'Model copied'
-                sh 'tar xvzf lego_reg_model.tar.gz'
+                sh 'tar xvzf lab7/lego_reg_model.tar.gz'
                 echo 'Optional copying of the metrics file from previous build...'
-                copyArtifacts filter: 'eval_results.txt', projectName: 's449288-evaluation/master/', optional: true
+                copyArtifacts filter: 'lab7/eval_results.txt', projectName: 's449288-evaluation/master/', optional: true
                 echo 'Metrics file copied if it did not exist'
                 echo 'Evaluating model...'
-                sh 'python3 evaluate.py'
+                sh 'python3 lab7/evaluate.py'
                 echo 'Model evaluated. Metrics saved. Plot saved.'
-                sh 'head eval_results.txt'
-                sh 'file error_plot.jpg'
+                sh 'head lab7/eval_results.txt'
+                sh 'file lab7/error_plot.jpg'
                 echo 'Archiving metrics file...'
-                archiveArtifacts 'eval_results.txt'
+                archiveArtifacts 'lab7/eval_results.txt'
                 echo 'File archived'
                 script {
                     LAST_MAE = sh (
-                        script: 'tail -1 eval_results.txt',
+                        script: 'tail -1 lab7/eval_results.txt',
                        returnStdout: true
                     ).trim()
                 }
@@ -58,4 +58,4 @@ pipeline {
             emailext body: "CHANGED - ${LAST_MAE} MAE", subject: 's449288-evaluation build status', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
         }
     }
-}
\ No newline at end of file
+}
diff --git a/lab7/Jenkinsfile_train b/lab7/Jenkinsfile_train
index 05a3fea..9e15c96 100644
--- a/lab7/Jenkinsfile_train
+++ b/lab7/Jenkinsfile_train
@@ -7,23 +7,23 @@ pipeline {
     stages {
         stage('Stage 1') {
             steps {
-                sh 'chmod u+x ./simple_regression_lab7.py'
+                sh 'chmod u+x ./lab7/simple_regression_lab7.py'
                 echo 'Copying datasets from create-dataset...'
                 copyArtifacts filter: '*', projectName: 's449288-create-dataset'
                 echo 'Datasets copied'
                 echo 'Conducting simple regression model test'
-                sh 'python3 simple_regression_lab7.py'
+                sh 'python3 lab7/simple_regression_lab7.py'
                 echo 'Model and predictions saved'
-                sh 'head lego_reg_results.csv'
+                sh 'head lab7/lego_reg_results.csv'
                 echo 'Archiving model...'
-                sh 'ls -lh lego_reg_model'
-                sh 'tar -czf lego_reg_model.tar.gz lego_reg_model/'
-                archiveArtifacts 'lego_reg_model.tar.gz'
+                sh 'ls -lh lab7/lego_reg_model'
+                sh 'tar -czf lab7/lego_reg_model.tar.gz lab7/lego_reg_model/'
+                archiveArtifacts 'lab7/lego_reg_model.tar.gz'
                 echo 'Model archived'
                 echo 'Archiving Sacreds output repo...'
-                sh 'ls -lh runs/*/'
-                sh 'tar -czf sacred_runs.tar.gz runs/'
-                archiveArtifacts 'sacred_runs.tar.gz'
+                sh 'ls -lh lab7/runs/*/'
+                sh 'tar -czf lab7/sacred_runs.tar.gz lab7/runs/'
+                archiveArtifacts 'lab7/sacred_runs.tar.gz'
                 echo 'Sacreds repo archived'
                 echo 'Launching the s449288-evaluation job...'
                 build job: 's449288-evaluation/master/'
@@ -44,4 +44,4 @@ pipeline {
             emailext body: 'CHANGED', subject: 's449288-training build status', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
         }
     }
-}
\ No newline at end of file
+}
diff --git a/lab7/evaluate.py b/lab7/evaluate.py
new file mode 100644
index 0000000..0f83c2d
--- /dev/null
+++ b/lab7/evaluate.py
@@ -0,0 +1,40 @@
+import tensorflow as tf
+from tensorflow import keras
+from matplotlib import pyplot as plt
+from matplotlib.ticker import MaxNLocator
+import numpy as np
+import pandas as pd
+
+# Load the model from disk
+model = keras.models.load_model('lego_reg_model')
+
+# Load the test set
+data_test = pd.read_csv('lego_sets_clean_test.csv')
+test_piece_counts = np.array(data_test['piece_count'])
+test_prices = np.array(data_test['list_price'])
+
+# Simple evaluation (mean absolute error)
+test_results = model.evaluate(
+    test_piece_counts,
+    test_prices, verbose=0)
+
+# Append the numeric metric value to a file
+with open('eval_results.txt', 'a+') as f:
+    f.write(str(test_results) + '\n')
+
+# Generate a plot of the metric history and save it to a file
+with open('eval_results.txt') as f:
+    scores = [float(line) for line in f if line.strip()]
+    builds = list(range(1, len(scores) + 1))
+
+    plt.plot(builds, scores)
+    plt.xlabel('Build number')
+    plt.xticks(range(1, len(scores) + 1))
+    plt.ylabel('Mean absolute error')
+    plt.title('Model error by build')
+    plt.savefig('error_plot.jpg')
+    # Note: plt.show() below needs an attached display; on a headless Jenkins
+    # agent matplotlib typically falls back to the non-interactive Agg backend,
+    # where show() is effectively a no-op and savefig() above persists the plot.
+    plt.show()
+
\ No newline at end of file
diff --git a/lab7/process_dataset.py b/lab7/process_dataset.py
new file mode 100644
index 0000000..2f54b7b
--- /dev/null
+++ b/lab7/process_dataset.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+# load the data, dropping rows with empty fields along the way
+lego = pd.read_csv('lego_sets.csv', encoding='utf-8').dropna()
+
+# list_price can be rounded to two decimal places
+lego['list_price'] = lego['list_price'].round(2)
+
+# num_reviews, piece_count and prod_id can be integer values
+lego['num_reviews'] = lego['num_reviews'].apply(np.int64)
+lego['piece_count'] = lego['piece_count'].apply(np.int64)
+lego['prod_id'] = lego['prod_id'].apply(np.int64)
+
+# quick look at the data and its statistics
+print(lego)
+print(lego.describe(include='all'))
+
+# first split: carve out the training set (80% of the data)
+lego_train, lego_rem = train_test_split(lego, train_size=0.8)
+
+# second split: divide the remainder evenly into validation and test sets (80/10/10 overall)
+lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5)
+
+# save the results
+lego.to_csv('lego_sets_clean.csv', index=False, header=True)
+lego_train.to_csv('lego_sets_clean_train.csv', index=False, header=True)
+lego_valid.to_csv('lego_sets_clean_valid.csv', index=False, header=True)
+lego_test.to_csv('lego_sets_clean_test.csv', index=False, header=True)
\ No newline at end of file
diff --git a/lab7/simple_regression_lab7.py b/lab7/simple_regression_lab7.py
new file mode 100644
index 0000000..238553a
--- /dev/null
+++ b/lab7/simple_regression_lab7.py
@@ -0,0 +1,92 @@
+import tensorflow as tf
+from keras import layers
+from keras.models import save_model
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sacred import Experiment
+from sacred.observers import FileStorageObserver
+from sacred.observers import MongoObserver
+
+# Create an Experiment object so that Sacred tracks the regression run
+ex = Experiment(save_git_info=False)
+
+# Add a FileStorageObserver
+ex.observers.append(FileStorageObserver('runs'))
+
+# Add a Mongo observer
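+# (172.17.0.1 is typically the Docker bridge gateway, so the observer below
+# writes run metadata to a MongoDB exposed on the host, mirroring what the
+# FileStorageObserver above stores locally under runs/.)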
+ex.observers.append(MongoObserver(url='mongodb://admin:IUM_2021@172.17.0.1:27017', db_name='sacred'))
+
+# Example configuration of selected training parameters, modifiable through Sacred
+@ex.config
+def config():
+    epochs = 100
+    units = 1
+    learning_rate = 0.1
+
+
+# The rest of the code sits in a decorated train() function invoked through Sacred, so the run gets captured
+@ex.capture
+def train(epochs, units, learning_rate, _run):
+
+    # Load the data
+    data_train = pd.read_csv('lego_sets_clean_train.csv')
+    data_test = pd.read_csv('lego_sets_clean_test.csv')
+
+    # Extract the columns for predicting a set's price from its piece count
+    train_piece_counts = np.array(data_train['piece_count'])
+    train_prices = np.array(data_train['list_price'])
+    test_piece_counts = np.array(data_test['piece_count'])
+    test_prices = np.array(data_test['list_price'])
+
+    # Normalization
+    normalizer = layers.Normalization(input_shape=[1, ], axis=None)
+    normalizer.adapt(train_piece_counts)
+
+    # Model initialization
+    model = tf.keras.Sequential([
+        normalizer,
+        layers.Dense(units=units)
+    ])
+
+    # Compilation
+    model.compile(
+        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
+        loss='mean_absolute_error'
+    )
+
+    # Training
+    history = model.fit(
+        train_piece_counts,
+        train_prices,
+        epochs=epochs,
+        verbose=0,
+        validation_split=0.2
+    )
+
+    # Run predictions on the test set
+    y_pred = model.predict(test_piece_counts)
+
+    # Save the predictions to a file
+    results = pd.DataFrame(
+        {'test_set_piece_count': test_piece_counts.tolist(), 'predicted_price': [round(a[0], 2) for a in y_pred.tolist()]})
+    results.to_csv('lego_reg_results.csv', index=False, header=True)
+
+    # Save the model both with the standard Keras method and as a Sacred Experiment artifact
+    model.save('lego_reg_model')
+    ex.add_artifact('lego_reg_model/saved_model.pb')
+
+    # Log the last epoch's loss as an example result so it is visible in the files written by the observers
+    hist = pd.DataFrame(history.history)
+    hist['epoch'] = history.epoch
+    _run.log_scalar('final.training.loss', hist['loss'].iloc[-1])
+
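+# (ex.automain runs main() when the script is executed directly; the config
+# values above can be overridden from Sacred's command line, e.g.:
+#   python3 simple_regression_lab7.py with epochs=200 learning_rate=0.05)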
+@ex.automain
+def main(units, learning_rate):
+    train()
\ No newline at end of file
diff --git a/lab8/process_dataset.py b/lab8/process_dataset.py
new file mode 100644
index 0000000..2f54b7b
--- /dev/null
+++ b/lab8/process_dataset.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+# load the data, dropping rows with empty fields along the way
+lego = pd.read_csv('lego_sets.csv', encoding='utf-8').dropna()
+
+# list_price can be rounded to two decimal places
+lego['list_price'] = lego['list_price'].round(2)
+
+# num_reviews, piece_count and prod_id can be integer values
+lego['num_reviews'] = lego['num_reviews'].apply(np.int64)
+lego['piece_count'] = lego['piece_count'].apply(np.int64)
+lego['prod_id'] = lego['prod_id'].apply(np.int64)
+
+# quick look at the data and its statistics
+print(lego)
+print(lego.describe(include='all'))
+
+# first split: carve out the training set (80% of the data)
+lego_train, lego_rem = train_test_split(lego, train_size=0.8)
+
+# second split: divide the remainder evenly into validation and test sets (80/10/10 overall)
+lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5)
+
+# save the results
+lego.to_csv('lego_sets_clean.csv', index=False, header=True)
+lego_train.to_csv('lego_sets_clean_train.csv', index=False, header=True)
+lego_valid.to_csv('lego_sets_clean_valid.csv', index=False, header=True)
+lego_test.to_csv('lego_sets_clean_test.csv', index=False, header=True)
\ No newline at end of file
diff --git a/lab8/simple_regression_lab8.py b/lab8/simple_regression_lab8.py
new file mode 100644
index 0000000..d02e79b
--- /dev/null
+++ b/lab8/simple_regression_lab8.py
@@ -0,0 +1,125 @@
+import tensorflow as tf
+from keras import layers
+from keras.models import save_model
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sacred import Experiment
+from sacred.observers import FileStorageObserver
+from sacred.observers import MongoObserver
+import mlflow
+from urllib.parse import urlparse
+
+# Configure the MLflow tracking server and experiment name
+mlflow.set_tracking_uri('http://tzietkiewicz.vm.wmi.amu.edu.pl:5000/#/')
+mlflow.set_experiment('s449288')
+
+# Create an Experiment object so that Sacred tracks the regression run
+ex = Experiment(save_git_info=False)
+
+# Add a FileStorageObserver
+ex.observers.append(FileStorageObserver('runs'))
+
+# Add a Mongo observer
+ex.observers.append(MongoObserver(url='mongodb://admin:IUM_2021@172.17.0.1:27017', db_name='sacred'))
+
+# Example configuration of selected training parameters, modifiable through Sacred
+@ex.config
+def config():
+    epochs = 100
+    units = 1
+    learning_rate = 0.1
+
+
+# The rest of the code sits in a decorated train() function invoked through Sacred, so the run gets captured
+@ex.capture
+def train(epochs, units, learning_rate, _run):
+
+    # Attach the training run to MLflow
+    with mlflow.start_run() as run:
+        print('MLflow run experiment_id: {0}'.format(run.info.experiment_id))
+        print('MLflow run artifact_uri: {0}'.format(run.info.artifact_uri))
+
+        # Load the data
+        data_train = pd.read_csv('lego_sets_clean_train.csv')
+        data_test = pd.read_csv('lego_sets_clean_test.csv')
+
+        # Extract the columns for predicting a set's price from its piece count
+        train_piece_counts = np.array(data_train['piece_count'])
+        train_prices = np.array(data_train['list_price'])
+        test_piece_counts = np.array(data_test['piece_count'])
+        test_prices = np.array(data_test['list_price'])
+
+        # Normalization
+        normalizer = layers.Normalization(input_shape=[1, ], axis=None)
+        normalizer.adapt(train_piece_counts)
+
+        # Model initialization
+        model = tf.keras.Sequential([
+            normalizer,
+            layers.Dense(units=units)
+        ])
+
+        # Compilation
+        model.compile(
+            optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
+            loss='mean_absolute_error'
+        )
+
+        # Training
+        history = model.fit(
+            train_piece_counts,
+            train_prices,
+            epochs=epochs,
+            verbose=0,
+            validation_split=0.2
+        )
+
+        # Run predictions on the test set
+        y_pred = model.predict(test_piece_counts)
+
+        # Save the predictions to a file
+        results = pd.DataFrame(
+            {'test_set_piece_count': test_piece_counts.tolist(), 'predicted_price': [round(a[0], 2) for a in y_pred.tolist()]})
+        results.to_csv('lego_reg_results.csv', index=False, header=True)
+
+        # Save the model both with the standard Keras method and as a Sacred Experiment artifact
+        model.save('lego_reg_model')
+        ex.add_artifact('lego_reg_model/saved_model.pb')
+
+        # Log the last epoch's loss as an example result so it is visible in the files written by the observers
+        hist = pd.DataFrame(history.history)
+        hist['epoch'] = history.epoch
+        _run.log_scalar('final.training.loss', hist['loss'].iloc[-1])
+
+        # Evaluate MAE for MLflow (copied from evaluate.py)
+        mae = model.evaluate(
+            test_piece_counts,
+            test_prices, verbose=0)
+
+        # Log the parameters and metrics to MLflow
+        mlflow.log_param('epochs', epochs)
+        mlflow.log_param('units', units)
+        mlflow.log_param('learning_rate', learning_rate)
+        mlflow.log_metric('mae', mae)
+
+        # Log and save the model for MLflow
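+        # (infer_signature records the model's input/output schema from sample
+        # data; MLflow stores it alongside the model so inputs can be validated
+        # when the model is later loaded or served.)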
+        signature = mlflow.models.signature.infer_signature(train_piece_counts, model.predict(train_piece_counts))
+        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
+        if tracking_url_type_store != 'file':
+            mlflow.keras.log_model(model, 'lego-model', registered_model_name='TFLegoModel',
+                                   signature=signature)
+        else:
+            mlflow.keras.log_model(model, 'model', signature=signature, input_example=500)
+
+
+@ex.automain
+def main(epochs, units, learning_rate):
+    train()
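+
+# Example invocation (assumes the cleaned CSVs are in the working directory
+# and the MLflow/Mongo endpoints above are reachable):
+#   python3 simple_regression_lab8.py with epochs=150 learning_rate=0.05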