From b586cbdbc494045c56555f7ac7c51bb9fb6b4bd3 Mon Sep 17 00:00:00 2001
From: PawelDopierala
Date: Wed, 29 May 2024 21:22:02 +0200
Subject: [PATCH] add pipeline dvc

---
 .gitignore             | 11 ++++++++++-
 data_processing_dvc.py | 36 ++++++++++++++++++++++++++++++++++++
 dvc.yaml               | 38 ++++++++++++++++++++++++++++++++++++++
 params.yaml            |  6 ++++++
 4 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 data_processing_dvc.py
 create mode 100644 dvc.yaml
 create mode 100644 params.yaml

diff --git a/.gitignore b/.gitignore
index 851e9c3..afd9ba7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,10 @@
-/housing_price_dataset.csv
+/data/housing_price_dataset.csv
+/hp_model.h5
+/hp_test_predictions.csv
+/hp_test_metrics.csv
+/plot_rmse.png
+/plot_mae.png
+/plot_r2.png
+/hp_train.csv
+/hp_dev.csv
+/hp_test.csv
diff --git a/data_processing_dvc.py b/data_processing_dvc.py
new file mode 100644
--- /dev/null
+++ b/data_processing_dvc.py
@@ -0,0 +1,36 @@
+"""Split the housing price dataset into train, dev and test CSV files.
+
+Usage: python data_processing_dvc.py <dataset.csv>
+"""
+import sys
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+# Fixed seed so that re-running the stage on the same input reproduces the
+# same split instead of reshuffling the rows on every run.
+RANDOM_STATE = 42
+
+
+def main(dataset_path):
+    """Read the dataset, one-hot encode 'Neighborhood' and write the splits."""
+    housing_price_dataset = pd.read_csv(dataset_path)
+    housing_price_dataset = pd.get_dummies(
+        housing_price_dataset, columns=['Neighborhood'])
+
+    # 10% of the rows go to the dev set; of the remainder, 1000 rows go to
+    # the test set and the rest to the training set.
+    hp_train_test, hp_dev = train_test_split(
+        housing_price_dataset, test_size=0.1, random_state=RANDOM_STATE)
+    hp_train, hp_test = train_test_split(
+        hp_train_test, test_size=1000, random_state=RANDOM_STATE)
+
+    hp_train.to_csv('hp_train.csv', index=False)
+    hp_dev.to_csv('hp_dev.csv', index=False)
+    hp_test.to_csv('hp_test.csv', index=False)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        sys.exit('usage: python data_processing_dvc.py <dataset.csv>')
+    main(sys.argv[1])
diff --git a/dvc.yaml b/dvc.yaml
new file mode 100644
index 0000000..a520cad
--- /dev/null
+++ b/dvc.yaml
@@ -0,0 +1,38 @@
+stages:
+  data_processing:
+    cmd: python data_processing_dvc.py data/housing_price_dataset.csv
+    deps:
+      - data_processing_dvc.py
+      - data/housing_price_dataset.csv
+    outs:
+      - hp_train.csv
+      - hp_dev.csv
+      - hp_test.csv
+  create_model:
+    cmd: python create_model.py ${create_model.epochs} ${create_model.learning_rate} ${create_model.batch_size}
+    deps:
+      - create_model.py
+      - helper.py
+      - hp_train.csv
+      - hp_dev.csv
+    outs:
+      - hp_model.h5
+    params:
+      - create_model.epochs
+      - create_model.learning_rate
+      - create_model.batch_size
+  evaluate:
+    cmd: python evaluate.py ${evaluate.build_number}
+    deps:
+      - evaluate.py
+      - helper.py
+      - hp_model.h5
+      - hp_test.csv
+    outs:
+      - hp_test_predictions.csv
+      - hp_test_metrics.csv
+      - plot_rmse.png
+      - plot_mae.png
+      - plot_r2.png
+    params:
+      - evaluate.build_number
diff --git a/params.yaml b/params.yaml
new file mode 100644
--- /dev/null
+++ b/params.yaml
@@ -0,0 +1,6 @@
+create_model:
+  epochs: 20
+  learning_rate: 0.001
+  batch_size: 32
+evaluate:
+  build_number: 0