From 75a3a6e6c7876ebcf0a4daf6c6190e22e54125c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krzysztof=20Raczy=C5=84ski?= <krzysiek.r@onet.pl>
Date: Tue, 21 May 2024 19:47:48 +0200
Subject: [PATCH] IUM_08

---
 mlflow/MLProject         | 12 ++++++++++++
 mlflow/conda.yaml        | 13 +++++++++++++
 mlflow/mlflow_model.py   | 39 +++++++++++++++++++++++++++++++++++++++
 mlflow/mlflow_predict.py | 23 +++++++++++++++++++++++
 4 files changed, 87 insertions(+)
 create mode 100644 mlflow/MLProject
 create mode 100644 mlflow/conda.yaml
 create mode 100644 mlflow/mlflow_model.py
 create mode 100644 mlflow/mlflow_predict.py

diff --git a/mlflow/MLProject b/mlflow/MLProject
new file mode 100644
index 0000000..34a123e
--- /dev/null
+++ b/mlflow/MLProject
@@ -0,0 +1,12 @@
+name: Car Price Prediction  # MLflow project name
+
+conda_env: conda.yaml  # environment spec, resolved relative to this file
+
+entry_points:
+  main:  # training entry point
+    parameters:
+      epochs: {type: int, default: 20}
+      batch_size: {type: int, default: 32}
+    command: "python mlflow_model.py {epochs} {batch_size}"  # passed positionally; the script reads sys.argv[1] and sys.argv[2]
+  predict:  # inference entry point; declares no parameters
+    command: "python mlflow_predict.py"
diff --git a/mlflow/conda.yaml b/mlflow/conda.yaml
new file mode 100644
index 0000000..90d4ded
--- /dev/null
+++ b/mlflow/conda.yaml
@@ -0,0 +1,13 @@
+name: car_price_env  # environment referenced by conda_env in the MLProject file
+channels:
+  - defaults  # conda's built-in channel alias is "defaults"; "default" is not a channel
+dependencies:
+  - python=3.8
+  - pip  # pip must be a conda dependency so conda can run the pip: section below
+  - pip:
+    - pandas
+    - numpy
+    - scikit-learn
+    - tensorflow
+    - mlflow
+    - h5py
diff --git a/mlflow/mlflow_model.py b/mlflow/mlflow_model.py
new file mode 100644
index 0000000..eb434fa
--- /dev/null
+++ b/mlflow/mlflow_model.py
@@ -0,0 +1,64 @@
+"""Train a Keras regressor for used-car selling prices and log it to MLflow.
+
+Usage: python mlflow_model.py <epochs> <batch_size>
+"""
+import mlflow
+import mlflow.keras
+import pandas as pd
+import numpy as np
+from tensorflow.keras import Sequential
+from tensorflow.keras.layers import Dense
+from sklearn.preprocessing import MinMaxScaler
+import sys
+
+
+def main():
+    """Fit the network on the training CSV inside a single MLflow run.
+
+    Reads ``epochs`` and ``batch_size`` from ``sys.argv[1]`` and
+    ``sys.argv[2]``; raises ``IndexError``/``ValueError`` if they are
+    missing or not integers.
+    """
+    # Parameters from the command line
+    epochs = int(sys.argv[1])
+    batch_size = int(sys.argv[2])
+
+    # The context manager closes the run even when training raises, marking
+    # it FAILED instead of leaving it open.
+    with mlflow.start_run():
+        # Log hyper-parameters first so a run that fails is still identifiable.
+        mlflow.log_param("epochs", epochs)
+        mlflow.log_param("batch_size", batch_size)
+
+        train_data = pd.read_csv('./data/car_prices_train.csv')
+        train_data.dropna(inplace=True)
+        y_train = train_data['sellingprice'].astype(np.float32)
+        # Copy the slice: assigning into a view of train_data triggers
+        # pandas' SettingWithCopyWarning and may not modify the intended frame.
+        X_train = train_data[['year', 'condition', 'transmission']].copy()
+
+        scaler_x = MinMaxScaler()
+        X_train['condition'] = scaler_x.fit_transform(X_train[['condition']])
+
+        scaler_y = MinMaxScaler()
+        y_train = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
+        # One-hot columns may be bool and 'year' int; cast everything to one
+        # float dtype so Keras receives a numeric array, not an object array.
+        X_train = pd.get_dummies(X_train, columns=['transmission'])
+        X_train = X_train.astype(np.float32)
+
+        model = Sequential([
+            Dense(64, activation='relu'),
+            Dense(32, activation='relu'),
+            Dense(1),
+        ])
+        model.compile(optimizer='adam', loss='mean_squared_error')
+
+        # Training the model with MLflow tracking
+        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
+
+        mlflow.keras.log_model(model, "model")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mlflow/mlflow_predict.py b/mlflow/mlflow_predict.py
new file mode 100644
index 0000000..29f9c33
--- /dev/null
+++ b/mlflow/mlflow_predict.py
@@ -0,0 +1,60 @@
+"""Predict used-car selling prices with the model trained by mlflow_model.py.
+
+Usage: python mlflow_predict.py [model_uri]
+"""
+import sys
+
+import mlflow.keras
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+
+FEATURES = ['year', 'condition', 'transmission']
+
+
+def main():
+    """Write de-normalised predictions for the test CSV to a CSV file.
+
+    The optional first command-line argument is the model location passed to
+    ``mlflow.keras.load_model`` (for example ``runs:/<run_id>/model``); it
+    defaults to ``"model"``, the value that was previously hard-coded.
+    """
+    model_uri = sys.argv[1] if len(sys.argv) > 1 else "model"
+    model = mlflow.keras.load_model(model_uri)
+
+    # The network was trained on values scaled with the *training* min/max,
+    # so both scalers and the one-hot column layout must be rebuilt from the
+    # training CSV; fitting them on the test set would shift every prediction.
+    train_data = pd.read_csv('./data/car_prices_train.csv')
+    train_data.dropna(inplace=True)
+    y_train = train_data['sellingprice'].astype(np.float32)
+    X_train = train_data[FEATURES]
+
+    scaler_y = MinMaxScaler()
+    scaler_y.fit(y_train.values.reshape(-1, 1))
+
+    scaler_X = MinMaxScaler()
+    scaler_X.fit(X_train[['condition']])
+    train_columns = pd.get_dummies(X_train, columns=['transmission']).columns
+
+    test_data = pd.read_csv('./data/car_prices_test.csv')
+    test_data.dropna(inplace=True)
+    # Copy the slice so the column assignment below does not write into a
+    # view of test_data (pandas SettingWithCopyWarning).
+    X_test = test_data[FEATURES].copy()
+
+    X_test['condition'] = scaler_X.transform(X_test[['condition']])
+    X_test = pd.get_dummies(X_test, columns=['transmission'])
+    # Align with the training layout: add categories missing from the test
+    # set as zeros, drop unseen ones, and fix the column order.
+    X_test = X_test.reindex(columns=train_columns, fill_value=0)
+    X_test = X_test.astype(np.float32)
+
+    y_pred_scaled = model.predict(X_test)
+    y_pred = scaler_y.inverse_transform(y_pred_scaled)
+    y_pred_df = pd.DataFrame(y_pred, columns=['PredictedSellingPrice'])
+    y_pred_df.to_csv('predicted_selling_prices.csv', index=False)
+
+
+if __name__ == "__main__":
+    main()