From ed9927d7a1455a28290e635a3dfa4ba0824fe946 Mon Sep 17 00:00:00 2001
From: Alicja Szulecka <73056579+AliSzu@users.noreply.github.com>
Date: Mon, 6 May 2024 17:27:28 +0200
Subject: [PATCH] mlflow

---
 mlflow/MLProject            |  13 ++++
 mlflow/conda,yaml           |  14 +++++
 mlflow/mlflow_model.py      | 120 ++++++++++++++++++++++++++++++++++++
 mlflow/mlflow_prediction.py |  95 ++++++++++++++++++++++++++++
 4 files changed, 242 insertions(+)
 create mode 100644 mlflow/MLProject
 create mode 100644 mlflow/conda,yaml
 create mode 100644 mlflow/mlflow_model.py
 create mode 100644 mlflow/mlflow_prediction.py

diff --git a/mlflow/MLProject b/mlflow/MLProject
new file mode 100644
index 0000000..186abe5
--- /dev/null
+++ b/mlflow/MLProject
@@ -0,0 +1,13 @@
+name: mlflow_464914
+
+conda_env: conda.yaml # Path to the conda.yaml file that defines the environment (NOTE: this patch adds the file as "conda,yaml"; rename it so this path resolves)
+# docker_env:
+#  image: mlflow-docker-example-environment
+
+entry_points:
+  main:
+    parameters:
+      epochs: {type: int, default: 10}
+    command: "python mlflow_model.py {epochs}"
+  test:
+    command: "python mlflow_prediction.py"
\ No newline at end of file
diff --git a/mlflow/conda,yaml b/mlflow/conda,yaml
new file mode 100644
index 0000000..ec4614e
--- /dev/null
+++ b/mlflow/conda,yaml
@@ -0,0 +1,14 @@
+name: mlflow_464914
+channels:
+  - defaults
+dependencies:
+  - python=3.6 # These dependencies are installed with conda install
+  - pip
+  - pip: # These, in turn, are installed with pip install
+    - scikit-learn==0.23.2
+    - mlflow>=1.0
+    - kaggle
+    - pandas
+    - numpy
+    - torch
+
diff --git a/mlflow/mlflow_model.py b/mlflow/mlflow_model.py
new file mode 100644
index 0000000..d8cb6f3
--- /dev/null
+++ b/mlflow/mlflow_model.py
@@ -0,0 +1,120 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, Dataset
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+import torch.nn.functional as F
+import mlflow
+import mlflow.sklearn
+import sys
+
+# Send all runs to the local MLflow tracking server, grouped under one experiment.
+mlflow.set_tracking_uri("http://localhost:5000")
+mlflow.set_experiment("s464914")
+
+# Use the GPU when PyTorch reports CUDA support; fall back to the CPU otherwise.
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
+
+class Model(nn.Module):
+    """Classifier whose forward pass is ``relu(fc1(x))``; bn1, fc2, bn2 and out are registered but unused."""
+    def __init__(self, input_features=54, hidden_layer1=25, hidden_layer2=30, output_features=8):
+        super().__init__()
+        # Only fc1 takes part in forward(); it maps the inputs straight to the outputs.
+        self.fc1 = nn.Linear(input_features, output_features)
+        self.bn1 = nn.BatchNorm1d(hidden_layer1)
+        self.fc2 = nn.Linear(hidden_layer1, hidden_layer2)
+        self.bn2 = nn.BatchNorm1d(hidden_layer2)
+        self.out = nn.Linear(hidden_layer2, output_features)
+
+    def forward(self, x):
+        """Return the ReLU of the fc1 projection of ``x`` (no batch norm is applied)."""
+        return F.relu(self.fc1(x))
+
+def main():
+    """Train ``Model`` on the forest cover data and save its weights to ``model.pth``.
+
+    The epoch count comes from the first command-line argument and falls back to
+    10, the default declared in MLProject, when the script is run without one.
+    The epoch count and the per-epoch losses and validation accuracy are logged
+    to an MLflow run.
+    """
+    epochs = int(sys.argv[1]) if len(sys.argv) > 1 else 10
+    forest_train = pd.read_csv('forest_train.csv')
+    forest_val = pd.read_csv('forest_val.csv')
+
+    print(forest_train.head())
+
+    X_train = forest_train.drop(columns=['Cover_Type']).values
+    y_train = forest_train['Cover_Type'].values
+    X_val = forest_val.drop(columns=['Cover_Type']).values
+    y_val = forest_val['Cover_Type'].values
+
+    # Initialize model, loss function, and optimizer
+    model = Model().to(device)
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.Adam(model.parameters(), lr=0.001)
+
+    # Convert to PyTorch tensors that already live on the target device
+    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
+    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
+    X_val = torch.tensor(X_val, dtype=torch.float32).to(device)
+    y_val = torch.tensor(y_val, dtype=torch.long).to(device)
+
+    # TensorDataset indexes the tensors directly instead of building a Python
+    # list with one (features, label) tuple per row.
+    train_set = torch.utils.data.TensorDataset(X_train, y_train)
+    val_set = torch.utils.data.TensorDataset(X_val, y_val)
+    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
+    val_loader = DataLoader(val_set, batch_size=64)
+
+    with mlflow.start_run():
+        # Log the hyperparameter first so it is recorded even if training fails.
+        mlflow.log_param("epochs", epochs)
+
+        for epoch in range(epochs):
+            model.train()  # Set model to training mode
+            running_loss = 0.0
+            for inputs, labels in train_loader:
+                # Batches are already on `device` because the source tensors are.
+                optimizer.zero_grad()
+                outputs = model(inputs)
+                loss = criterion(outputs, labels)
+                loss.backward()
+                optimizer.step()
+                # Weight by batch size so the epoch value is a per-sample mean.
+                running_loss += loss.item() * inputs.size(0)
+            epoch_loss = running_loss / len(train_loader.dataset)
+
+            model.eval()  # Set model to evaluation mode
+            val_running_loss = 0.0
+            correct = 0
+            total = 0
+            with torch.no_grad():
+                for inputs, labels in val_loader:
+                    outputs = model(inputs)
+                    val_loss = criterion(outputs, labels)
+                    val_running_loss += val_loss.item() * inputs.size(0)
+                    _, predicted = torch.max(outputs, 1)
+                    total += labels.size(0)
+                    correct += (predicted == labels).sum().item()
+            val_epoch_loss = val_running_loss / len(val_loader.dataset)
+            val_accuracy = correct / total
+
+            print(f"Epoch {epoch+1}/{epochs}, "
+                f"Train Loss: {epoch_loss:.4f}, "
+                f"Val Loss: {val_epoch_loss:.4f}, "
+                f"Val Accuracy: {val_accuracy:.4f}")
+            mlflow.log_metric("train_loss", epoch_loss, step=epoch + 1)
+            mlflow.log_metric("val_loss", val_epoch_loss, step=epoch + 1)
+            mlflow.log_metric("val_accuracy", val_accuracy, step=epoch + 1)
+
+        torch.save(model.state_dict(), 'model.pth')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mlflow/mlflow_prediction.py b/mlflow/mlflow_prediction.py
new file mode 100644
index 0000000..6cb3c30
--- /dev/null
+++ b/mlflow/mlflow_prediction.py
@@ -0,0 +1,95 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, Dataset
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+import torch.nn.functional as F
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
+import numpy as np
+import mlflow
+import mlflow.sklearn
+
+# Report to the local MLflow tracking server under the "s464914" experiment.
+mlflow.set_tracking_uri("http://localhost:5000")
+mlflow.set_experiment("s464914")
+
+# Run on the GPU when CUDA is available, otherwise on the CPU.
+device = "cpu"
+if torch.cuda.is_available():
+    device = "cuda"
+
+class Model(nn.Module):
+    """Network whose forward pass is ``relu(fc1(x))``; bn1, fc2, bn2 and out are registered but never called."""
+    def __init__(self, input_features=54, hidden_layer1=25, hidden_layer2=30, output_features=8):
+        super().__init__()
+        self.fc1 = nn.Linear(input_features, output_features)
+        self.bn1 = nn.BatchNorm1d(hidden_layer1)
+        self.fc2 = nn.Linear(hidden_layer1, hidden_layer2)
+        self.bn2 = nn.BatchNorm1d(hidden_layer2)
+        self.out = nn.Linear(hidden_layer2, output_features)
+
+    def forward(self, x):
+        return F.relu(self.fc1(x))
+
+def load_model(model, model_path):
+    """Load the state dict at ``model_path`` into ``model`` (remapped to ``device``) and switch it to eval mode."""
+    model.load_state_dict(torch.load(model_path, map_location=device))
+    model.eval()
+def predict(model, input_data):
+    """Return the index of the largest model output for one sample.
+
+    ``input_data`` is passed to ``model`` as-is, and the maximum is taken along
+    dimension 0 of the output, so a single unbatched sample is expected.
+    """
+    with torch.no_grad():  # inference only, no autograd bookkeeping
+        scores = model(input_data)
+    best_index = torch.max(scores, 0)[1]
+    return best_index.item()
+
+def main():
+    """Evaluate the saved model on ``forest_test.csv`` and log the metrics to MLflow.
+
+    Also writes one "predicted: <class> true_label: <class>" line per test row to ``predictions.txt``.
+    """
+    with mlflow.start_run():
+        forest_test = pd.read_csv('forest_test.csv')
+        X_test = forest_test.drop(columns=['Cover_Type']).values
+        y_test = forest_test['Cover_Type'].values
+        X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
+
+        model = Model().to(device)
+        model_path = 'model.pth'  # Path to your saved model file
+        load_model(model, model_path)
+
+        # Numeric predictions feed the metrics, formatted strings feed the report;
+        # every true label is kept so both metric inputs have one entry per row.
+        predictions = []
+        true_labels = []
+        report_lines = []
+        with torch.no_grad():
+            for input_data, target in zip(X_test, y_test):
+                output = model(input_data)
+                predicted_class = torch.max(output, 0)[1].item()
+                predictions.append(predicted_class)
+                true_labels.append(target)
+                report_lines.append(f"predicted: {predicted_class} true_label: {target}\n")
+
+        with open(r'predictions.txt', 'w') as fp:
+            fp.writelines(report_lines)
+
+        accuracy = accuracy_score(true_labels, predictions)
+        precision_micro = precision_score(true_labels, predictions, average='micro')
+        recall_micro = recall_score(true_labels, predictions, average='micro')
+        f1_micro = f1_score(true_labels, predictions, average='micro')
+        rmse = np.sqrt(mean_squared_error(true_labels, predictions))
+
+        mlflow.log_metric("accuracy", accuracy)
+        mlflow.log_metric("precision_micro", precision_micro)
+        mlflow.log_metric("recall_micro", recall_micro)
+        mlflow.log_metric("f1_micro", f1_micro)
+        mlflow.log_metric("rmse", rmse)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file