mlflow

2024-05-06 17:27:28 +02:00 · 2024-05-06 17:27:28 +02:00 · ed9927d7a1
commit ed9927d7a1
parent 8ab682be76
4 changed files with 242 additions and 0 deletions
--- a/mlflow/MLProject
+++ b/mlflow/MLProject
@ -0,0 +1,13 @@
 name: mlflow_464914
 conda_env: conda.yaml # path to the conda.yaml file that defines the environment
 # docker_env:
 #  image: mlflow-docker-example-environment
 entry_points:
  main:
    parameters:
      epochs: {type: int, default: 10}
    command: "python mlflow_model.py {epochs}"
  test:
    command: "python mlflow_prediction.py"
--- a/mlflow/conda,yaml
+++ b/mlflow/conda,yaml
@ -0,0 +1,14 @@
 name: mlflow_464914
 channels:
  - defaults
 dependencies:
  - python=3.6 # these dependencies are installed with conda install
  - pip
  - pip: # and these are installed with pip install
    - scikit-learn==0.23.2
    - mlflow>=1.0
    - kaggle
    - pandas
    - numpy
    - torch
--- a/mlflow/mlflow_model.py
+++ b/mlflow/mlflow_model.py
@ -0,0 +1,120 @@
 import torch
 import torch.nn as nn
 import torch.optim as optim
 from torch.utils.data import DataLoader, Dataset
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 import torch.nn.functional as F
 import mlflow
 import mlflow.sklearn
 import sys
 # Send all runs to the MLflow tracking server at localhost:5000,
 # grouped under the "s464914" experiment.
 mlflow.set_tracking_uri("http://localhost:5000")
 mlflow.set_experiment("s464914")

 # Use the GPU when CUDA is available, otherwise fall back to the CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 class Model(nn.Module):
    """Classifier whose forward pass is one linear layer followed by ReLU.

    ``bn1``, ``fc2``, ``bn2`` and ``out`` are constructed, and therefore are
    part of the state dict, but ``forward`` does not call them.
    """

    def __init__(self, input_features=54, hidden_layer1=25, hidden_layer2=30, output_features=8):
        super().__init__()
        # The only layer forward() uses: maps the inputs directly to the outputs.
        self.fc1 = nn.Linear(in_features=input_features, out_features=output_features)
        # Registered but bypassed by forward().
        self.bn1 = nn.BatchNorm1d(hidden_layer1)
        self.fc2 = nn.Linear(in_features=hidden_layer1, out_features=hidden_layer2)
        self.bn2 = nn.BatchNorm1d(hidden_layer2)
        self.out = nn.Linear(in_features=hidden_layer2, out_features=output_features)

    def forward(self, x):
        """Return ``relu(fc1(x))``."""
        projected = self.fc1(x)
        return F.relu(projected)
 def main():
    """Train ``Model`` on the forest data and record the run in MLflow.

    The number of epochs is taken from the first command-line argument and
    defaults to 10 when no argument is given. The function reads
    ``forest_train.csv`` and ``forest_val.csv`` (label column ``Cover_Type``),
    trains with Adam and cross-entropy loss, prints and logs the per-epoch
    training loss, validation loss and validation accuracy, and finally
    writes the trained weights to ``model.pth``.

    Raises:
        ValueError: If the first command-line argument is not an integer.
    """
    # Fall back to a default instead of failing with IndexError when the
    # script is started without arguments.
    epochs = int(sys.argv[1]) if len(sys.argv) > 1 else 10

    forest_train = pd.read_csv('forest_train.csv')
    forest_val = pd.read_csv('forest_val.csv')
    print(forest_train.head())

    def to_tensors(frame):
        """Split a frame into (features, labels) tensors placed on ``device``."""
        features = frame.drop(columns=['Cover_Type']).values
        targets = frame['Cover_Type'].values
        return (
            torch.tensor(features, dtype=torch.float32).to(device),
            torch.tensor(targets, dtype=torch.long).to(device),
        )

    X_train, y_train = to_tensors(forest_train)
    X_val, y_val = to_tensors(forest_val)

    # Initialize model, loss function, and optimizer
    model = Model().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # TensorDataset indexes the tensors directly instead of materializing a
    # Python list holding one tuple per sample.
    train_loader = DataLoader(
        torch.utils.data.TensorDataset(X_train, y_train), batch_size=64, shuffle=True
    )
    val_loader = DataLoader(torch.utils.data.TensorDataset(X_val, y_val), batch_size=64)

    with mlflow.start_run():
        # Log the hyperparameter first so it is recorded even if training is
        # interrupted part-way through.
        mlflow.log_param("epochs", epochs)

        for epoch in range(epochs):
            model.train()  # Set model to training mode
            running_loss = 0.0
            # The tensors already live on `device`, so batches need no transfer.
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                loss = criterion(model(inputs), labels)
                loss.backward()
                optimizer.step()
                # The criterion returns a batch mean; weight it by the batch
                # size so the epoch value is a per-sample mean.
                running_loss += loss.item() * inputs.size(0)
            epoch_loss = running_loss / len(train_loader.dataset)

            model.eval()  # Set model to evaluation mode
            val_running_loss = 0.0
            correct = 0
            total = 0
            with torch.no_grad():
                for inputs, labels in val_loader:
                    outputs = model(inputs)
                    val_loss = criterion(outputs, labels)
                    val_running_loss += val_loss.item() * inputs.size(0)
                    _, predicted = torch.max(outputs, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
            val_epoch_loss = val_running_loss / len(val_loader.dataset)
            val_accuracy = correct / total

            print(f"Epoch {epoch+1}/{epochs}, "
                f"Train Loss: {epoch_loss:.4f}, "
                f"Val Loss: {val_epoch_loss:.4f}, "
                f"Val Accuracy: {val_accuracy:.4f}")

            # Keep the learning curves in the tracking server, not only on stdout.
            step = epoch + 1
            mlflow.log_metric("train_loss", epoch_loss, step=step)
            mlflow.log_metric("val_loss", val_epoch_loss, step=step)
            mlflow.log_metric("val_accuracy", val_accuracy, step=step)

        torch.save(model.state_dict(), 'model.pth')
 # Start training only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/mlflow/mlflow_prediction.py
+++ b/mlflow/mlflow_prediction.py
@ -0,0 +1,95 @@
 import torch
 import torch.nn as nn
 import torch.optim as optim
 from torch.utils.data import DataLoader, Dataset
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 import torch.nn.functional as F
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
 import numpy as np
 import mlflow
 import mlflow.sklearn
 # Send all runs to the MLflow tracking server at localhost:5000,
 # grouped under the "s464914" experiment.
 mlflow.set_tracking_uri("http://localhost:5000")
 mlflow.set_experiment("s464914")

 # Use the GPU when CUDA is available, otherwise fall back to the CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 class Model(nn.Module):
    """Classifier whose forward pass is one linear layer followed by ReLU.

    ``bn1``, ``fc2``, ``bn2`` and ``out`` are constructed, and therefore are
    part of the state dict, but ``forward`` does not call them.
    """

    def __init__(self, input_features=54, hidden_layer1=25, hidden_layer2=30, output_features=8):
        super().__init__()
        # The only layer forward() uses: maps the inputs directly to the outputs.
        self.fc1 = nn.Linear(in_features=input_features, out_features=output_features)
        # Registered but bypassed by forward().
        self.bn1 = nn.BatchNorm1d(hidden_layer1)
        self.fc2 = nn.Linear(in_features=hidden_layer1, out_features=hidden_layer2)
        self.bn2 = nn.BatchNorm1d(hidden_layer2)
        self.out = nn.Linear(in_features=hidden_layer2, out_features=output_features)

    def forward(self, x):
        """Return ``relu(fc1(x))``."""
        projected = self.fc1(x)
        return F.relu(projected)
 def load_model(model, model_path):
    """Load saved weights from ``model_path`` into ``model`` and set eval mode.

    Args:
        model: Module whose parameter layout matches the saved state dict.
        model_path: Path to a state dict file written with ``torch.save``.

    The stored tensors are mapped onto the device the model's parameters
    already live on, so a checkpoint saved on a GPU machine can still be
    loaded on a CPU-only one.
    """
    first_param = next(model.parameters(), None)
    # With no parameters there is no device to infer; None keeps torch.load's default.
    target_device = first_param.device if first_param is not None else None
    state_dict = torch.load(model_path, map_location=target_device)
    model.load_state_dict(state_dict)
    model.eval()
 def predict(model, input_data):
    """Return the index of the largest model output for ``input_data``.

    The forward pass runs with gradient tracking disabled, and the maximum
    is taken along dimension 0 of the model output.
    """
    with torch.no_grad():
        scores = model(input_data)
    best_index = torch.max(scores, 0)[1]
    return best_index.item()
 def main():
    """Evaluate the saved model on the test set and log the metrics to MLflow.

    Reads ``forest_test.csv`` (label column ``Cover_Type``), loads the weights
    from ``model.pth``, predicts a class for every row, writes one
    ``predicted: <class> true_label: <class>`` line per row to
    ``predictions.txt``, and logs accuracy, micro-averaged precision, recall
    and F1, and RMSE to a new MLflow run.
    """
    with mlflow.start_run():
        forest_test = pd.read_csv('forest_test.csv')
        features = forest_test.drop(columns=['Cover_Type']).values
        true_labels = forest_test['Cover_Type'].values
        X_test = torch.tensor(features, dtype=torch.float32).to(device)

        model = Model().to(device)
        load_model(model, 'model.pth')

        # Score the whole test set in one forward pass and keep the predicted
        # class ids as integers, so they line up one-to-one with true_labels.
        with torch.no_grad():
            _, predicted = torch.max(model(X_test), 1)
        predictions = predicted.cpu().numpy()

        with open('predictions.txt', 'w') as fp:
            fp.writelines(
                f"predicted: {pred} true_label: {target}\n"
                for pred, target in zip(predictions, true_labels)
            )

        # Metrics compare class ids with class ids, not the formatted strings.
        accuracy = accuracy_score(true_labels, predictions)
        precision_micro = precision_score(true_labels, predictions, average='micro')
        recall_micro = recall_score(true_labels, predictions, average='micro')
        f1_micro = f1_score(true_labels, predictions, average='micro')
        rmse = float(np.sqrt(mean_squared_error(true_labels, predictions)))

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision_micro", precision_micro)
        mlflow.log_metric("recall_micro", recall_micro)
        mlflow.log_metric("f1_micro", f1_micro)
        mlflow.log_metric("rmse", rmse)
 # Run the evaluation only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()