From ed9927d7a1455a28290e635a3dfa4ba0824fe946 Mon Sep 17 00:00:00 2001 From: Alicja Szulecka <73056579+AliSzu@users.noreply.github.com> Date: Mon, 6 May 2024 17:27:28 +0200 Subject: [PATCH] mlflow --- mlflow/MLProject | 13 ++++ mlflow/conda.yaml | 14 +++++ mlflow/mlflow_model.py | 120 ++++++++++++++++++++++++++++++++++++ mlflow/mlflow_prediction.py | 95 ++++++++++++++++++++++++++++ 4 files changed, 242 insertions(+) create mode 100644 mlflow/MLProject create mode 100644 mlflow/conda.yaml create mode 100644 mlflow/mlflow_model.py create mode 100644 mlflow/mlflow_prediction.py diff --git a/mlflow/MLProject b/mlflow/MLProject new file mode 100644 index 0000000..186abe5 --- /dev/null +++ b/mlflow/MLProject @@ -0,0 +1,13 @@ +name: mlflow_464914 + +conda_env: conda.yaml #ścieżka do pliku conda.yaml z definicją środowisk +# docker_env: +# image: mlflow-docker-example-environment + +entry_points: + main: + parameters: + epochs: {type: int, default: 10} + command: "python mlflow_model.py {epochs}" + test: + command: "python mlflow_prediction.py" \ No newline at end of file diff --git a/mlflow/conda.yaml b/mlflow/conda.yaml new file mode 100644 index 0000000..ec4614e --- /dev/null +++ b/mlflow/conda.yaml @@ -0,0 +1,14 @@ +name: mlflow_464914 +channels: + - defaults +dependencies: + - python=3.6 #Te zależności będą zainstalowane za pomocą conda install + - pip + - pip: #Te zaś za pomocą pip install + - scikit-learn==0.23.2 + - mlflow>=1.0 + - kaggle + - pandas + - numpy + - torch + diff --git a/mlflow/mlflow_model.py b/mlflow/mlflow_model.py new file mode 100644 index 0000000..d8cb6f3 --- /dev/null +++ b/mlflow/mlflow_model.py @@ -0,0 +1,120 @@ +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader, Dataset +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder +import torch.nn.functional as F +import mlflow +import mlflow.sklearn +import sys + 
# MLflow tracking server and experiment that the training run is recorded under.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "s464914"

# Used when the script is started without a command-line argument; matches the
# default declared for the `epochs` parameter of the MLProject `main` entry point.
DEFAULT_EPOCHS = 10

BATCH_SIZE = 64

# Device on which the model and every batch are placed.
device = "cuda" if torch.cuda.is_available() else "cpu"


class Model(nn.Module):
    """Classifier whose forward pass is a single linear layer followed by ReLU.

    NOTE: only ``fc1`` takes part in ``forward``; it maps ``input_features``
    directly to ``output_features``.  ``bn1``, ``fc2``, ``bn2`` and ``out`` are
    constructed but never applied.  They are kept so that the keys and shapes
    of ``state_dict()`` (and therefore the saved ``model.pth``) stay unchanged.
    """

    def __init__(self, input_features=54, hidden_layer1=25, hidden_layer2=30, output_features=8):
        super().__init__()
        self.fc1 = nn.Linear(input_features, output_features)
        self.bn1 = nn.BatchNorm1d(hidden_layer1)
        self.fc2 = nn.Linear(hidden_layer1, hidden_layer2)
        self.bn2 = nn.BatchNorm1d(hidden_layer2)
        self.out = nn.Linear(hidden_layer2, output_features)

    def forward(self, x):
        """Return ``relu(fc1(x))``, one score per output feature."""
        return F.relu(self.fc1(x))


def _make_loader(frame, shuffle):
    """Wrap a data frame with a ``Cover_Type`` target column in a DataLoader."""
    from torch.utils.data import TensorDataset

    features = torch.tensor(frame.drop(columns=['Cover_Type']).values, dtype=torch.float32)
    labels = torch.tensor(frame['Cover_Type'].values, dtype=torch.long)
    # TensorDataset indexes the two tensors directly instead of materialising
    # a Python list holding one (features, label) tuple per row.
    return DataLoader(TensorDataset(features, labels), batch_size=BATCH_SIZE, shuffle=shuffle)


def _train_one_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader``; return the mean sample loss."""
    model.train()
    running_loss = 0.0
    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size.
        running_loss += loss.item() * inputs.size(0)
    return running_loss / len(loader.dataset)


def _evaluate(model, loader, criterion):
    """Return ``(mean sample loss, accuracy)`` of ``model`` over ``loader``."""
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            running_loss += criterion(outputs, labels).item() * inputs.size(0)

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return running_loss / len(loader.dataset), correct / total


def main():
    """Train ``Model`` on ``forest_train.csv`` and record the run in MLflow.

    The number of epochs is read from the first command-line argument and
    falls back to ``DEFAULT_EPOCHS`` when no argument is given.  After every
    epoch the training loss, validation loss and validation accuracy are
    printed and logged as MLflow metrics.  The trained weights are written to
    ``model.pth``.
    """
    epochs = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_EPOCHS

    # Configured here rather than at import time so that importing this module
    # (e.g. to reuse Model) does not contact the tracking server.
    mlflow.set_tracking_uri(TRACKING_URI)
    mlflow.set_experiment(EXPERIMENT_NAME)

    forest_train = pd.read_csv('forest_train.csv')
    forest_val = pd.read_csv('forest_val.csv')

    print(forest_train.head())

    model = Model().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_loader = _make_loader(forest_train, shuffle=True)
    val_loader = _make_loader(forest_val, shuffle=False)

    with mlflow.start_run():
        # Logged first so the parameter is recorded even if training fails.
        mlflow.log_param("epochs", epochs)

        for epoch in range(epochs):
            epoch_loss = _train_one_epoch(model, train_loader, criterion, optimizer)
            val_epoch_loss, val_accuracy = _evaluate(model, val_loader, criterion)

            mlflow.log_metric("train_loss", epoch_loss, step=epoch)
            mlflow.log_metric("val_loss", val_epoch_loss, step=epoch)
            mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)

            print(f"Epoch {epoch+1}/{epochs}, "
                  f"Train Loss: {epoch_loss:.4f}, "
                  f"Val Loss: {val_epoch_loss:.4f}, "
                  f"Val Accuracy: {val_accuracy:.4f}")

        torch.save(model.state_dict(), 'model.pth')


if __name__ == "__main__":
    main()
# MLflow tracking server and experiment that the evaluation run is recorded under.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "s464914"

# Device on which the model and the test features are placed.
device = "cuda" if torch.cuda.is_available() else "cpu"


class Model(nn.Module):
    """Classifier whose forward pass is a single linear layer followed by ReLU.

    NOTE: only ``fc1`` takes part in ``forward``; it maps ``input_features``
    directly to ``output_features``.  ``bn1``, ``fc2``, ``bn2`` and ``out`` are
    constructed but never applied.  They are kept so that ``state_dict()`` has
    the keys and shapes expected by a checkpoint saved from this layout.
    """

    def __init__(self, input_features=54, hidden_layer1=25, hidden_layer2=30, output_features=8):
        super().__init__()
        self.fc1 = nn.Linear(input_features, output_features)
        self.bn1 = nn.BatchNorm1d(hidden_layer1)
        self.fc2 = nn.Linear(hidden_layer1, hidden_layer2)
        self.bn2 = nn.BatchNorm1d(hidden_layer2)
        self.out = nn.Linear(hidden_layer2, output_features)

    def forward(self, x):
        """Return ``relu(fc1(x))``, one score per output feature."""
        return F.relu(self.fc1(x))


def _model_device(model):
    """Return the device of the model's first parameter (CPU if it has none)."""
    first_param = next(model.parameters(), None)
    return first_param.device if first_param is not None else torch.device("cpu")


def load_model(model, model_path):
    """Load the state dict stored at ``model_path`` into ``model`` in place.

    The checkpoint is mapped onto the device the model currently lives on, so
    weights saved from a CUDA run can be loaded on a CPU-only host.  The model
    is switched to evaluation mode afterwards.
    """
    state = torch.load(model_path, map_location=_model_device(model))
    model.load_state_dict(state)
    model.eval()


def predict(model, input_data):
    """Return the predicted class index for a single sample.

    ``input_data`` may be a tensor or anything ``torch.as_tensor`` accepts
    (list, NumPy array); it is converted to float32 on the model's device.
    Both a bare feature vector and a batch containing exactly one sample are
    accepted.
    """
    features = torch.as_tensor(input_data, dtype=torch.float32, device=_model_device(model))

    with torch.no_grad():
        output = model(features)

    # Reduce over the class axis, which is the last one for either input shape.
    _, predicted_class = torch.max(output, dim=-1)
    return predicted_class.item()


def main():
    """Evaluate the saved model on ``forest_test.csv`` and log metrics to MLflow.

    Writes one ``predicted: <class> true_label: <class>`` line per test row to
    ``predictions.txt`` and logs accuracy, micro-averaged precision / recall /
    F1 and RMSE, all computed from the integer class labels.
    """
    mlflow.set_tracking_uri(TRACKING_URI)
    mlflow.set_experiment(EXPERIMENT_NAME)

    with mlflow.start_run():
        forest_test = pd.read_csv('forest_test.csv')

        X_test = forest_test.drop(columns=['Cover_Type']).values
        true_labels = forest_test['Cover_Type'].values.tolist()

        X_test = torch.tensor(X_test, dtype=torch.float32).to(device)

        model = Model().to(device)
        load_model(model, 'model.pth')

        # One forward pass over the whole test set; keep the numeric class ids
        # so the metrics compare labels with labels, not with report strings.
        with torch.no_grad():
            _, predicted = torch.max(model(X_test), dim=1)
        predictions = predicted.cpu().tolist()

        with open(r'predictions.txt', 'w') as fp:
            fp.writelines(
                f"predicted: {predicted_class} true_label: {target}\n"
                for predicted_class, target in zip(predictions, true_labels)
            )

        accuracy = accuracy_score(true_labels, predictions)
        precision_micro = precision_score(true_labels, predictions, average='micro')
        recall_micro = recall_score(true_labels, predictions, average='micro')
        f1_micro = f1_score(true_labels, predictions, average='micro')
        rmse = np.sqrt(mean_squared_error(true_labels, predictions))

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision_micro", precision_micro)
        mlflow.log_metric("recall_micro", recall_micro)
        mlflow.log_metric("f1_micro", f1_micro)
        mlflow.log_metric("rmse", rmse)


if __name__ == "__main__":
    main()