IUM_9

Create dvc.yaml
Update config
2024-05-28 21:52:31 +02:00 · 2024-05-28 21:43:31 +02:00 · 2024-05-28 21:34:33 +02:00 · 2024-05-28 21:26:27 +02:00 · 2024-05-28 21:22:43 +02:00 · 2024-05-28 21:22:26 +02:00
23 changed files with 661 additions and 697415 deletions
--- a/.dvc/.gitignore
+++ b/.dvc/.gitignore
@ -0,0 +1,3 @@
 /config.local
 /tmp
 /cache
--- a/.dvc/config
+++ b/.dvc/config
@ -0,0 +1,4 @@
 [core]
    remote = ium_ssh_remote
 ['remote "ium_ssh_remote"']
    url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl
--- a/.dvcignore
+++ b/.dvcignore
@ -0,0 +1,3 @@
 # Add patterns of files dvc should ignore, which could improve
 # the performance. Learn more at
 # https://dvc.org/doc/user-guide/dvcignore
--- a/.env
+++ b/.env
@ -0,0 +1,5 @@
 MONGO_INITDB_ROOT_USERNAME=admin
 MONGO_INITDB_ROOT_PASSWORD=IUM_2021
 ME_CONFIG_BASICAUTH_USERNAME=mongo_express_user
 ME_CONFIG_BASICAUTH_PASSWORD=mongo_express_pw
 MONGO_DATABASE=sacred
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
 /covtype.csv
--- a/.ipynb_checkpoints/IUM_2-checkpoint.ipynb
+++ b/.ipynb_checkpoints/IUM_2-checkpoint.ipynb
@ -1,95 +0,0 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install --user kaggle \n",
    "%pip install --user pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR: Could not find a version that satisfies the requirement git (from versions: none)\n",
      "ERROR: No matching distribution found for git\n",
      "\n",
      "[notice] A new release of pip is available: 23.1.2 -> 24.0\n",
      "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
     ]
    }
   ],
   "source": [
    "%pip install git"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!kaggle datasets download -d nasa/meteorite-landings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "!tar -xf  meteorite-landings.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/2
+++ b/2
@ -4,7 +4,7 @@ RUN apt update && apt install -y python3-pip
 RUN apt install unzip
 RUN apt install bc
-RUN pip3 install kaggle pandas scikit-learn torch matplotlib
+RUN pip3 install kaggle pandas scikit-learn torch sacred pymongo 
 WORKDIR /app
--- a/59
+++ b/59
@ -1,15 +1,9 @@
 pipeline {
    agent any
    parameters {
-            buildSelector (
+    string(name: 'KAGGLE_USERNAME', defaultValue: 'alicjaszulecka', description: 'Kaggle username')
-                defaultSelector: lastSuccessful(),
+    password(name: 'KAGGLE_KEY', defaultValue:'', description: 'Kaggle Key')
-                description: 'Build for copying artifacts',
+    string(name: 'CUTOFF', defaultValue: '100', description: 'cut off number')
                name: 'BUILD_SELECTOR'
            )
            gitParameter branchFilter: 'origin/(.*)', defaultValue: 'model', name: 'BRANCH', type: 'PT_BRANCH'
        }
        triggers {
            upstream(upstreamProjects: 's464914-training/' + params.BRANCH + '/', threshold: hudson.model.Result.SUCCESS)
    }
    stages {
        stage('Git Checkout') {
@ -17,43 +11,50 @@ pipeline {
               checkout scm
            }
        }
-     stage('Copy Artifacts') {
+         stage('Download dataset') {
      steps {
-               copyArtifacts fingerprintArtifacts: true, projectName: 'z-s464914-create-dataset', selector: buildParameter('BUILD_SELECTOR')
+        withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
-               copyArtifacts filter: '*', projectName: 's464914-training/' + params.BRANCH + '/', selector: buildParameter('BUILD_SELECTOR')
+          sh 'pip install kaggle'
-               copyArtifacts filter: '*', projectName: 's464914-evaluation/evaluation/', selector: buildParameter('BUILD_SELECTOR'), optional: true
+          sh 'kaggle datasets download -d uciml/forest-cover-type-dataset'
          sh 'unzip -o forest-cover-type-dataset.zip'
          sh 'rm forest-cover-type-dataset.zip'
        }
      }
-         stage('Prediction') {
+    }
     stage('Build') {
         steps {
            script {
                 withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                          "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
                    def customImage = docker.build("custom-image")
                    customImage.inside {
-                        sh 'python3 ./prediction.py'
+                        sh 'python3 ./IUM_2.py'
-                        archiveArtifacts artifacts: 'predictions.txt', onlyIfSuccessful: true
+                        archiveArtifacts artifacts: 'covtype.csv, forest_train.csv, forest_test.csv, forest_val.csv', onlyIfSuccessful: true
                    }
                 }
             }
         }
        stage('Metrics') {
            steps {
              script {
               def customImage = docker.build("custom-image")
                    customImage.inside {
                        sh 'python3 ./metrics.py'
                        archiveArtifacts artifacts: 'metrics.txt', onlyIfSuccessful: true
                    }
              }
            }
         }
-        stage('Plot Accuracy') {
+         stage('Train and Predict') {
            steps {
              script {
               def customImage = docker.build("custom-image")
                    customImage.inside {
-                        sh 'python3 ./plot.py'
+                        sh 'python3 ./model.py'
-                        archiveArtifacts artifacts: 'accuracy.png', onlyIfSuccessful: true
+                        sh 'python3 ./prediction.py'
                        archiveArtifacts artifacts: 'model.pth, predictions.txt', onlyIfSuccessful: true
                    }
              }
            }
        }
        stage('Experiments') {
            steps {
              script {
               def customImage = docker.build("custom-image")
                    customImage.inside {
                        sh 'python3 ./sacred_model.py'
                        archiveArtifacts artifacts: 'experiments', onlyIfSuccessful: true
                    }
              }
            }
--- a/covtype.csv
+++ b/covtype.csv
--- a/covtype.csv.dvc
+++ b/covtype.csv.dvc
@ -0,0 +1,5 @@
 outs:
 - md5: e88c3c209db2e8982e07c43462d67c87
  size: 75170064
  hash: md5
  path: covtype.csv
--- a/dvc.yaml
+++ b/dvc.yaml
@ -0,0 +1,29 @@
 # DVC pipeline: split the raw dataset, train the model, then write predictions.
 stages:
  # Splits covtype.csv into the train/test/validation CSV files.
  prepare_data:
    cmd: python ./IUM_2.py
    deps:
      # Depend on the script that `cmd` actually runs so that editing it
      # invalidates this stage.
      - IUM_2.py
      - covtype.csv
    outs:
      - forest_train.csv
      - forest_test.csv
      - forest_val.csv
  # Trains the network and saves its weights to model.pth.
  train_model:
    cmd: python ./model.py
    deps:
      - model.py
      - forest_train.csv
      - forest_test.csv
      - forest_val.csv
    outs:
      - model.pth
  # Runs the saved model on the test split and writes predictions.txt.
  evaluate_model:
    cmd: python ./prediction.py
    deps:
      - prediction.py
      - model.pth
      - forest_test.csv
    outs:
      - predictions.txt
--- a/environment.yml
+++ b/environment.yml
@ -0,0 +1,189 @@
 name: IUM
 channels:
  - defaults
 dependencies:
  - _tflow_select=2.3.0=mkl
  - abseil-cpp=20211102.0=hd77b12b_0
  - absl-py=2.1.0=py310haa95532_0
  - aiohttp=3.9.5=py310h2bbff1b_0
  - aiosignal=1.2.0=pyhd3eb1b0_0
  - alembic=1.8.1=py310haa95532_0
  - aniso8601=9.0.1=pyhd3eb1b0_0
  - arrow-cpp=11.0.0=h2c9b28c_2
  - astunparse=1.6.3=py_0
  - async-timeout=4.0.3=py310haa95532_0
  - attrs=23.1.0=py310haa95532_0
  - aws-c-common=0.4.57=ha925a31_1
  - aws-c-event-stream=0.1.6=hd77b12b_5
  - aws-checksums=0.1.9=ha925a31_0
  - aws-sdk-cpp=1.8.185=hd77b12b_0
  - bcrypt=3.2.0=py310h2bbff1b_1
  - blas=1.0=mkl
  - blinker=1.6.2=py310haa95532_0
  - boost-cpp=1.82.0=h59b6b97_2
  - bottleneck=1.3.7=py310h9128911_0
  - brotli=1.0.9=h2bbff1b_8
  - brotli-bin=1.0.9=h2bbff1b_8
  - brotli-python=1.0.9=py310hd77b12b_8
  - bzip2=1.0.8=h2bbff1b_6
  - c-ares=1.19.1=h2bbff1b_0
  - ca-certificates=2024.3.11=haa95532_0
  - cachetools=5.3.3=py310haa95532_0
  - certifi=2024.2.2=py310haa95532_0
  - cffi=1.16.0=py310h2bbff1b_1
  - charset-normalizer=2.0.4=pyhd3eb1b0_0
  - click=8.1.7=py310haa95532_0
  - cloudpickle=2.2.1=py310haa95532_0
  - colorama=0.4.6=py310haa95532_0
  - contourpy=1.2.0=py310h59b6b97_0
  - cryptography=41.0.3=py310h3438e0d_0
  - cycler=0.11.0=pyhd3eb1b0_0
  - docker-py=7.0.0=py310haa95532_0
  - entrypoints=0.4=py310haa95532_0
  - flask=2.2.5=py310haa95532_0
  - flatbuffers=2.0.0=h6c2663c_0
  - fonttools=4.51.0=py310h2bbff1b_0
  - freetype=2.12.1=ha860e81_0
  - frozenlist=1.4.0=py310h2bbff1b_0
  - gast=0.4.0=pyhd3eb1b0_0
  - gflags=2.2.2=hd77b12b_1
  - giflib=5.2.1=h8cc25b3_3
  - gitdb=4.0.7=pyhd3eb1b0_0
  - gitpython=3.1.37=py310haa95532_0
  - glog=0.5.0=hd77b12b_1
  - google-auth=2.29.0=py310haa95532_0
  - google-auth-oauthlib=0.4.4=pyhd3eb1b0_0
  - google-pasta=0.2.0=pyhd3eb1b0_0
  - graphene=3.3=py310haa95532_0
  - graphql-core=3.2.3=py310haa95532_1
  - graphql-relay=3.2.0=py310haa95532_0
  - greenlet=3.0.1=py310hd77b12b_0
  - grpc-cpp=1.48.2=hf108199_0
  - grpcio=1.48.2=py310hf108199_0
  - h5py=3.11.0=py310hed405ee_0
  - hdf5=1.12.1=h51c971a_3
  - icc_rt=2022.1.0=h6049295_2
  - icu=58.2=ha925a31_3
  - idna=3.7=py310haa95532_0
  - importlib-metadata=7.0.1=py310haa95532_0
  - intel-openmp=2023.1.0=h59b6b97_46320
  - itsdangerous=2.0.1=pyhd3eb1b0_0
  - jinja2=3.1.3=py310haa95532_0
  - joblib=1.4.0=py310haa95532_0
  - jpeg=9e=h2bbff1b_1
  - keras=2.10.0=py310haa95532_0
  - keras-preprocessing=1.1.2=pyhd3eb1b0_0
  - kiwisolver=1.4.4=py310hd77b12b_0
  - krb5=1.20.1=h5b6d351_1
  - lcms2=2.12=h83e58a3_0
  - lerc=3.0=hd77b12b_0
  - libboost=1.82.0=h3399ecb_2
  - libbrotlicommon=1.0.9=h2bbff1b_8
  - libbrotlidec=1.0.9=h2bbff1b_8
  - libbrotlienc=1.0.9=h2bbff1b_8
  - libclang=14.0.6=default_hb5a9fac_1
  - libclang13=14.0.6=default_h8e68704_1
  - libcurl=8.7.1=h86230a5_0
  - libdeflate=1.17=h2bbff1b_1
  - libevent=2.1.12=hcc03200_0
  - libffi=3.4.4=hd77b12b_1
  - libpng=1.6.39=h8cc25b3_0
  - libpq=12.15=hb652d5d_1
  - libprotobuf=3.20.3=h23ce68f_0
  - libssh2=1.10.0=hcd4344a_2
  - libthrift=0.15.0=he49ee6e_2
  - libtiff=4.5.1=hd77b12b_0
  - libwebp-base=1.3.2=h2bbff1b_0
  - lz4-c=1.9.4=h2bbff1b_1
  - mako=1.2.3=py310haa95532_0
  - markdown=3.4.1=py310haa95532_0
  - markupsafe=2.1.3=py310h2bbff1b_0
  - matplotlib=3.8.4=py310haa95532_0
  - matplotlib-base=3.8.4=py310h4ed8f06_0
  - mkl=2023.1.0=h6b88ed4_46358
  - mkl-service=2.4.0=py310h2bbff1b_1
  - mkl_fft=1.3.8=py310h2bbff1b_0
  - mkl_random=1.2.4=py310h59b6b97_0
  - mlflow=2.12.2=py310hd1fac3c_0
  - multidict=6.0.4=py310h2bbff1b_0
  - numexpr=2.8.7=py310h2cd9be0_0
  - numpy=1.26.4=py310h055cbcc_0
  - numpy-base=1.26.4=py310h65a83cf_0
  - oauthlib=3.2.2=py310haa95532_0
  - openjpeg=2.4.0=h4fc8c34_0
  - openssl=1.1.1w=h2bbff1b_0
  - opt_einsum=3.3.0=pyhd3eb1b0_1
  - orc=1.7.4=h623e30f_1
  - packaging=23.2=py310haa95532_0
  - pandas=2.2.1=py310h5da7b33_0
  - paramiko=2.8.1=pyhd3eb1b0_0
  - pillow=10.3.0=py310h2bbff1b_0
  - pip=24.0=py310haa95532_0
  - ply=3.11=py310haa95532_0
  - protobuf=3.20.3=py310hd77b12b_0
  - pyarrow=11.0.0=py310h790e06d_1
  - pyasn1=0.4.8=pyhd3eb1b0_0
  - pyasn1-modules=0.2.8=py_0
  - pybind11-abi=5=hd3eb1b0_0
  - pycparser=2.21=pyhd3eb1b0_0
  - pyjwt=2.8.0=py310haa95532_0
  - pynacl=1.5.0=py310h8cc25b3_0
  - pyopenssl=23.2.0=py310haa95532_0
  - pyqt=5.15.10=py310hd77b12b_0
  - pyqt5-sip=12.13.0=py310h2bbff1b_0
  - pysocks=1.7.1=py310haa95532_0
  - python=3.10.13=h966fe2a_0
  - python-dateutil=2.9.0post0=py310haa95532_0
  - python-flatbuffers=2.0=pyhd3eb1b0_0
  - python-tzdata=2023.3=pyhd3eb1b0_0
  - pytz=2024.1=py310haa95532_0
  - pywin32=305=py310h2bbff1b_0
  - pyyaml=6.0.1=py310h2bbff1b_0
  - qt-main=5.15.2=h6072711_9
  - querystring_parser=1.2.4=py310haa95532_0
  - re2=2022.04.01=hd77b12b_0
  - requests=2.31.0=py310haa95532_1
  - requests-oauthlib=1.3.0=py_0
  - rsa=4.7.2=pyhd3eb1b0_1
  - scikit-learn=1.4.2=py310h4ed8f06_1
  - scipy=1.13.0=py310h8640f81_0
  - setuptools=69.5.1=py310haa95532_0
  - sip=6.7.12=py310hd77b12b_0
  - six=1.16.0=pyhd3eb1b0_1
  - smmap=4.0.0=pyhd3eb1b0_0
  - snappy=1.1.10=h6c2663c_1
  - sqlalchemy=2.0.25=py310h2bbff1b_0
  - sqlite=3.45.3=h2bbff1b_0
  - sqlparse=0.4.4=py310haa95532_0
  - tbb=2021.8.0=h59b6b97_0
  - tensorboard=2.10.0=py310haa95532_0
  - tensorboard-data-server=0.6.1=py310haa95532_0
  - tensorboard-plugin-wit=1.8.1=py310haa95532_0
  - tensorflow=2.10.0=mkl_py310hd99672f_0
  - tensorflow-base=2.10.0=mkl_py310h6a7f48e_0
  - tensorflow-estimator=2.10.0=py310haa95532_0
  - termcolor=2.1.0=py310haa95532_0
  - threadpoolctl=2.2.0=pyh0d69192_0
  - tk=8.6.14=h0416ee5_0
  - tornado=6.3.3=py310h2bbff1b_0
  - typing-extensions=4.11.0=py310haa95532_0
  - typing_extensions=4.11.0=py310haa95532_0
  - tzdata=2024a=h04d1e81_0
  - unicodedata2=15.1.0=py310h2bbff1b_0
  - urllib3=2.2.1=py310haa95532_0
  - utf8proc=2.6.1=h2bbff1b_1
  - vc=14.2=h2eaa2aa_1
  - vs2015_runtime=14.29.30133=h43f2093_3
  - waitress=2.0.0=pyhd3eb1b0_0
  - websocket-client=1.8.0=py310haa95532_0
  - werkzeug=2.3.8=py310haa95532_0
  - wheel=0.43.0=py310haa95532_0
  - win_inet_pton=1.1.0=py310haa95532_0
  - wrapt=1.14.1=py310h2bbff1b_0
  - xz=5.4.6=h8cc25b3_1
  - yaml=0.2.5=he774522_0
  - yarl=1.9.3=py310h2bbff1b_0
  - zipp=3.17.0=py310haa95532_0
  - zlib=1.2.13=h8cc25b3_1
  - zstd=1.5.5=hd43e919_2
 prefix: C:\Users\Genos\miniconda3\envs\IUM
--- a/metrics.py
+++ b/metrics.py
@ -1,24 +0,0 @@
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
 import numpy as np
 # Read predictions.txt, where each line is "predicted: <p> true_label: <t>",
 # compute classification metrics and append them to metrics.txt.
 true_labels = []
 predicted_labels = []
 # Use a context manager so the file handle is closed once parsing finishes.
 with open("predictions.txt", "r") as f:
    for line in f:
        stripped = line.strip()
        if not stripped:
            # A blank (e.g. trailing) line carries no prediction; skip it.
            continue
        parts = stripped.split(' ')
        true_labels.append(int(parts[3]))
        predicted_labels.append(int(parts[1]))
 accuracy = accuracy_score(true_labels, predicted_labels)
 precision_micro = precision_score(true_labels, predicted_labels, average='micro')
 recall_micro = recall_score(true_labels, predicted_labels, average='micro')
 f1_micro = f1_score(true_labels, predicted_labels, average='micro')
 rmse = np.sqrt(mean_squared_error(true_labels, predicted_labels))
 # Append mode: each execution adds one more block of metrics to the file.
 with open(r'metrics.txt', 'a') as fp:
    fp.write(f"Accuracy: {accuracy}\n")
    fp.write(f"Precision: {precision_micro}\n")
    fp.write(f"Recall: {recall_micro}\n")
    fp.write(f"F1-score: {f1_micro}\n")
    fp.write(f"RMSE: {rmse}\n")
--- a/mlflow/Dockerfile
+++ b/mlflow/Dockerfile
@ -0,0 +1,15 @@
 FROM python:3.10
 RUN pip install --upgrade pip
 RUN pip3 install mlflow
 RUN pip3 install scikit-learn
 RUN pip3 install pandas
 RUN pip3 install numpy 
 RUN pip3 install torch
 COPY mlflow_model.py .
 COPY mlflow_prediction.py .
 COPY forest_test.csv .
 COPY forest_train.csv .
 COPY forest_val.csv .
--- a/mlflow/MLProject
+++ b/mlflow/MLProject
@ -0,0 +1,13 @@
 # MLflow project definition: a Docker-based environment with two entry points.
 name: mlflow_464914
 # conda_env: conda.yaml # path to the conda.yaml file that defines the environment
 # NOTE(review): `image` must be nested under `docker_env` for MLflow to pick it
 # up; the indentation shown here may have been altered by the diff view - confirm
 # against the real file. Also confirm the file name: MLflow documents the
 # project file as `MLproject`.
 docker_env:
 image: mlflow_image
 entry_points:
  # Training entry point; `epochs` is substituted into the command line.
  main:
    parameters:
      epochs: {type: int, default: 10}
    command: "python mlflow_model.py {epochs}"
  # Evaluation entry point; takes no parameters.
  test:
    command: "python mlflow_prediction.py"
--- a/mlflow/mlflow_model.py
+++ b/mlflow/mlflow_model.py
@ -0,0 +1,120 @@
 import torch
 import torch.nn as nn
 import torch.optim as optim
 from torch.utils.data import DataLoader, Dataset
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 import torch.nn.functional as F
 import mlflow
 import mlflow.sklearn
 import sys
 mlflow.set_tracking_uri("http://localhost:5000")
 mlflow.set_experiment("s464914")
 device = (
    "cuda"
    if torch.cuda.is_available()
    else "cpu"
 )
 class Model(nn.Module):
    """Classifier whose forward pass is a single linear layer followed by ReLU.

    The constructor also builds `bn1`, `fc2`, `bn2` and `out`, but `forward`
    never calls them.
    """
    def __init__(self, input_features=54, hidden_layer1=25, hidden_layer2=30, output_features=8):
        super().__init__()
        # fc1 maps the input features straight to the output size.
        self.fc1 = nn.Linear(input_features,output_features)
        self.bn1 = nn.BatchNorm1d(hidden_layer1)  # constructed but unused in forward()
        self.fc2 = nn.Linear(hidden_layer1, hidden_layer2)  # constructed but unused in forward()
        self.bn2 = nn.BatchNorm1d(hidden_layer2)  # constructed but unused in forward()
        self.out = nn.Linear(hidden_layer2, output_features)  # constructed but unused in forward()
    def forward(self, x):
        x = F.relu(self.fc1(x))  # ReLU over the fc1 output; no batch normalization is applied
        #x = F.relu(self.bn2(self.fc2(x)))  # disabled: second linear layer with batch normalization
        #x = self.out(x)  # disabled: output layer
        return x
 def main():
    """Train the model and record the run in MLflow.

    The number of epochs is read from the first command-line argument. The
    training and validation data come from forest_train.csv and
    forest_val.csv, and the trained weights are written to model.pth.
    The epoch count is logged as a parameter; training loss, validation loss
    and validation accuracy are logged as metrics once per epoch.
    """
    epochs = int(sys.argv[1])
    forest_train = pd.read_csv('forest_train.csv')
    forest_val = pd.read_csv('forest_val.csv')
    print(forest_train.head())
    X_train = forest_train.drop(columns=['Cover_Type']).values
    y_train = forest_train['Cover_Type'].values
    X_val = forest_val.drop(columns=['Cover_Type']).values
    y_val = forest_val['Cover_Type'].values
    # Initialize model, loss function, and optimizer
    model = Model().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Convert to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
    X_val = torch.tensor(X_val, dtype=torch.float32).to(device)
    y_val = torch.tensor(y_val, dtype=torch.long).to(device)
    # Create DataLoader
    train_loader = DataLoader(list(zip(X_train, y_train)), batch_size=64, shuffle=True)
    val_loader = DataLoader(list(zip(X_val, y_val)), batch_size=64)
    with mlflow.start_run() as run:
        # Log the parameter up front so it is recorded even if training fails.
        mlflow.log_param("epochs", epochs)
        # Training loop
        for epoch in range(epochs):
            model.train()  # Set model to training mode
            running_loss = 0.0
            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                # Weight the batch loss by batch size so the epoch value is a per-sample mean.
                running_loss += loss.item() * inputs.size(0)
            # Calculate training loss
            epoch_loss = running_loss / len(train_loader.dataset)
            # Validation
            model.eval()  # Set model to evaluation mode
            val_running_loss = 0.0
            correct = 0
            total = 0
            with torch.no_grad():
                for inputs, labels in val_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = model(inputs)
                    val_loss = criterion(outputs, labels)
                    val_running_loss += val_loss.item() * inputs.size(0)
                    _, predicted = torch.max(outputs, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
            # Calculate validation loss and accuracy
            val_epoch_loss = val_running_loss / len(val_loader.dataset)
            val_accuracy = correct / total
            print(f"Epoch {epoch+1}/{epochs}, "
                f"Train Loss: {epoch_loss:.4f}, "
                f"Val Loss: {val_epoch_loss:.4f}, "
                f"Val Accuracy: {val_accuracy:.4f}")
            # Record the per-epoch values in the tracked run, not only on stdout.
            mlflow.log_metric("train_loss", epoch_loss, step=epoch)
            mlflow.log_metric("val_loss", val_epoch_loss, step=epoch)
            mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)
        torch.save(model.state_dict(), 'model.pth')
 if __name__ == "__main__":
    main()
--- a/mlflow/mlflow_prediction.py
+++ b/mlflow/mlflow_prediction.py
@ -0,0 +1,95 @@
 import torch
 import torch.nn as nn
 import torch.optim as optim
 from torch.utils.data import DataLoader, Dataset
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 import torch.nn.functional as F
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
 import numpy as np
 import mlflow
 import mlflow.sklearn
 mlflow.set_tracking_uri("http://localhost:5000")
 mlflow.set_experiment("s464914")
 device = (
    "cuda"
    if torch.cuda.is_available()
    else "cpu"
 )
 class Model(nn.Module):
    """Classifier whose forward pass is a single linear layer followed by ReLU.

    The constructor also builds `bn1`, `fc2`, `bn2` and `out`, but `forward`
    never calls them.
    """
    def __init__(self, input_features=54, hidden_layer1=25, hidden_layer2=30, output_features=8):
        super().__init__()
        # fc1 maps the input features straight to the output size.
        self.fc1 = nn.Linear(input_features,output_features)
        self.bn1 = nn.BatchNorm1d(hidden_layer1)  # constructed but unused in forward()
        self.fc2 = nn.Linear(hidden_layer1, hidden_layer2)  # constructed but unused in forward()
        self.bn2 = nn.BatchNorm1d(hidden_layer2)  # constructed but unused in forward()
        self.out = nn.Linear(hidden_layer2, output_features)  # constructed but unused in forward()
    def forward(self, x):
        # ReLU over the fc1 output is the whole forward pass.
        x = F.relu(self.fc1(x)) 
        return x
 def load_model(model, model_path):
    """Load saved weights into `model` in place and switch it to evaluation mode.

    Args:
        model: module whose parameters match the saved state dict.
        model_path: path of a file written with `torch.save(state_dict, ...)`.
    """
    # Deserialize onto the CPU so a checkpoint saved from a CUDA device can be
    # read on a CPU-only host; load_state_dict then copies the values into the
    # model's existing parameters, whatever device they are on.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()
 def predict(model, input_data):
    """Return the index of the largest score the model produces for one input.

    The model is called with gradient tracking disabled, and the maximum is
    taken along dimension 0 of its output.
    """
    with torch.no_grad():
        scores = model(input_data)
    best = torch.max(scores, 0)
    # torch.max over a dimension yields (values, indices); the index is the label.
    return best[1].item()
 def main():
    """Evaluate the saved model on forest_test.csv inside an MLflow run.

    Writes one "predicted: <p> true_label: <t>" line per test row to
    predictions.txt and logs accuracy, micro-averaged precision/recall/F1
    and RMSE as MLflow metrics.
    """
    with mlflow.start_run() as run:
        forest_test = pd.read_csv('forest_test.csv')
        X_test = forest_test.drop(columns=['Cover_Type']).values
        y_test = forest_test['Cover_Type'].values
        X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
        model = Model().to(device)
        model_path = 'model.pth'  # Path to your saved model file
        load_model(model, model_path)
        # Keep numeric labels in two parallel lists: the metric functions need
        # one true label and one predicted label per sample.
        predicted_labels = []
        true_labels = []
        with torch.no_grad():
            for input_data, target in zip(X_test, y_test):
                output = model(input_data)
                _, predicted_class = torch.max(output, 0)
                predicted_labels.append(predicted_class.item())
                true_labels.append(int(target))
        with open(r'predictions.txt', 'w') as fp:
            for predicted, target in zip(predicted_labels, true_labels):
                # write each item on a new line
                fp.write("predicted: %s true_label: %s\n" % (predicted, target))
        accuracy = accuracy_score(true_labels, predicted_labels)
        precision_micro = precision_score(true_labels, predicted_labels, average='micro')
        recall_micro = recall_score(true_labels, predicted_labels, average='micro')
        f1_micro = f1_score(true_labels, predicted_labels, average='micro')
        rmse = np.sqrt(mean_squared_error(true_labels, predicted_labels))
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision_micro", precision_micro)
        mlflow.log_metric("recall_micro", recall_micro)
        mlflow.log_metric("f1_micro", f1_micro)
        mlflow.log_metric("rmse", rmse)
 if __name__ == "__main__":
    main()
--- a/model.py
+++ b/model.py
@ -6,7 +6,6 @@ import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 import torch.nn.functional as F
 import sys
 device = (
@ -31,9 +30,6 @@ class Model(nn.Module):
        return x
 def main():
    epochs = int(sys.argv[1])
    print(epochs)
    forest_train = pd.read_csv('forest_train.csv')
    forest_val = pd.read_csv('forest_val.csv')
@ -63,6 +59,7 @@ def main():
    val_loader = DataLoader(list(zip(X_val, y_val)), batch_size=64)
    # Training loop
    epochs = 10
    for epoch in range(epochs):
        model.train()  # Set model to training mode
        running_loss = 0.0
--- a/plot.py
+++ b/plot.py
@ -1,21 +0,0 @@
 import matplotlib.pyplot as plt
 import numpy as np
 # Collect every "Accuracy: <value>" line from metrics.txt and plot the values
 # in file order, one point per entry.
 accuracy = []
 # Use a context manager so the file handle is closed once parsing finishes.
 with open("metrics.txt", "r") as f:
    for line in f:
        parts = line.strip().split(' ')
        if parts[0] == 'Accuracy:':
            accuracy.append(float(parts[1]))
 build_numbers = np.arange(1, len(accuracy) + 1)
 plt.plot(build_numbers, accuracy,  marker='o', linestyle='-', color='b')
 plt.xlabel('Build Number')
 plt.ylabel('Accuracy')
 plt.title('Accuracy Plot')
 plt.grid(True)
 # Save before show(): with an interactive backend the figure can be discarded
 # when the window is closed, which would leave an empty accuracy.png.
 plt.savefig('accuracy.png')
 plt.show()
--- a/prediction.py
+++ b/prediction.py
@ -6,8 +6,6 @@ import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 import torch.nn.functional as F
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
 import numpy as np
 device = (
    "cuda"
@ -43,6 +41,7 @@ def predict(model, input_data):
    return predicted_class.item()  # Return the predicted class label
 def main():
    forest_test = pd.read_csv('forest_test.csv')
@ -56,23 +55,15 @@ def main():
    load_model(model, model_path)
    predictions = []
-    correct = 0
+    for input_data in X_test:
-    total = 0
+        predicted_class = predict(model, input_data)
-    with torch.no_grad():
+        predictions.append(predicted_class)
        for input_data, target in zip(X_test, y_test):
            output = model(input_data)
            _, predicted_class = torch.max(output, 0)
            prediction_entry = f"predicted: {predicted_class.item()} true_label: {target}"
            predictions.append(prediction_entry)
            total += 1
            if predicted_class.item() == target:
                correct += 1
    with open(r'predictions.txt', 'w') as fp:
        for item in predictions:
            # write each item on a new line
            fp.write("%s\n" % item)
 if __name__ == "__main__":
    main()
--- a/predictions.txt
+++ b/predictions.txt
--- a/sacred_model.py
+++ b/sacred_model.py
@ -0,0 +1,126 @@
 import torch
 import torch.nn as nn
 import torch.optim as optim
 from torch.utils.data import DataLoader, Dataset
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 import torch.nn.functional as F
 from sacred import Experiment
 from sacred.observers import FileStorageObserver, MongoObserver
 device = (
    "cuda"
    if torch.cuda.is_available()
    else "cpu"
 )
 # Sacred experiment object; the decorated functions in this file attach to it.
 ex = Experiment("464914", interactive=True, save_git_info=False)
 # Record every run under the local 'experiments' directory.
 ex.observers.append(FileStorageObserver('experiments'))
 # Also record every run in the 'sacred' MongoDB database.
 # NOTE(review): the MongoDB user name and password are embedded in this URL in
 # plain text and are therefore committed with the source - consider reading
 # them from the environment instead.
 ex.observers.append(MongoObserver(url='mongodb://admin:IUM_2021@tzietkiewicz.vm.wmi.amu.edu.pl:27017',
                                   db_name='sacred')) 
 class Model(nn.Module):
    """Classifier whose forward pass is a single linear layer followed by ReLU.

    The constructor also builds `bn1`, `fc2`, `bn2` and `out`, but `forward`
    never calls them.
    """
    def __init__(self, input_features=54, hidden_layer1=25, hidden_layer2=30, output_features=8):
        super().__init__()
        # fc1 maps the input features straight to the output size.
        self.fc1 = nn.Linear(input_features,output_features)
        self.bn1 = nn.BatchNorm1d(hidden_layer1)  # constructed but unused in forward()
        self.fc2 = nn.Linear(hidden_layer1, hidden_layer2)  # constructed but unused in forward()
        self.bn2 = nn.BatchNorm1d(hidden_layer2)  # constructed but unused in forward()
        self.out = nn.Linear(hidden_layer2, output_features)  # constructed but unused in forward()
    def forward(self, x):
        x = F.relu(self.fc1(x))  # ReLU over the fc1 output; no batch normalization is applied
        #x = F.relu(self.bn2(self.fc2(x)))  # disabled: second linear layer with batch normalization
        #x = self.out(x)  # disabled: output layer
        return x
@ex.capture
 def capture_params(epochs):
    """Print the number of epochs; `main` calls this with the value passed explicitly."""
    print(f"epochs: {epochs}")
@ex.main
 def main(_run):
    forest_train_ex = ex.open_resource('forest_train.csv')
    forest_val_ex = ex.open_resource('forest_val.csv')
    forest_val = pd.read_csv('forest_val.csv')
    forest_train = pd.read_csv('forest_train.csv')
    X_train = forest_train.drop(columns=['Cover_Type']).values
    y_train = forest_train['Cover_Type'].values
    X_val = forest_val.drop(columns=['Cover_Type']).values
    y_val = forest_val['Cover_Type'].values
    # Initialize model, loss function, and optimizer
    model = Model().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Convert to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
    X_val = torch.tensor(X_val, dtype=torch.float32).to(device)
    y_val = torch.tensor(y_val, dtype=torch.long).to(device)
    # Create DataLoader
    train_loader = DataLoader(list(zip(X_train, y_train)), batch_size=64, shuffle=True)
    val_loader = DataLoader(list(zip(X_val, y_val)), batch_size=64)
    # Training loop
    epochs = 10
    for epoch in range(epochs):
        model.train()  # Set model to training mode
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        # Calculate training loss
        epoch_loss = running_loss / len(train_loader.dataset)
        # Validation
        model.eval()  # Set model to evaluation mode
        val_running_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                val_loss = criterion(outputs, labels)
                val_running_loss += val_loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        # Calculate validation loss and accuracy
        val_epoch_loss = val_running_loss / len(val_loader.dataset)
        val_accuracy = correct / total
        print(f"Epoch {epoch+1}/{epochs}, "
              f"Train Loss: {epoch_loss:.4f}, "
              f"Val Loss: {val_epoch_loss:.4f}, "
              f"Val Accuracy: {val_accuracy:.4f}")
        _run.log_scalar("train loss", epoch_loss)
        _run.log_scalar("val loss", val_epoch_loss)
    capture_params(epochs)
    torch.save(model.state_dict(), 'model.pth')
    ex.add_artifact("model.pth")
 ex.run()
--- a/sacredboard/Dockerfile
+++ b/sacredboard/Dockerfile
@ -0,0 +1,5 @@
 FROM python:3.6-jessie
 RUN pip install https://github.com/chovanecm/sacredboard/archive/develop.zip
 ENTRYPOINT sacredboard -mu mongodb://$MONGO_INITDB_ROOT_USERNAME:$MONGO_INITDB_ROOT_PASSWORD@mongo:27017/?authMechanism=SCRAM-SHA-1 $MONGO_DATABASE
Author	SHA1	Message	Date
Alicja Szulecka	40d0c3e849	IUM_9	2024-05-28 21:52:31 +02:00
Alicja Szulecka	0f254aa5fa	Create dvc.yaml	2024-05-28 21:43:31 +02:00
Alicja Szulecka	abb213675e	Update config	2024-05-28 21:34:33 +02:00
Alicja Szulecka	3cbfc6aca1	Delete IUM_2-checkpoint.ipynb	2024-05-28 21:26:27 +02:00
Alicja Szulecka	80ebb3c0da	dvc	2024-05-28 21:22:43 +02:00
Alicja Szulecka	281c3c6a86	sacredboard	2024-05-28 21:22:26 +02:00
Alicja Szulecka	ae632b1ea3	stop tracking covtype.csv	2024-05-28 21:20:04 +02:00
Alicja Szulecka	7309d49e67	Delete conda,yaml	2024-05-06 22:08:57 +02:00
Alicja Szulecka	c4ce89938c	mlflow	2024-05-06 22:08:05 +02:00
Alicja Szulecka	ed9927d7a1	mlflow	2024-05-06 17:27:28 +02:00
Alicja Szulecka	8ab682be76	Update Dockerfile	2024-05-05 14:12:55 +02:00
Alicja Szulecka	7ff2f9711e	sacred	2024-05-05 14:07:27 +02:00