Compare commits

...

No commits in common. "training_and_evaluation" and "master" have entirely different histories.

30 changed files with 3490 additions and 475 deletions

.dvc/.gitignore vendored Normal file
@@ -0,0 +1,3 @@
/config.local
/tmp
/cache

.dvc/config Normal file
@@ -0,0 +1,6 @@
[core]
    remote = ium_ssh_remote
['remote "my_local_remote"']
    url = /dvcstore
['remote "ium_ssh_remote"']
    url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl
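No credentials for the SSH remote are committed; DVC stores them in .dvc/config.local, which the .dvc/.gitignore above excludes. A minimal sketch of the local setup, mirroring what dvc.Jenkinsfile does further down (the key path is a placeholder):

dvc remote modify --local ium_ssh_remote keyfile ~/.ssh/ium_sftp_key
dvc pull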

.dvcignore Normal file
@@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

.gitignore vendored Normal file
@@ -0,0 +1,19 @@
# ---> JupyterNotebooks
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/
.ipynb_checkpoints
*/.ipynb_checkpoints/*
# IPython
profile_default/
ipython_config.py
# Remove previous ipynb_checkpoints
# git rm -r .ipynb_checkpoints/
/X_train.csv
/X_test.csv
/y_train.csv
/y_test.csv
/model.pth

Dockerfile
@@ -1,4 +1,5 @@
 FROM ubuntu:latest
 RUN apt update && apt install -y python3-pip
 RUN pip3 install pandas
 RUN pip3 install sklearn
@@ -6,11 +7,14 @@ RUN pip3 install seaborn
 RUN pip3 install ipython
 RUN pip3 install torch
 RUN pip3 install numpy
-RUN pip3 install mlflow
+RUN pip3 install dvc
+RUN pip3 install dvc[ssh] paramiko
+RUN apt-get install unzip
 WORKDIR /app
-COPY ./training.py ./
-COPY ./training_mlflow.py ./
-COPY ./evaluation.py ./
-COPY ./predict_444501.py ./
+COPY ./body-performance-data.zip ./
+COPY ./prepare_datasets.py ./
+COPY ./train.py ./

Jenkinsfile vendored Normal file
@@ -0,0 +1,18 @@
pipeline {
agent {
dockerfile true
}
stages {
stage('Check out from version control') {
steps {
checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444421', url: 'https://git.wmi.amu.edu.pl/s444421/ium_444421.git']]])
}
}
stage('Shell Script') {
steps {
sh 'ipython ./prepare_datasets.py'
archiveArtifacts artifacts: 'X_train.csv, X_test.csv, y_train.csv, y_test.csv'
}
}
}
}

Jenkinsfile2 Normal file
@@ -0,0 +1,18 @@
pipeline {
agent {
docker {image 'agakul/ium:4.0'}
}
stages {
stage('Check out from version control') {
steps {
checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444421', url: 'https://git.wmi.amu.edu.pl/s444421/ium_444421.git']]])
}
}
stage('Shell Script') {
steps {
sh 'ipython ./prepare_datasets.py'
archiveArtifacts artifacts: 'X_train.csv, X_test.csv, y_train.csv, y_test.csv'
}
}
}
}

MLproject
@@ -1,12 +0,0 @@
name: s444421
docker_env:
  image: agakul/ium:mlflow
entry_points:
  main:
    parameters:
      epochs: {type: float, default: 1000}
    command: "python training_mlflow.py {epochs}"
  test:
    command: "python evaluation.py"
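Both entry points were meant to be driven through the MLflow CLI; typical invocations would have looked like this (the epoch count is illustrative):

mlflow run . -P epochs=500
mlflow run . -e test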

README.md Normal file
@@ -0,0 +1,2 @@
# ium_444421

body-performance-data.zip Normal file
Binary file not shown.

classification_net.ipynb Normal file
@@ -0,0 +1,531 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "forty-fault",
"metadata": {},
"outputs": [],
"source": [
"!kaggle datasets download -d kukuroo3/body-performance-data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "pediatric-tuesday",
"metadata": {},
"outputs": [],
"source": [
"!unzip -o body-performance-data.zip"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "interstate-presence",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import classification_report\n",
"import torch\n",
"from torch import nn, optim\n",
"import torch.nn.functional as F"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "structural-trigger",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(13393, 12)"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('bodyPerformance.csv')\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 116,
"id": "turkish-category",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>gender</th>\n",
" <th>height_cm</th>\n",
" <th>weight_kg</th>\n",
" <th>body fat_%</th>\n",
" <th>diastolic</th>\n",
" <th>systolic</th>\n",
" <th>gripForce</th>\n",
" <th>sit and bend forward_cm</th>\n",
" <th>sit-ups counts</th>\n",
" <th>broad jump_cm</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>27.0</td>\n",
" <td>M</td>\n",
" <td>172.3</td>\n",
" <td>75.24</td>\n",
" <td>21.3</td>\n",
" <td>80.0</td>\n",
" <td>130.0</td>\n",
" <td>54.9</td>\n",
" <td>18.4</td>\n",
" <td>60.0</td>\n",
" <td>217.0</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>25.0</td>\n",
" <td>M</td>\n",
" <td>165.0</td>\n",
" <td>55.80</td>\n",
" <td>15.7</td>\n",
" <td>77.0</td>\n",
" <td>126.0</td>\n",
" <td>36.4</td>\n",
" <td>16.3</td>\n",
" <td>53.0</td>\n",
" <td>229.0</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>31.0</td>\n",
" <td>M</td>\n",
" <td>179.6</td>\n",
" <td>78.00</td>\n",
" <td>20.1</td>\n",
" <td>92.0</td>\n",
" <td>152.0</td>\n",
" <td>44.8</td>\n",
" <td>12.0</td>\n",
" <td>49.0</td>\n",
" <td>181.0</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>32.0</td>\n",
" <td>M</td>\n",
" <td>174.5</td>\n",
" <td>71.10</td>\n",
" <td>18.4</td>\n",
" <td>76.0</td>\n",
" <td>147.0</td>\n",
" <td>41.4</td>\n",
" <td>15.2</td>\n",
" <td>53.0</td>\n",
" <td>219.0</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>28.0</td>\n",
" <td>M</td>\n",
" <td>173.8</td>\n",
" <td>67.70</td>\n",
" <td>17.1</td>\n",
" <td>70.0</td>\n",
" <td>127.0</td>\n",
" <td>43.5</td>\n",
" <td>27.1</td>\n",
" <td>45.0</td>\n",
" <td>217.0</td>\n",
" <td>B</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age gender height_cm weight_kg body fat_% diastolic systolic \\\n",
"0 27.0 M 172.3 75.24 21.3 80.0 130.0 \n",
"1 25.0 M 165.0 55.80 15.7 77.0 126.0 \n",
"2 31.0 M 179.6 78.00 20.1 92.0 152.0 \n",
"3 32.0 M 174.5 71.10 18.4 76.0 147.0 \n",
"4 28.0 M 173.8 67.70 17.1 70.0 127.0 \n",
"\n",
" gripForce sit and bend forward_cm sit-ups counts broad jump_cm class \n",
"0 54.9 18.4 60.0 217.0 C \n",
"1 36.4 16.3 53.0 229.0 A \n",
"2 44.8 12.0 49.0 181.0 C \n",
"3 41.4 15.2 53.0 219.0 B \n",
"4 43.5 27.1 45.0 217.0 B "
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 117,
"id": "received-absence",
"metadata": {},
"outputs": [],
"source": [
"cols = ['gender', 'height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']\n",
"df = df[cols]\n",
"\n",
"# male - 0, female - 1\n",
"df['gender'].replace({'M': 0, 'F': 1}, inplace = True)\n",
"df = df.dropna(how='any')"
]
},
{
"cell_type": "code",
"execution_count": 118,
"id": "excited-parent",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.632196\n",
"1 0.367804\n",
"Name: gender, dtype: float64"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.gender.value_counts() / df.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 119,
"id": "extended-cinema",
"metadata": {},
"outputs": [],
"source": [
"X = df[['height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']]\n",
"y = df[['gender']]\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"id": "animated-farming",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([10714, 5]) torch.Size([10714])\n",
"torch.Size([2679, 5]) torch.Size([2679])\n"
]
}
],
"source": [
"X_train = torch.from_numpy(np.array(X_train)).float()\n",
"y_train = torch.squeeze(torch.from_numpy(y_train.values).float())\n",
"\n",
"X_test = torch.from_numpy(np.array(X_test)).float()\n",
"y_test = torch.squeeze(torch.from_numpy(y_test.values).float())\n",
"\n",
"print(X_train.shape, y_train.shape)\n",
"print(X_test.shape, y_test.shape)"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "technical-wallet",
"metadata": {},
"outputs": [],
"source": [
"class Net(nn.Module):\n",
" def __init__(self, n_features):\n",
" super(Net, self).__init__()\n",
" self.fc1 = nn.Linear(n_features, 5)\n",
" self.fc2 = nn.Linear(5, 3)\n",
" self.fc3 = nn.Linear(3, 1)\n",
" def forward(self, x):\n",
" x = F.relu(self.fc1(x))\n",
" x = F.relu(self.fc2(x))\n",
" return torch.sigmoid(self.fc3(x))\n",
"net = Net(X_train.shape[1])"
]
},
{
"cell_type": "code",
"execution_count": 122,
"id": "requested-plymouth",
"metadata": {},
"outputs": [],
"source": [
"criterion = nn.BCELoss()"
]
},
{
"cell_type": "code",
"execution_count": 123,
"id": "iraqi-english",
"metadata": {},
"outputs": [],
"source": [
"optimizer = optim.Adam(net.parameters(), lr=0.001)"
]
},
{
"cell_type": "code",
"execution_count": 124,
"id": "emerging-helmet",
"metadata": {},
"outputs": [],
"source": [
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")"
]
},
{
"cell_type": "code",
"execution_count": 125,
"id": "differential-aviation",
"metadata": {},
"outputs": [],
"source": [
"X_train = X_train.to(device)\n",
"y_train = y_train.to(device)\n",
"X_test = X_test.to(device)\n",
"y_test = y_test.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 126,
"id": "ranging-calgary",
"metadata": {},
"outputs": [],
"source": [
"net = net.to(device)\n",
"criterion = criterion.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 127,
"id": "iraqi-blanket",
"metadata": {},
"outputs": [],
"source": [
"def calculate_accuracy(y_true, y_pred):\n",
" predicted = y_pred.ge(.5).view(-1)\n",
" return (y_true == predicted).sum().float() / len(y_true)"
]
},
{
"cell_type": "code",
"execution_count": 128,
"id": "robust-serbia",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 0\n",
"Train set - loss: 1.005, accuracy: 0.37\n",
"Test set - loss: 1.018, accuracy: 0.358\n",
"\n",
"epoch 100\n",
"Train set - loss: 0.677, accuracy: 0.743\n",
"Test set - loss: 0.679, accuracy: 0.727\n",
"\n",
"epoch 200\n",
"Train set - loss: 0.636, accuracy: 0.79\n",
"Test set - loss: 0.64, accuracy: 0.778\n",
"\n",
"epoch 300\n",
"Train set - loss: 0.568, accuracy: 0.839\n",
"Test set - loss: 0.577, accuracy: 0.833\n",
"\n",
"epoch 400\n",
"Train set - loss: 0.504, accuracy: 0.885\n",
"Test set - loss: 0.514, accuracy: 0.877\n",
"\n",
"epoch 500\n",
"Train set - loss: 0.441, accuracy: 0.922\n",
"Test set - loss: 0.45, accuracy: 0.913\n",
"\n",
"epoch 600\n",
"Train set - loss: 0.388, accuracy: 0.944\n",
"Test set - loss: 0.396, accuracy: 0.938\n",
"\n",
"epoch 700\n",
"Train set - loss: 0.353, accuracy: 0.954\n",
"Test set - loss: 0.359, accuracy: 0.949\n",
"\n",
"epoch 800\n",
"Train set - loss: 0.327, accuracy: 0.958\n",
"Test set - loss: 0.333, accuracy: 0.953\n",
"\n",
"epoch 900\n",
"Train set - loss: 0.306, accuracy: 0.961\n",
"Test set - loss: 0.312, accuracy: 0.955\n",
"\n"
]
}
],
"source": [
"def round_tensor(t, decimal_places=3):\n",
" return round(t.item(), decimal_places)\n",
"for epoch in range(1000):\n",
" y_pred = net(X_train)\n",
" y_pred = torch.squeeze(y_pred)\n",
" train_loss = criterion(y_pred, y_train)\n",
" if epoch % 100 == 0:\n",
" train_acc = calculate_accuracy(y_train, y_pred)\n",
" y_test_pred = net(X_test)\n",
" y_test_pred = torch.squeeze(y_test_pred)\n",
" test_loss = criterion(y_test_pred, y_test)\n",
" test_acc = calculate_accuracy(y_test, y_test_pred)\n",
" print(\n",
"f'''epoch {epoch}\n",
"Train set - loss: {round_tensor(train_loss)}, accuracy: {round_tensor(train_acc)}\n",
"Test set - loss: {round_tensor(test_loss)}, accuracy: {round_tensor(test_acc)}\n",
"''')\n",
" optimizer.zero_grad()\n",
" train_loss.backward()\n",
" optimizer.step()"
]
},
{
"cell_type": "code",
"execution_count": 129,
"id": "optimum-excerpt",
"metadata": {},
"outputs": [],
"source": [
"# torch.save(net, 'model.pth')"
]
},
{
"cell_type": "code",
"execution_count": 130,
"id": "dental-seating",
"metadata": {},
"outputs": [],
"source": [
"# net = torch.load('model.pth')"
]
},
{
"cell_type": "code",
"execution_count": 131,
"id": "german-satisfaction",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" Male 0.97 0.96 0.96 1720\n",
" Female 0.93 0.94 0.94 959\n",
"\n",
" accuracy 0.95 2679\n",
" macro avg 0.95 0.95 0.95 2679\n",
"weighted avg 0.95 0.95 0.95 2679\n",
"\n"
]
}
],
"source": [
"classes = ['Male', 'Female']\n",
"y_pred = net(X_test)\n",
"y_pred = y_pred.ge(.5).view(-1).cpu()\n",
"y_test = y_test.cpu()\n",
"print(classification_report(y_test, y_pred, target_names=classes))"
]
},
{
"cell_type": "code",
"execution_count": 132,
"id": "british-incidence",
"metadata": {},
"outputs": [],
"source": [
"with open('test_out.csv', 'w') as file:\n",
" for y in y_pred:\n",
" file.write(classes[y.item()])\n",
" file.write('\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

data/.gitignore vendored Normal file
@@ -0,0 +1 @@
/bodyPerformance.csv

data/bodyPerformance.csv.dvc Normal file
@@ -0,0 +1,4 @@
outs:
- md5: 6d7c3e3d110fac2ade9d8bce60238208
  size: 761835
  path: bodyPerformance.csv

download_data.sh Normal file
@@ -0,0 +1,3 @@
#kaggle datasets download -d tejashvi14/travel-insurance-prediction-data
unzip -o travel-insurance-prediction-data.zip
head -n $CUTOFF TravelInsurancePrediction.csv > travel_insurance_data.txt
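CUTOFF is expected to come from the environment (in these pipelines, typically a Jenkins build parameter); a hypothetical local run:

CUTOFF=1000 bash download_data.sh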

dvc.Jenkinsfile Normal file
@@ -0,0 +1,17 @@
pipeline {
agent {
dockerfile true
}
stages {
stage('Dvc pull and reproduce') {
steps {
checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444421', url: 'https://git.wmi.amu.edu.pl/s444421/ium_444421.git']]])
withCredentials(
[sshUserPrivateKey(credentialsId: '48ac7004-216e-4260-abba-1fe5db753e18', keyFileVariable: 'IUM_SFTP_KEY', passphraseVariable: '', usernameVariable: 'USER')]) {
sh 'dvc remote modify --local ium_ssh_remote keyfile $IUM_SFTP_KEY'
sh 'dvc pull'
sh 'dvc repro'
}
}
}
}
}

dvc.yaml Normal file
@@ -0,0 +1,10 @@
stages:
  prepare_datasets:
    cmd: python3 prepare_datasets.py
    deps:
    - data/bodyPerformance.csv
    - prepare_datasets.py
  train:
    cmd: python3 train.py
    deps:
    - train.py
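As written, neither stage declares outputs, so dvc repro cannot cache stage results, and train does not depend on the CSVs that prepare_datasets produces. A sketch of a fuller definition, assuming train.py reads the split CSVs and writes model.pth (names taken from the .gitignore above):

stages:
  prepare_datasets:
    cmd: python3 prepare_datasets.py
    deps:
    - data/bodyPerformance.csv
    - prepare_datasets.py
    outs:
    - X_train.csv
    - X_test.csv
    - y_train.csv
    - y_test.csv
  train:
    cmd: python3 train.py
    deps:
    - train.py
    - X_train.csv
    - y_train.csv
    outs:
    - model.pth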

environment.yml Normal file
@@ -0,0 +1,124 @@
name: s444421
channels:
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=5.1=1_gnu
- alembic=1.7.7=pyhd8ed1ab_0
- appdirs=1.4.4=pyh9f0ad1d_0
- asn1crypto=1.5.1=pyhd8ed1ab_0
- blas=1.0=mkl
- bottleneck=1.3.4=py39hce1f21e_0
- brotlipy=0.7.0=py39hb9d737c_1004
- ca-certificates=2022.5.18.1=ha878542_0
- certifi=2022.5.18.1=py39hf3d152e_0
- cffi=1.15.0=py39hd667e15_1
- charset-normalizer=2.0.12=pyhd8ed1ab_0
- click=8.1.3=py39hf3d152e_0
- cloudpickle=2.1.0=pyhd8ed1ab_0
- configparser=5.2.0=pyhd8ed1ab_0
- cryptography=37.0.2=py39hd97740a_0
- cycler=0.11.0=pyhd8ed1ab_0
- databricks-cli=0.12.1=pyhd8ed1ab_0
- docker-py=5.0.3=py39hf3d152e_2
- docker-pycreds=0.4.0=py_0
- entrypoints=0.4=pyhd8ed1ab_0
- flask=2.1.2=pyhd8ed1ab_1
- freetype=2.10.4=h0708190_1
- future=0.18.2=py39hf3d152e_5
- gitdb=4.0.9=pyhd8ed1ab_0
- gitpython=3.1.27=pyhd8ed1ab_0
- greenlet=1.1.2=py39h5a03fae_2
- gunicorn=20.1.0=py39hf3d152e_2
- idna=3.3=pyhd8ed1ab_0
- importlib-metadata=4.11.3=py39hf3d152e_1
- importlib_resources=5.7.1=pyhd8ed1ab_1
- intel-openmp=2021.4.0=h06a4308_3561
- itsdangerous=2.1.2=pyhd8ed1ab_0
- jinja2=3.1.2=pyhd8ed1ab_0
- joblib=1.1.0=pyhd8ed1ab_0
- jpeg=9e=h166bdaf_1
- kiwisolver=1.4.2=py39hf939315_1
- lcms2=2.12=hddcbb42_0
- ld_impl_linux-64=2.38=h1181459_1
- libblas=3.9.0=12_linux64_mkl
- libcblas=3.9.0=12_linux64_mkl
- libffi=3.3=he6710b0_2
- libgcc-ng=11.2.0=h1234567_0
- libgfortran-ng=12.1.0=h69a702a_16
- libgfortran5=12.1.0=hdcd56e2_16
- libgomp=11.2.0=h1234567_0
- liblapack=3.9.0=12_linux64_mkl
- libpng=1.6.37=h21135ba_2
- libprotobuf=3.19.1=h4ff587b_0
- libstdcxx-ng=11.2.0=h1234567_0
- libtiff=4.2.0=h85742a9_0
- libwebp-base=1.2.2=h7f98852_1
- lz4-c=1.9.3=h9c3ff4c_1
- mako=1.2.0=pyhd8ed1ab_1
- markupsafe=2.1.1=py39hb9d737c_1
- matplotlib-base=3.4.3=py39h2fa2bec_2
- mkl=2021.4.0=h06a4308_640
- mkl-service=2.4.0=py39h7f8727e_0
- mkl_fft=1.3.1=py39hd3c417c_0
- mkl_random=1.2.2=py39h51133e4_0
- mlflow=1.26.0=py39ha39b057_0
- ncurses=6.3=h7f8727e_2
- ninja=1.11.0=h924138e_0
- numexpr=2.8.1=py39h6abb31d_0
- numpy=1.22.3=py39he7a7128_0
- numpy-base=1.22.3=py39hf524024_0
- olefile=0.46=pyh9f0ad1d_1
- openssl=1.1.1o=h166bdaf_0
- packaging=21.3=pyhd3eb1b0_0
- pandas=1.4.2=py39h295c915_0
- patsy=0.5.2=pyhd8ed1ab_0
- pillow=7.2.0=py39h6f3857e_2
- pip=21.2.4=py39h06a4308_0
- prometheus_client=0.14.1=pyhd8ed1ab_0
- prometheus_flask_exporter=0.20.1=pyhd8ed1ab_0
- protobuf=3.19.1=py39h295c915_0
- pycparser=2.21=pyhd8ed1ab_0
- pyopenssl=22.0.0=pyhd8ed1ab_0
- pyparsing=3.0.4=pyhd3eb1b0_0
- pysocks=1.7.1=py39hf3d152e_5
- python=3.9.12=h12debd9_0
- python-dateutil=2.8.2=pyhd3eb1b0_0
- python_abi=3.9=2_cp39
- pytorch=1.10.0=cpu_py39hc70245e_1
- pytz=2021.3=pyhd3eb1b0_0
- pyyaml=6.0=py39hb9d737c_4
- querystring_parser=1.2.4=py_0
- readline=8.1.2=h7f8727e_1
- requests=2.27.1=pyhd8ed1ab_0
- scikit-learn=1.1.1=py39h4037b75_0
- scipy=1.8.0=py39hee8e79c_1
- seaborn=0.11.2=hd8ed1ab_0
- seaborn-base=0.11.2=pyhd8ed1ab_0
- setuptools=61.2.0=py39h06a4308_0
- six=1.16.0=pyhd3eb1b0_1
- sleef=3.5.1=h9b69904_2
- smmap=3.0.5=pyh44b312d_0
- sqlalchemy=1.4.36=py39hb9d737c_0
- sqlite=3.38.3=hc218d9a_0
- sqlparse=0.4.2=pyhd8ed1ab_0
- statsmodels=0.13.2=py39hce5d2b2_0
- tabulate=0.8.9=pyhd8ed1ab_0
- tenacity=8.0.1=pyhd8ed1ab_0
- threadpoolctl=3.1.0=pyh8a188c0_0
- tk=8.6.11=h1ccaba5_1
- tornado=6.1=py39hb9d737c_3
- typing_extensions=4.2.0=pyha770c72_1
- tzdata=2022a=hda174b7_0
- urllib3=1.26.9=pyhd8ed1ab_0
- websocket-client=1.3.2=pyhd8ed1ab_0
- werkzeug=2.1.2=pyhd8ed1ab_1
- wheel=0.37.1=pyhd3eb1b0_0
- xz=5.2.5=h7f8727e_1
- yaml=0.2.5=h7f98852_2
- zipp=3.8.0=pyhd8ed1ab_0
- zlib=1.2.12=h7f8727e_2
- zstd=1.4.9=ha95c52a_0
prefix: /home/agata/anaconda3/envs/s444421

@@ -1,41 +0,0 @@
def ACC = ''
pipeline {
agent {
dockerfile true
}
parameters {
gitParameter branchFilter: 'origin/(.*)', defaultValue: 'training_and_evaluation', name: 'BRANCH', type: 'PT_BRANCH'
buildSelector(
defaultSelector: lastSuccessful(),
description: 'Which build to use for copying artifacts',
name: 'BUILD_SELECTOR'
)
}
stages {
stage('Stage 1') {
steps {
git branch: "${params.BRANCH}", url: 'https://git.wmi.amu.edu.pl/s444421/ium_444421.git'
copyArtifacts filter: '*', projectName:'s444421-create-dataset', selector: buildParameter('BUILD_SELECTOR')
copyArtifacts filter: '*', projectName:'s444421-training/${BRANCH}/', selector: buildParameter('BUILD_SELECTOR')
copyArtifacts filter: '*', projectName:'s444421-evaluation/training_and_evaluation', optional: true
sh 'ipython ./evaluation.py'
archiveArtifacts artifacts: 'build_accuracy.txt, builds_accuracy.jpg'
}
}
}
post {
success {
emailext body: 'SUCCESS', subject: 's444421-evaluation status', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
}
failure {
emailext body: 'FAILURE', subject: 's444421-evaluation status', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
}
aborted {
emailext body: 'ABORTED', subject: 's444421-evaluation status', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
}
changed {
emailext body: 'CHANGED', subject: 's444421-evaluation status', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
}
}
}

evaluation.py
@@ -1,89 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import torch
from torch import nn, optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
# In[ ]:
class Net(nn.Module):
    def __init__(self, n_features):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(n_features, 5)
        self.fc2 = nn.Linear(5, 3)
        self.fc3 = nn.Linear(3, 1)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x))
# In[ ]:
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')
# In[ ]:
X_test = torch.from_numpy(np.array(X_test)).float()
y_test = torch.squeeze(torch.from_numpy(y_test.values).float())
# In[ ]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
X_test = X_test.to(device)
y_test = y_test.to(device)
# In[ ]:
net = torch.load('model.pth')
# In[ ]:
y_pred = net(X_test)
y_pred = y_pred.ge(.5).view(-1).cpu()
y_test = y_test.cpu()
# In[ ]:
accuracy = accuracy_score(y_test, y_pred)
with open('build_accuracy.txt', 'a') as file:
    file.write(str(accuracy))
    file.write('\n')
# In[ ]:
with open('build_accuracy.txt') as file:
    acc = [float(line.rstrip()) for line in file]
builds = list(range(1, len(acc) + 1))
plt.xlabel('build')
plt.ylabel('accuracy')
plt.plot(builds, acc, 'ro')
# save before show: show() can clear the current figure on some backends
plt.savefig('builds_accuracy.jpg')
plt.show()

get_stats.sh Normal file
@@ -0,0 +1 @@
wc -l travel_insurance_data.txt > stats.txt

@@ -1,31 +0,0 @@
pipeline {
agent {
dockerfile {
filename 'Dockerfile'
}
}
parameters {
buildSelector(
defaultSelector: lastSuccessful(),
description: 'Which build to use for copying artifacts',
name: 'BUILD_SELECTOR'
)
string(
defaultValue: '{\\"inputs\\": [[167.39999389648438, 72.18000030517578, 40.0, 21.0, 94.0], [162.3000030517578, 67.30000305175781, 18.0, 52.0, 219.0], [178.5, 90.5, 14.699999809265137, 45.0, 262.0], [180.89999389648438, 77.0999984741211, 25.399999618530273, 43.0, 224.0], [177.3000030517578, 88.4800033569336, 35.599998474121094, 18.0, 183.0]]}',
description: 'Inputs',
name: 'INPUT'
)
}
stages {
stage('Copy artifacts') {
steps {
copyArtifacts fingerprintArtifacts: true, projectName: 's444421-training/training_and_evaluation', selector: buildParameter('BUILD_SELECTOR')
}
}
stage('Predict') {
steps {
sh "echo ${params.INPUT} > input_example.json"
sh "ipython ./predict_444501.py"
}
}
}
}

predict_444501.py
@@ -1,9 +0,0 @@
import json
import mlflow
import numpy as np
model = mlflow.pyfunc.load_model('mlruns/1/e435ee5c0c5a468c99eb43c13df4a94b/artifacts/s444421')
with open('input_example.json') as f:
    data = json.load(f)
# each row of 'inputs' carries the 5 features the model was trained on
y_predicted = model.predict(np.array(data['inputs']).reshape(-1, 5))
print(y_predicted[:5])
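The script reads input_example.json from the working directory; the predict pipeline above writes it from its INPUT parameter, so the file would contain, for example (first two rows of the parameter's default shown):

{"inputs": [[167.39999389648438, 72.18000030517578, 40.0, 21.0, 94.0], [162.3000030517578, 67.30000305175781, 18.0, 52.0, 219.0]]}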

preparation.ipynb Normal file
File diff suppressed because one or more lines are too long

preparation.py Normal file
@@ -0,0 +1,104 @@
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
# get_ipython().system('kaggle datasets download -d tejashvi14/travel-insurance-prediction-data')
# In[ ]:
get_ipython().system('unzip -o travel-insurance-prediction-data.zip')
# In[5]:
import pandas as pd
travel_insurance = pd.read_csv('TravelInsurancePrediction.csv', index_col=0)
travel_insurance
# In[ ]:
# drop rows with missing values
travel_insurance = travel_insurance.dropna(axis='index', how='any')
# In[6]:
# normalize the data (lowercase all string columns)
for column in travel_insurance.columns:
    if travel_insurance[column].dtype == 'object':
        travel_insurance[column] = travel_insurance[column].str.lower()
travel_insurance
# In[8]:
# split into train/dev/test subsets
import sklearn
from sklearn.model_selection import train_test_split
travel_insurance_train, travel_insurance_rest = sklearn.model_selection.train_test_split(travel_insurance, test_size=0.4, random_state=1)
travel_insurance_test, travel_insurance_dev = sklearn.model_selection.train_test_split(travel_insurance_rest, test_size=0.5, random_state=1)
# In[27]:
travel_insurance.describe(include='all')
# In[23]:
# print summary statistics for a given dataset
import seaborn as sns
def printInformation(data):
    print(f'Size (rows): {len(data)}\n')
    mean_value = data.mean(numeric_only=True)
    min_value = data.min(numeric_only=True)
    max_value = data.max(numeric_only=True)
    std_value = data.std(numeric_only=True)
    median_value = data.median(numeric_only=True)
    print(f'(mean)\n{mean_value}', f'(min)\n{min_value}', f'(max)\n{max_value}', f'(std)\n{std_value}', f'(median)\n{median_value}', sep="\n\n")
    sns.pairplot(data=data, hue="TravelInsurance")
# In[24]:
printInformation(travel_insurance)
# In[11]:
printInformation(travel_insurance_train)
# In[12]:
printInformation(travel_insurance_test)
# In[13]:
printInformation(travel_insurance_dev)
# In[ ]:

prepare_datasets.py Normal file
@@ -0,0 +1,50 @@
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
# get_ipython().system('unzip -o body-performance-data.zip')
# In[4]:
import pandas as pd
from sklearn.model_selection import train_test_split
# In[21]:
df = pd.read_csv('data/bodyPerformance.csv')
# In[22]:
cols = ['gender', 'height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']
df = df[cols]
# male - 0, female - 1
df['gender'].replace({'M': 0, 'F': 1}, inplace = True)
df = df.dropna(how='any')
# In[23]:
X = df[['height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']]
y = df[['gender']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# In[24]:
X_train.to_csv(r'X_train.csv', index=False)
X_test.to_csv(r'X_test.csv', index=False)
y_train.to_csv(r'y_train.csv', index=False)
y_test.to_csv(r'y_test.csv', index=False)

training.py → train.py Executable file → Normal file
@@ -15,12 +15,6 @@ import sys
 # In[ ]:
-epochs = int(sys.argv[1])
-# In[ ]:
 X_train = pd.read_csv('X_train.csv')
 y_train = pd.read_csv('y_train.csv')
@@ -78,7 +72,7 @@ def round_tensor(t, decimal_places=3):
     return round(t.item(), decimal_places)
-for epoch in range(epochs):
+for epoch in range(1000):
     y_pred = net(X_train)
     y_pred = torch.squeeze(y_pred)
     train_loss = criterion(y_pred, y_train)

@@ -1,37 +0,0 @@
pipeline {
agent {
dockerfile {
filename 'Dockerfile'
args '-v /mlruns:/mlruns'
}
}
options {
copyArtifactPermission('s444421-predict-s444501');
}
parameters {
buildSelector(
defaultSelector: lastSuccessful(),
description: 'Which build to use for copying artifacts',
name: 'BUILD_SELECTOR'
)
string(
defaultValue: '1000',
description: 'Number of epochs',
name: 'EPOCHS'
)
}
stages {
stage('Check out from version control') {
steps {
checkout([$class: 'GitSCM', branches: [[name: '*/training_and_evaluation']], extensions: [], userRemoteConfigs: [[credentialsId: 's444421', url: 'https://git.wmi.amu.edu.pl/s444421/ium_444421.git']]])
}
}
stage('Training') {
steps {
copyArtifacts filter: '*', projectName:'s444421-create-dataset', selector: buildParameter('BUILD_SELECTOR')
sh 'ipython ./training_mlflow.py $EPOCHS'
archiveArtifacts artifacts: 'mlruns/**'
}
}
}
}

training_mlflow.py
@@ -1,131 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import torch
from torch import nn, optim
import torch.nn.functional as F
import sys
import mlflow
from urllib.parse import urlparse
# In[ ]:
mlflow.set_tracking_uri("http://172.17.0.1:5000")
mlflow.set_experiment("s444421")
# In[ ]:
epochs = int(sys.argv[1])
# In[ ]:
def prepare_data():
    X_train = pd.read_csv('X_train.csv')
    y_train = pd.read_csv('y_train.csv')
    X_train = torch.from_numpy(np.array(X_train)).float()
    y_train = torch.squeeze(torch.from_numpy(y_train.values).float())
    return X_train, y_train
# In[ ]:
class Net(nn.Module):
    def __init__(self, n_features):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(n_features, 5)
        self.fc2 = nn.Linear(5, 3)
        self.fc3 = nn.Linear(3, 1)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x))
# In[ ]:
def calculate_accuracy(y_true, y_pred):
    predicted = y_pred.ge(.5).view(-1)
    return (y_true == predicted).sum().float() / len(y_true)
# In[ ]:
def round_tensor(t, decimal_places=3):
    return round(t.item(), decimal_places)
# In[ ]:
def train_model(X_train, y_train, device, epochs):
    net = Net(X_train.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    X_train = X_train.to(device)
    y_train = y_train.to(device)
    net = net.to(device)
    criterion = criterion.to(device)
    for epoch in range(epochs):
        y_pred = net(X_train)
        y_pred = torch.squeeze(y_pred)
        train_loss = criterion(y_pred, y_train)
        if epoch % 100 == 0:
            train_acc = calculate_accuracy(y_train, y_pred)
            print(
f'''epoch {epoch}
Train set - loss: {round_tensor(train_loss)}, accuracy: {round_tensor(train_acc)}
''')
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
    return net, round_tensor(train_loss)
# In[ ]:
def my_main(epochs):
    X_train, y_train = prepare_data()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model, loss = train_model(X_train, y_train, device, epochs)
    torch.save(model, 'model.pth')
    mlflow.log_param("epochs", epochs)
    mlflow.log_metric("loss", loss)
    X_test = pd.read_csv('X_test.csv')
    X_test = torch.from_numpy(np.array(X_test)).float()
    X_test = X_test.to(device)
    y_pred = model(X_test)
    y_pred = y_pred.ge(.5).view(-1).cpu()
    signature = mlflow.models.signature.infer_signature(X_train.numpy(), np.array(y_pred))
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(model, "my_model", registered_model_name="s444421", signature=signature, input_example=X_test.numpy()[:5])
    else:
        mlflow.sklearn.log_model(model, "my_model", signature=signature, input_example=X_test.numpy()[:5])
# In[ ]:
with mlflow.start_run() as run:
    print("MLflow run experiment_id: {0}".format(run.info.experiment_id))
    print("MLflow run artifact_uri: {0}".format(run.info.artifact_uri))
    my_main(epochs)

@@ -1,113 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import torch
from torch import nn, optim
import torch.nn.functional as F
import sys
from sacred import Experiment
from sacred.observers import FileStorageObserver, MongoObserver
# In[ ]:
ex = Experiment(save_git_info=False)
ex.observers.append(FileStorageObserver('my_runs'))
ex.observers.append(MongoObserver(url='mongodb://admin:IUM_2021@172.17.0.1:27017', db_name='sacred'))
@ex.config
def my_config():
    epochs = 400
# In[ ]:
def prepare_data():
    X_train = pd.read_csv('X_train.csv')
    y_train = pd.read_csv('y_train.csv')
    X_train = torch.from_numpy(np.array(X_train)).float()
    y_train = torch.squeeze(torch.from_numpy(y_train.values).float())
    return X_train, y_train
# In[ ]:
class Net(nn.Module):
    def __init__(self, n_features):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(n_features, 5)
        self.fc2 = nn.Linear(5, 3)
        self.fc3 = nn.Linear(3, 1)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x))
# In[ ]:
def calculate_accuracy(y_true, y_pred):
    predicted = y_pred.ge(.5).view(-1)
    return (y_true == predicted).sum().float() / len(y_true)
# In[ ]:
def round_tensor(t, decimal_places=3):
    return round(t.item(), decimal_places)
# In[ ]:
def train_model(X_train, y_train, device, epochs):
    net = Net(X_train.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    X_train = X_train.to(device)
    y_train = y_train.to(device)
    net = net.to(device)
    criterion = criterion.to(device)
    for epoch in range(epochs):
        y_pred = net(X_train)
        y_pred = torch.squeeze(y_pred)
        train_loss = criterion(y_pred, y_train)
        if epoch % 100 == 0:
            train_acc = calculate_accuracy(y_train, y_pred)
            print(
f'''epoch {epoch}
Train set - loss: {round_tensor(train_loss)}, accuracy: {round_tensor(train_acc)}
''')
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
    return net, round_tensor(train_loss)
# In[ ]:
@ex.automain
def my_main(epochs, _run):
    X_train, y_train = prepare_data()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model, loss = train_model(X_train, y_train, device, epochs)
    torch.save(model, 'model.pth')
    ex.add_artifact('model.pth')
    _run.info["epochs"] = epochs
    _run.info["loss"] = loss

Binary file not shown.