05 - Biblioteki DL

2022-04-24 20:51:38 +02:00 · 2022-04-24 20:51:38 +02:00 · 3f803ca909
commit 3f803ca909
parent 9573169b9b
14 changed files with 110926 additions and 30 deletions
--- a/8
+++ b/8
@ -10,10 +10,14 @@ ENV KAGGLE_KEY=${KAGGLE_KEY}
 RUN pip install --user kaggle
 RUN pip install --user pandas
 RUN pip install --user sklearn
+RUN pip install --user torch
+RUN pip install --user tqdm
+RUN pip install --user seaborn

 COPY KaggleV2-May-2016.csv ./
 COPY create_data.py ./
 COPY stats_data.py ./
+# NOTE(review): the next line repeats the COPY directly above; possibly another script (e.g. train_model.py) was intended - confirm.
+COPY stats_data.py ./

-CMD ["python", "create_data.py"]
-CMD ["python", "stats_data.py"]
+# CMD ["python", "create_data.py"]
+# CMD ["python", "stats_data.py"]
--- a/create_data.py
+++ b/create_data.py
@ -1,27 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import pandas as pd
-from sklearn.model_selection import train_test_split
-
-# Data preproccesing
-
-no_shows=pd.read_csv('KaggleV2-May-2016.csv')
-
-# Usunięcie negatywnego wieku
-no_shows = no_shows.drop(no_shows[no_shows["Age"] < 0].index)
-
-# Usunięcie kolumn PatientId oraz AppointmentID
-no_shows.drop(["PatientId", "AppointmentID"], inplace=True, axis=1)
-
-# Zmiena wartości kolumny No-show z Yes/No na wartość boolowską
-no_shows["No-show"] = no_shows["No-show"].map({'Yes': 1, 'No': 0})
-
-# Normalizacja kolumny Age
-no_shows["Age"]=(no_shows["Age"]-no_shows["Age"].min())/(no_shows["Age"].max()-no_shows["Age"].min())
-
-X = no_shows.drop(columns=['No-show'])
-y = no_shows['No-show']
-
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-print("Quiting create_data.py")
--- a/data_description.csv
+++ b/data_description.csv
@ -0,0 +1,12 @@
+,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
+count,110527.0,110527.0,110527,110527,110527,110527.0,110527,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527
+unique,,,2,103549,27,,81,,,,,,,2
+top,,,F,2016-05-06T07:09:54Z,2016-06-06T00:00:00Z,,JARDIM CAMBURI,,,,,,,No
+freq,,,71840,24,4692,,7717,,,,,,,88208
+mean,147496265710394.06,5675305.123426855,,,,37.08887421173107,,0.09826558216544373,0.1972459218109602,0.07186479321794674,0.030399811810688793,0.022247957512643968,0.32102563174608917,
+std,256094920291739.1,71295.75153966925,,,,23.110204963682644,,0.2976747541093071,0.397921349947084,0.25826507350746697,0.17168555541424485,0.16154272581427898,0.46687273170186816,
+min,39217.84439,5030230.0,,,,-1.0,,0.0,0.0,0.0,0.0,0.0,0.0,
+25%,4172614444192.0,5640285.5,,,,18.0,,0.0,0.0,0.0,0.0,0.0,0.0,
+50%,31731838713978.0,5680573.0,,,,37.0,,0.0,0.0,0.0,0.0,0.0,0.0,
+75%,94391720898175.0,5725523.5,,,,55.0,,0.0,0.0,0.0,0.0,0.0,1.0,
+max,999981631772427.0,5790484.0,,,,115.0,,1.0,1.0,1.0,1.0,4.0,1.0,
--- a/logs.txt
+++ b/logs.txt
@ -0,0 +1 @@
+loss=0.48354023694992065, accuracy=79.3711829902737
--- a/notebooks/02_Dane.ipynb
+++ b/notebooks/02_Dane.ipynb
--- a/notebooks/05_BibliotekiML.ipynb
+++ b/notebooks/05_BibliotekiML.ipynb
@ -0,0 +1,235 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm\n",
+    "import matplotlib\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\PROGRAMY\\Anaconda3\\envs\\ium\\lib\\site-packages\\ipykernel_launcher.py:2: MatplotlibDeprecationWarning: Support for setting an rcParam that expects a str value to a non-str value is deprecated since 3.5 and support will be removed two minor releases later.\n",
+      "  \n"
+     ]
+    }
+   ],
+   "source": [
+    "matplotlib.rc('text', usetex=True)\n",
+    "matplotlib.rcParams['text.latex.preamble']=[r\"\\usepackage{amsmath}\"]\n",
+    "sns.set_style(\"darkgrid\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset = pd.read_csv('../train_dataset.csv')\n",
+    "test_dataset = pd.read_csv('../test_dataset.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train = train_dataset.drop(columns=['No-show']).to_numpy()\n",
+    "X_test = test_dataset.drop(columns=['No-show']).to_numpy()\n",
+    "y_train = train_dataset['No-show'].to_numpy()\n",
+    "y_test = test_dataset['No-show'].to_numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class LogisticRegression(torch.nn.Module):\n",
+    "    def __init__(self, input_dim, output_dim):\n",
+    "        super(LogisticRegression, self).__init__()\n",
+    "        self.linear = torch.nn.Linear(input_dim, output_dim)     \n",
+    "    def forward(self, x):\n",
+    "        outputs = torch.sigmoid(self.linear(x))\n",
+    "        return outputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "epochs = 50_000\n",
+    "input_dim = 9\n",
+    "output_dim = 1\n",
+    "learning_rate = 0.01"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = LogisticRegression(input_dim, output_dim)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "criterion = torch.nn.BCELoss()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test = torch.Tensor(X_train),torch.Tensor(X_test)\n",
+    "y_train, y_test = torch.Tensor(y_train),torch.Tensor(y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training Epochs: 100%|██████████| 50000/50000 [02:01<00:00, 411.29it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "losses = []\n",
+    "losses_test = []\n",
+    "Iterations = []\n",
+    "iter = 0\n",
+    "for epoch in tqdm(range(int(epochs)), desc='Training Epochs'):\n",
+    "    x = X_train\n",
+    "    labels = y_train\n",
+    "    optimizer.zero_grad() # Setting our stored gradients equal to zero\n",
+    "    outputs = model(X_train)\n",
+    "    loss = criterion(torch.squeeze(outputs), labels) \n",
+    "    \n",
+    "    loss.backward() # Computes the gradient of the given tensor w.r.t. the weights/bias\n",
+    "    \n",
+    "    optimizer.step() # Updates weights and biases with the optimizer (SGD)\n",
+    "    \n",
+    "    iter+=1\n",
+    "    if iter%10000==0:\n",
+    "        with torch.no_grad():\n",
+    "            # Calculating the loss and accuracy for the test dataset\n",
+    "            correct_test = 0\n",
+    "            total_test = 0\n",
+    "            outputs_test = torch.squeeze(model(X_test))\n",
+    "            loss_test = criterion(outputs_test, y_test)\n",
+    "            \n",
+    "            predicted_test = outputs_test.round().detach().numpy()\n",
+    "            total_test += y_test.size(0)\n",
+    "            correct_test += np.sum(predicted_test == y_test.detach().numpy())\n",
+    "            accuracy_test = 100 * correct_test/total_test\n",
+    "            losses_test.append(loss_test.item())\n",
+    "            \n",
+    "            # Calculating the loss and accuracy for the train dataset\n",
+    "            total = 0\n",
+    "            correct = 0\n",
+    "            total += y_train.size(0)\n",
+    "            correct += np.sum(torch.squeeze(outputs).round().detach().numpy() == y_train.detach().numpy())\n",
+    "            accuracy = 100 * correct/total\n",
+    "            losses.append(loss.item())\n",
+    "            Iterations.append(iter)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration: 50000. \n",
+      "Test - Loss: 0.480914831161499. Accuracy: 79.76567447751742\n",
+      "Train -  Loss: 0.48352959752082825. Accuracy: 79.37570685365301\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"Iteration: {iter}. \\nTest - Loss: {loss_test.item()}. Accuracy: {accuracy_test}\")\n",
+    "print(f\"Train -  Loss: {loss.item()}. Accuracy: {accuracy}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"logs.txt\", \"a\") as myfile:\n",
+    "    myfile.write(f\"loss={loss.item()}, accuracy={accuracy}\\n\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "3c12dc341c1078754dffca0e61bfc548ab04f96cfe0a82a85a936b702c4881ab"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.7.11 ('ium')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.11"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/notebooks/dataset_stats.ipynb
+++ b/notebooks/dataset_stats.ipynb
--- a/scripts/create_data.py
+++ b/scripts/create_data.py
@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+from matplotlib.pyplot import show
+import pandas as pd
+from datetime import datetime
+# from torch.utils.data import random_split
+from sklearn.model_selection import train_test_split
+
+
+def to_datetime(string):
+    """Parse a timestamp such as '2016-05-06T07:09:54Z' into a naive datetime."""
+    # Turn the 'T' separator into a space and drop the 'Z' marker so the text
+    # matches the plain '%Y-%m-%d %H:%M:%S' layout.
+    cleaned = string.replace('T', ' ').replace('Z', '')
+    return datetime.strptime(cleaned, '%Y-%m-%d %H:%M:%S')
+
+
+# Data preprocessing: clean the raw Kaggle no-show dataset, engineer the
+# DaysSinceSchedule feature and write train/test CSV splits.
+no_shows = pd.read_csv('KaggleV2-May-2016.csv')
+
+# Drop rows with a negative age
+no_shows = no_shows.drop(no_shows[no_shows["Age"] < 0].index)
+
+# Drop the PatientId and AppointmentID columns
+no_shows.drop(["PatientId", "AppointmentID"], inplace=True, axis=1)
+
+# Encode the No-show column (Yes/No) as 1/0
+no_shows["No-show"] = no_shows["No-show"].map({'Yes': 1, 'No': 0})
+
+# Encode the Gender column (M/F) as 1/0
+no_shows["Gender"] = no_shows["Gender"].map({'M': 1, 'F': 0})
+
+# Min-max normalisation of the Age column
+age = no_shows["Age"]
+no_shows["Age"] = (age - age.min()) / (age.max() - age.min())
+
+# Replace ScheduledDay / AppointmentDay with the time between them.
+appointment_day = no_shows["AppointmentDay"].apply(to_datetime)
+scheduled_day = no_shows["ScheduledDay"].apply(to_datetime)
+
+# Whole days between the two timestamps plus one, computed on the whole
+# column at once instead of row by row (same formula, same values).
+days_since_schedule = (appointment_day - scheduled_day).dt.days + 1
+
+no_shows.drop(["ScheduledDay", "AppointmentDay"], inplace=True, axis=1)
+
+# Place the new feature at column position 2.
+no_shows.insert(2, 'DaysSinceSchedule', days_since_schedule)
+
+# Drop the Neighbourhood column
+no_shows.drop(['Neighbourhood'], inplace=True, axis=1)
+
+# 80/20 train/test split with a fixed seed so the split is reproducible.
+X = no_shows.drop(columns=['No-show'])
+y = no_shows['No-show']
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Re-attach the target as the last column of each split before saving.
+train_dataset = pd.concat([X_train, y_train], axis=1)
+test_dataset = pd.concat([X_test, y_test], axis=1)
+
+train_dataset.to_csv('train_dataset.csv', index=False)
+test_dataset.to_csv('test_dataset.csv', index=False)
+
+print("Quiting create_data.py")
--- a/scripts/preparation.sh
+++ b/scripts/preparation.sh
--- a/scripts/statistics.sh
+++ b/scripts/statistics.sh
--- a/scripts/stats_data.py
+++ b/scripts/stats_data.py
@ -3,7 +3,6 @@
 import pandas as pd

 # Data description
-
 no_shows=pd.read_csv('KaggleV2-May-2016.csv')

 # Wielkość zbioru
--- a/scripts/train_model.py
+++ b/scripts/train_model.py
@ -0,0 +1,82 @@
+import torch
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+import matplotlib
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Load the train/test splits from the CSV files in the working directory.
+train_dataset = pd.read_csv('train_dataset.csv')
+test_dataset = pd.read_csv('test_dataset.csv')
+
+# Split each frame into the 'No-show' target vector and the feature matrix.
+y_train = train_dataset['No-show'].to_numpy()
+y_test = test_dataset['No-show'].to_numpy()
+X_train = train_dataset.drop(columns=['No-show']).to_numpy()
+X_test = test_dataset.drop(columns=['No-show']).to_numpy()
+
+class LogisticRegression(torch.nn.Module):
+    """Logistic regression: one linear layer followed by a sigmoid."""
+
+    def __init__(self, input_dim, output_dim):
+        """Create the linear layer mapping `input_dim` features to `output_dim` outputs."""
+        super().__init__()
+        self.linear = torch.nn.Linear(input_dim, output_dim)
+
+    def forward(self, x):
+        """Return the sigmoid of the linear layer applied to `x`."""
+        return torch.sigmoid(self.linear(x))
+
+# Hyperparameters
+epochs = 50_000
+# Take the feature count from the data rather than hard-coding 9, so the model
+# still matches its input if the preprocessing adds or removes a column.
+input_dim = X_train.shape[1]
+output_dim = 1
+learning_rate = 0.01
+# Metrics are computed every `eval_every` iterations and after the final one.
+eval_every = 10_000
+
+model = LogisticRegression(input_dim, output_dim)
+
+criterion = torch.nn.BCELoss()
+
+optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+X_train, X_test = torch.Tensor(X_train), torch.Tensor(X_test)
+y_train, y_test = torch.Tensor(y_train), torch.Tensor(y_test)
+
+losses = []
+losses_test = []
+Iterations = []
+# Every iteration runs a forward/backward pass over the whole training set.
+for iteration in tqdm(range(1, int(epochs) + 1), desc='Training Epochs'):
+    optimizer.zero_grad()  # clear the gradients stored by the previous step
+    outputs = torch.squeeze(model(X_train))
+    loss = criterion(outputs, y_train)
+
+    loss.backward()  # gradient of the loss w.r.t. the weights/bias
+
+    optimizer.step()  # update weights and bias with the optimizer (SGD)
+
+    # Also evaluate on the last iteration so the metrics reported below exist
+    # even when `epochs` is not a multiple of `eval_every`.
+    if iteration % eval_every == 0 or iteration == int(epochs):
+        with torch.no_grad():
+            # Loss and accuracy for the test dataset
+            outputs_test = torch.squeeze(model(X_test))
+            loss_test = criterion(outputs_test, y_test)
+            correct_test = (outputs_test.round() == y_test).sum().item()
+            accuracy_test = 100 * correct_test / y_test.size(0)
+            losses_test.append(loss_test.item())
+
+            # Loss and accuracy for the train dataset, taken from the forward
+            # pass made before this iteration's parameter update
+            correct = (outputs.round() == y_train).sum().item()
+            accuracy = 100 * correct / y_train.size(0)
+            losses.append(loss.item())
+            Iterations.append(iteration)
+
+print(f"Iteration: {iteration}. \nTest - Loss: {loss_test.item()}. Accuracy: {accuracy_test}")
+print(f"Train -  Loss: {loss.item()}. Accuracy: {accuracy}\n")
+
+# Append the final train loss/accuracy to the run log.
+with open("logs.txt", "a") as myfile:
+    myfile.write(f"loss={loss.item()}, accuracy={accuracy}\n")
--- a/test_dataset.csv
+++ b/test_dataset.csv
--- a/train_dataset.csv
+++ b/train_dataset.csv
				`@ -0,0 +1 @@`
				`loss=0.48354023694992065, accuracy=79.3711829902737`