04.01 - Dockerfile

2022-04-03 19:10:49 +02:00 · 2022-04-03 19:10:49 +02:00 · e886f3bbd9
commit e886f3bbd9
parent 09dbff1aed
7 changed files with 112 additions and 214 deletions
--- a/02_Dane.ipynb
+++ b/02_Dane.ipynb
--- a/23
+++ b/23
@ -0,0 +1,23 @@
+# Image with the no-show appointments CSV plus the scripts that preprocess and describe it.
+FROM python:3.7
+
+WORKDIR /
+
+# One layer for all Python dependencies; --no-cache-dir keeps pip's download cache out of it.
+# The distribution is named scikit-learn; the "sklearn" PyPI alias is deprecated.
+RUN pip install --no-cache-dir \
+        kaggle \
+        pandas \
+        scikit-learn
+
+# The scripts read the CSV by relative path, so everything lands in the working directory.
+COPY KaggleV2-May-2016.csv ./
+
+COPY create_data.py stats_data.py ./
+
+# Disabled alternatives kept for reference: default commands and a build-time Kaggle download.
+# CMD ["python", "./create_data.py"]
+# CMD ["python", "./stats_data.py"]
+
+# RUN kaggle datasets download -d joniarroba/noshowappointments
+# RUN unzip -o noshowappointments.zip
--- a/create_data.ipynb
+++ b/create_data.ipynb
@ -1,94 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install kaggle\n",
-    "!pip install pandas\n",
-    "!pip install seaborn\n",
-    "!pip install torch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 1 Pobranie zbioru\n",
-    "!kaggle datasets download -d joniarroba/noshowappointments"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!unzip -o noshowappointments.zip"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "no_shows=pd.read_csv('KaggleV2-May-2016.csv')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Wyczyszczenie zbioru\n",
-    "# Usunięcie negatywnego wieku\n",
-    "no_shows = no_shows.drop(no_shows[no_shows[\"Age\"] < 0].index)\n",
-    "\n",
-    "# Usunięcie niewiadomego wieku (zależy od zastosowania)\n",
-    "# no_shows = no_shows.drop(no_shows[no_shows[\"Age\"] == 0].index)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Normalizacja danych\n",
-    "\n",
-    "# Usunięcie kolumn PatientId oraz AppointmentID\n",
-    "no_shows.drop([\"PatientId\", \"AppointmentID\"], inplace=True, axis=1)\n",
-    "\n",
-    "# Zmiena wartości kolumny No-show z Yes/No na wartość boolowską\n",
-    "no_shows[\"No-show\"] = no_shows[\"No-show\"].map({'Yes': 1, 'No': 0})\n",
-    "\n",
-    "# Normalizacja kolumny Age\n",
-    "no_shows[\"Age\"]=(no_shows[\"Age\"]-no_shows[\"Age\"].min())/(no_shows[\"Age\"].max()-no_shows[\"Age\"].min())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Zapisanie wyników jako artefakt"
-   ]
-  }
- ],
- "metadata": {
-  "language_info": {
-   "name": "python"
-  },
-  "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/create_data.py
+++ b/create_data.py
@ -0,0 +1,23 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+# Data preproccesing
+
+no_shows=pd.read_csv('KaggleV2-May-2016.csv')
+
+# Usunięcie negatywnego wieku
+no_shows = no_shows.drop(no_shows[no_shows["Age"] < 0].index)
+
+# Usunięcie kolumn PatientId oraz AppointmentID
+no_shows.drop(["PatientId", "AppointmentID"], inplace=True, axis=1)
+
+# Zmiena wartości kolumny No-show z Yes/No na wartość boolowską
+no_shows["No-show"] = no_shows["No-show"].map({'Yes': 1, 'No': 0})
+
+# Normalizacja kolumny Age
+no_shows["Age"]=(no_shows["Age"]-no_shows["Age"].min())/(no_shows["Age"].max()-no_shows["Age"].min())
+
+X = no_shows.drop(columns=['No-show'])
+y = no_shows['No-show']
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
--- a/preparation.sh
+++ b/preparation.sh
@ -1 +1 @@
-echo "Preparation inner"
+python create_data.py  # run the data preprocessing / train-test split script
--- a/statistics.sh
+++ b/statistics.sh
@ -1 +1 @@
-wc -l KaggleV2-May-2016.csv >> statistics.csv
+python stats_data.py  # print the dataset size and per-column summary to stdout
--- a/stats_data.py
+++ b/stats_data.py
@ -0,0 +1,11 @@
+import pandas as pd
+
+# Data description
+
+no_shows=pd.read_csv('KaggleV2-May-2016.csv')
+
+# Wielkość zbioru
+print(f"Wielkosc zbioru: {len(no_shows)}")
+
+# Opis parametrów
+print(no_shows.describe(include='all'))