04.01 - Dockerfile

2022-04-03 19:10:49 +02:00 · 2022-04-03 19:10:49 +02:00 · e886f3bbd9
commit e886f3bbd9
parent 09dbff1aed
7 changed files with 112 additions and 214 deletions
--- a/02_Dane.ipynb
+++ b/02_Dane.ipynb
--- a/23
+++ b/23
@ -0,0 +1,23 @@
 # Image that bundles the KaggleV2-May-2016.csv dataset together with the
 # create_data.py and stats_data.py scripts and their Python dependencies.
 # NOTE(review): Python 3.7 is end-of-life; consider moving to a newer tag once
 # the scripts have been verified against it.
 FROM python:3.7
 # Files are copied into the filesystem root; WORKDIR stays "/" so that existing
 # absolute/relative paths used by callers keep resolving.
 WORKDIR /
 # Install every dependency in a single layer and skip pip's download cache to
 # keep the image small. The package is installed as "scikit-learn": the
 # "sklearn" name on PyPI is a deprecated alias whose installation is blocked.
 RUN pip install --no-cache-dir \
         kaggle \
         pandas \
         scikit-learn
 # Dataset first, then the scripts that read it.
 COPY KaggleV2-May-2016.csv ./
 COPY create_data.py ./
 COPY stats_data.py ./
 # No default command is set; the scripts are expected to be invoked explicitly.
 # CMD ["python", "./create_data.py"]
 # CMD ["python", "./stats_data.py"]
 # Alternative to COPYing the CSV: download it at build time (needs Kaggle credentials).
 # RUN kaggle datasets download -d joniarroba/noshowappointments
 # RUN unzip -o noshowappointments.zip
--- a/create_data.ipynb
+++ b/create_data.ipynb
@ -1,94 +0,0 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install kaggle\n",
    "!pip install pandas\n",
    "!pip install seaborn\n",
    "!pip install torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1 Pobranie zbioru\n",
    "!kaggle datasets download -d joniarroba/noshowappointments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!unzip -o noshowappointments.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "no_shows=pd.read_csv('KaggleV2-May-2016.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wyczyszczenie zbioru\n",
    "# Usunięcie negatywnego wieku\n",
    "no_shows = no_shows.drop(no_shows[no_shows[\"Age\"] < 0].index)\n",
    "\n",
    "# Usunięcie niewiadomego wieku (zależy od zastosowania)\n",
    "# no_shows = no_shows.drop(no_shows[no_shows[\"Age\"] == 0].index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalizacja danych\n",
    "\n",
    "# Usunięcie kolumn PatientId oraz AppointmentID\n",
    "no_shows.drop([\"PatientId\", \"AppointmentID\"], inplace=True, axis=1)\n",
    "\n",
    "# Zmiana wartości kolumny No-show z Yes/No na wartość boolowską\n",
    "no_shows[\"No-show\"] = no_shows[\"No-show\"].map({'Yes': 1, 'No': 0})\n",
    "\n",
    "# Normalizacja kolumny Age\n",
    "no_shows[\"Age\"]=(no_shows[\"Age\"]-no_shows[\"Age\"].min())/(no_shows[\"Age\"].max()-no_shows[\"Age\"].min())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Zapisanie wyników jako artefakt"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/create_data.py
+++ b/create_data.py
@ -0,0 +1,23 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
 # Data preprocessing: load the raw CSV, clean it, and split it into train/test sets.
 no_shows = pd.read_csv('KaggleV2-May-2016.csv')
 # Remove rows with a negative age.
 negative_age_rows = no_shows.index[no_shows["Age"] < 0]
 no_shows = no_shows.drop(index=negative_age_rows)
 # Remove the PatientId and AppointmentID columns.
 no_shows = no_shows.drop(columns=["PatientId", "AppointmentID"])
 # Convert the No-show column from Yes/No to 1/0.
 no_show_codes = {'Yes': 1, 'No': 0}
 no_shows["No-show"] = no_shows["No-show"].map(no_show_codes)
 # Min-max normalize the Age column into the [0, 1] range.
 age = no_shows["Age"]
 age_min = age.min()
 age_max = age.max()
 no_shows["Age"] = (age - age_min) / (age_max - age_min)
 # Separate the features from the No-show target.
 y = no_shows['No-show']
 X = no_shows.drop('No-show', axis=1)
 # Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
 X_train, X_test, y_train, y_test = train_test_split(
     X,
     y,
     test_size=0.2,
     random_state=42,
 )
--- a/preparation.sh
+++ b/preparation.sh
@ -1 +1 @@
-echo "Preparation inner"
+python create_data.py
--- a/statistics.sh
+++ b/statistics.sh
@ -1 +1 @@
-wc -l KaggleV2-May-2016.csv >> statistics.csv
+python stats_data.py
--- a/stats_data.py
+++ b/stats_data.py
@ -0,0 +1,11 @@
 import pandas as pd
 # Data description: load the CSV and print its size and summary statistics.
 no_shows = pd.read_csv('KaggleV2-May-2016.csv')
 # Dataset size (number of rows).
 row_count = no_shows.shape[0]
 print(f"Wielkosc zbioru: {row_count}")
 # Description of the columns; include='all' covers non-numeric columns as well.
 summary = no_shows.describe(include='all')
 print(summary)
		`@ -1 +1 @@`
			`echo "Preparation inner"`				`python create_data.py`
		`@ -1 +1 @@`
			`wc -l KaggleV2-May-2016.csv >> statistics.csv`				`python stats_data.py`