04.01 - Dockerfile
This commit is contained in:
parent
09dbff1aed
commit
e886f3bbd9
171
02_Dane.ipynb
171
02_Dane.ipynb
File diff suppressed because one or more lines are too long
23
Dockerfile
Normal file
23
Dockerfile
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
# Image bundling the no-show appointments dataset with the scripts that
# preprocess it (create_data.py) and describe it (stats_data.py).
FROM python:3.7

WORKDIR /

# Install every Python dependency in a single layer.
# The package that provides `import sklearn` is published on PyPI as
# `scikit-learn`; the PyPI name `sklearn` is a deprecated placeholder whose
# installation fails, so it must not be used here.
RUN pip install --no-cache-dir kaggle pandas scikit-learn

# Dataset and the scripts that operate on it.
COPY KaggleV2-May-2016.csv ./
COPY create_data.py ./
COPY stats_data.py ./

# Possible entry points, left disabled: the scripts are run explicitly instead.
# CMD ["python", "./create_data.py"]
# CMD ["python", "./stats_data.py"]

# Alternative to COPYing the CSV: fetch it from Kaggle at build time
# (presumably requires Kaggle API credentials in the image; verify before enabling).
# RUN kaggle datasets download -d joniarroba/noshowappointments
# RUN unzip -o noshowappointments.zip
|
@ -1,94 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"!pip install kaggle\n",
|
|
||||||
"!pip install pandas\n",
|
|
||||||
"!pip install seaborn\n",
|
|
||||||
"!pip install torch"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# 1 Pobranie zbioru\n",
|
|
||||||
"!kaggle datasets download -d joniarroba/noshowappointments"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"!unzip -o noshowappointments.zip"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"no_shows=pd.read_csv('KaggleV2-May-2016.csv')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Wyczyszczenie zbioru\n",
|
|
||||||
"# Usunięcie negatywnego wieku\n",
|
|
||||||
"no_shows = no_shows.drop(no_shows[no_shows[\"Age\"] < 0].index)\n",
|
|
||||||
"\n",
|
|
||||||
"# Usunięcie niewiadomego wieku (zależy od zastosowania)\n",
|
|
||||||
"# no_shows = no_shows.drop(no_shows[no_shows[\"Age\"] == 0].index)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Normalizacja danych\n",
|
|
||||||
"\n",
|
|
||||||
"# Usunięcie kolumn PatientId oraz AppointmentID\n",
|
|
||||||
"no_shows.drop([\"PatientId\", \"AppointmentID\"], inplace=True, axis=1)\n",
|
|
||||||
"\n",
|
|
||||||
"# Zmiena wartości kolumny No-show z Yes/No na wartość boolowską\n",
|
|
||||||
"no_shows[\"No-show\"] = no_shows[\"No-show\"].map({'Yes': 1, 'No': 0})\n",
|
|
||||||
"\n",
|
|
||||||
"# Normalizacja kolumny Age\n",
|
|
||||||
"no_shows[\"Age\"]=(no_shows[\"Age\"]-no_shows[\"Age\"].min())/(no_shows[\"Age\"].max()-no_shows[\"Age\"].min())"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Zapisanie wyników jako artefakt"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"language_info": {
|
|
||||||
"name": "python"
|
|
||||||
},
|
|
||||||
"orig_nbformat": 4
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
23
create_data.py
Normal file
23
create_data.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
# Data preprocessing

no_shows = pd.read_csv('KaggleV2-May-2016.csv')

# Drop the rows whose age is negative
negative_age_rows = no_shows.loc[no_shows["Age"] < 0].index
no_shows = no_shows.drop(index=negative_age_rows)

# Drop the PatientId and AppointmentID columns
no_shows = no_shows.drop(columns=["PatientId", "AppointmentID"])

# Recode the No-show column from Yes/No to 1/0
no_show_codes = {'Yes': 1, 'No': 0}
no_shows["No-show"] = no_shows["No-show"].map(no_show_codes)

# Min-max scale the Age column
age_min = no_shows["Age"].min()
age_max = no_shows["Age"].max()
no_shows["Age"] = (no_shows["Age"] - age_min) / (age_max - age_min)

# Separate the No-show target from the remaining feature columns
y = no_shows['No-show']
X = no_shows.drop('No-show', axis=1)

# Hold out 20% of the rows as the test set, with a fixed random seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
|
@ -1 +1 @@
|
|||||||
echo "Preparation inner"
|
python create_data.py
|
@ -1 +1 @@
|
|||||||
wc -l KaggleV2-May-2016.csv >> statistics.csv
|
python stats_data.py
|
11
stats_data.py
Normal file
11
stats_data.py
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Data description

no_shows = pd.read_csv('KaggleV2-May-2016.csv')

# Dataset size (number of rows)
print(f"Wielkosc zbioru: {len(no_shows)}")

# Summary of every column, numeric and non-numeric alike
column_summary = no_shows.describe(include='all')
print(column_summary)
|
Loading…
Reference in New Issue
Block a user