ium_478855/create_data.ipynb

95 lines
2.1 KiB
Plaintext
Raw Normal View History

2022-03-21 11:04:16 +01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install dependencies with the %pip magic so they land in the running kernel's environment\n",
"# (a bare !pip runs whichever pip is first on the shell PATH, which may be a different interpreter).\n",
"%pip install kaggle pandas seaborn torch"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 1. Download the dataset\n",
"!kaggle datasets download -d joniarroba/noshowappointments"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!unzip -o noshowappointments.zip"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Read the appointments CSV into a DataFrame\n",
"no_shows = pd.read_csv(\n",
"    'KaggleV2-May-2016.csv'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clean the dataset\n",
"# Drop rows whose Age is negative\n",
"no_shows = no_shows.drop(index=no_shows.loc[no_shows[\"Age\"] < 0].index)\n",
"\n",
"# Optionally drop rows whose Age is 0, treated here as unknown (depends on the use case)\n",
"# no_shows = no_shows.drop(no_shows[no_shows[\"Age\"] == 0].index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Normalize the data\n",
"# Every step below is written so that re-running this cell on an already-processed frame is safe.\n",
"\n",
"# Drop the PatientId and AppointmentID columns; errors=\"ignore\" turns a re-run into a no-op\n",
"# instead of a KeyError, and avoiding inplace=True keeps the assignment explicit.\n",
"no_shows = no_shows.drop(columns=[\"PatientId\", \"AppointmentID\"], errors=\"ignore\")\n",
"\n",
"# Encode the No-show column from Yes/No to 1/0.\n",
"# Skip when the column is already numeric: mapping it again would replace every value with NaN.\n",
"if not pd.api.types.is_numeric_dtype(no_shows[\"No-show\"]):\n",
"    no_shows[\"No-show\"] = no_shows[\"No-show\"].map({'Yes': 1, 'No': 0})\n",
"\n",
"# Min-max scale the Age column to the [0, 1] range\n",
"age_min, age_max = no_shows[\"Age\"].min(), no_shows[\"Age\"].max()\n",
"no_shows[\"Age\"] = (no_shows[\"Age\"] - age_min) / (age_max - age_min)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO: save the results as an artifact (not implemented yet)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}