2022-04-03 20:21:50 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
2022-04-03 20:19:58 +02:00
|
|
|
|
2022-04-03 19:10:49 +02:00
|
|
|
import pandas as pd
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
|
|
# Data preprocessing
|
|
|
|
|
|
|
|
# Load the raw dataset; the relative path is resolved against the current
# working directory, so the script must be run from the folder with the CSV.
no_shows = pd.read_csv("KaggleV2-May-2016.csv")
|
|
|
|
|
|
|
|
# Remove rows with a negative age
|
|
|
|
# Flag the rows whose Age is below zero, then drop exactly those index labels.
# Rows where the comparison is False are kept unchanged.
_is_negative_age = no_shows["Age"] < 0
no_shows = no_shows.drop(index=no_shows.index[_is_negative_age])
|
|
|
|
|
|
|
|
# Drop the PatientId and AppointmentID columns
|
|
|
|
# Both ID columns are removed in place, so `no_shows` keeps referring to the
# same DataFrame object afterwards.
_id_columns = ["PatientId", "AppointmentID"]
no_shows.drop(columns=_id_columns, inplace=True)
|
|
|
|
|
|
|
|
# Convert the No-show column values from Yes/No to a 1/0 flag
|
|
|
|
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0. Any label missing
# from the mapping is turned into NaN by Series.map.
_no_show_encoding = {"Yes": 1, "No": 0}
no_shows["No-show"] = no_shows["No-show"].map(_no_show_encoding)
|
|
|
|
|
|
|
|
# Normalize the Age column (min-max scaling)
|
|
|
|
# Min-max scale Age: (age - min) / (max - min). The minimum and maximum are
# taken over the whole column as it stands here, i.e. before the data is
# split into train and test sets.
_age_min = no_shows["Age"].min()
_age_range = no_shows["Age"].max() - _age_min
no_shows["Age"] = (no_shows["Age"] - _age_min) / _age_range
|
|
|
|
|
|
|
|
# Feature matrix: every column except the 'No-show' target.
X = no_shows.drop("No-show", axis=1)
|
|
|
|
# Target vector: the encoded 'No-show' column on its own.
y = no_shows.loc[:, "No-show"]
|
|
|
|
|
2022-04-03 21:18:45 +02:00
|
|
|
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
|
|
|
|
|
|
|
|
# Report on stdout that the script has run to its end.
print("Quiting create_data.py")
|