# -*- coding: utf-8 -*-
"""Prepare the no-show appointments data set and write train/test CSV files.

Reads ``KaggleV2-May-2016.csv`` from the working directory, cleans and encodes
its columns, splits the rows 80/20 and writes ``train_dataset.csv`` and
``test_dataset.csv``. Everything runs at module level, so importing this
module executes the whole pipeline.
"""
from datetime import datetime

import pandas as pd
from sklearn.model_selection import train_test_split


def to_datetime(string):
    """Parse one timestamp string into a naive ``datetime``.

    Every ``'T'`` is replaced by a space and every ``'Z'`` is removed before
    the text is parsed with ``'%Y-%m-%d %H:%M:%S'``, so an input such as
    ``'2016-04-29T18:38:08Z'`` is accepted. The result has no tzinfo.

    Raises:
        ValueError: if the cleaned text does not match the format.
    """
    return datetime.strptime(string.replace('T', ' ').replace('Z', ''), '%Y-%m-%d %H:%M:%S')


def _parse_timestamps(column):
    """Vectorised counterpart of ``to_datetime`` for a whole string column.

    Applies the same ``'T'`` -> ``' '`` / ``'Z'`` -> ``''`` clean-up and the
    same format, but lets pandas parse the column in one call instead of
    invoking a Python function per element.
    """
    cleaned = column.str.replace('T', ' ', regex=False).str.replace('Z', '', regex=False)
    return pd.to_datetime(cleaned, format='%Y-%m-%d %H:%M:%S')


# Data preprocessing
no_shows = pd.read_csv('KaggleV2-May-2016.csv')

# Remove rows with a negative age.
no_shows = no_shows.drop(no_shows[no_shows["Age"] < 0].index)

# Remove the PatientId and AppointmentID columns.
no_shows.drop(["PatientId", "AppointmentID"], inplace=True, axis=1)

# Encode the No-show column: 'Yes' -> 1, 'No' -> 0.
no_shows["No-show"] = no_shows["No-show"].map({'Yes': 1, 'No': 0})

# Encode the Gender column: 'M' -> 1, 'F' -> 0.
no_shows["Gender"] = no_shows["Gender"].map({'M': 1, 'F': 0})

# Min-max normalise the Age column.
# NOTE(review): min/max are taken over the full data set, i.e. before the
# train/test split below, so the test rows influence the scaling of the
# training rows. Confirm this is intended.
age_min = no_shows["Age"].min()
age_max = no_shows["Age"].max()
no_shows["Age"] = (no_shows["Age"] - age_min) / (age_max - age_min)

# Replace ScheduledDay / AppointmentDay by the time between them:
# whole days of (AppointmentDay - ScheduledDay), plus one.
# NOTE(review): the "+ 1" presumably compensates for AppointmentDay carrying
# no time of day, so that a same-day booking yields 0 - confirm against the data.
appointment_day = _parse_timestamps(no_shows["AppointmentDay"])
scheduled_day = _parse_timestamps(no_shows["ScheduledDay"])
days_since_schedule = (appointment_day - scheduled_day).dt.days + 1
no_shows.drop(["ScheduledDay", "AppointmentDay"], inplace=True, axis=1)
# Place the new feature as the third column.
no_shows.insert(2, 'DaysSinceSchedule', days_since_schedule)

# Remove the Neighbourhood column.
no_shows.drop(['Neighbourhood'], inplace=True, axis=1)

# Split features / target and hold out 20% of the rows; the fixed seed keeps
# the split reproducible between runs.
X = no_shows.drop(columns=['No-show'])
y = no_shows['No-show']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-attach the target as the last column and write both subsets to disk.
train_dataset = pd.concat([X_train, y_train], axis=1)
test_dataset = pd.concat([X_test, y_test], axis=1)
train_dataset.to_csv('train_dataset.csv', index=False)
test_dataset.to_csv('test_dataset.csv', index=False)

print("Quiting create_data.py")