ium_478855/create_data.py

# -*- coding: utf-8 -*-

import pandas as pd
from datetime import datetime
# from torch.utils.data import random_split
from sklearn.model_selection import train_test_split


def to_datetime(string):
    """Parse a timestamp such as ``2016-04-29T18:38:08Z`` into a naive datetime.

    Every ``T`` is turned into a space and every ``Z`` is removed before
    parsing, so the plain ``YYYY-MM-DD HH:MM:SS`` form is accepted as well.
    Raises ``ValueError`` when the cleaned text does not match that layout.
    """
    # Single pass over the text: map 'T' -> ' ' and delete 'Z'.
    cleanup_table = str.maketrans('T', ' ', 'Z')
    cleaned = string.translate(cleanup_table)
    return datetime.strptime(cleaned, '%Y-%m-%d %H:%M:%S')


# Data preprocessing: load the raw CSV, clean and encode its columns, split it
# into train/test parts and write both parts back to disk as CSV files.
no_shows = pd.read_csv('KaggleV2-May-2016.csv')

# Remove rows with a negative age.
no_shows = no_shows.drop(no_shows[no_shows["Age"] < 0].index)

# Remove the PatientId and AppointmentID columns.
no_shows.drop(["PatientId", "AppointmentID"], inplace=True, axis=1)

# Encode the No-show column from Yes/No to 1/0.
no_shows["No-show"] = no_shows["No-show"].map({'Yes': 1, 'No': 0})

# Encode the Gender column from M/F to 1/0.
no_shows["Gender"] = no_shows["Gender"].map({'M': 1, 'F': 0})

# Min-max normalisation of the Age column.
no_shows["Age"] = (
    (no_shows["Age"] - no_shows["Age"].min())
    / (no_shows["Age"].max() - no_shows["Age"].min())
)

# AppointmentDay - ScheduledDay -> time between scheduling and the appointment.
# Both columns are parsed with to_datetime() and subtracted as whole columns
# (vectorised) rather than row by row; pd.to_datetime() guarantees a datetime
# dtype so the .dt accessor is available. The "+ 1" offset is part of the
# original formula and is kept unchanged.
appointment_day = pd.to_datetime(no_shows["AppointmentDay"].apply(to_datetime))
scheduled_day = pd.to_datetime(no_shows["ScheduledDay"].apply(to_datetime))
days_since_schedule = (appointment_day - scheduled_day).dt.days + 1

# The raw timestamp columns are no longer needed once the difference is known.
no_shows.drop(["ScheduledDay", "AppointmentDay"], inplace=True, axis=1)

# Place the derived feature as the third column.
no_shows.insert(2, 'DaysSinceSchedule', days_since_schedule)

# Remove the Neighbourhood column.
no_shows.drop(['Neighbourhood'], inplace=True, axis=1)

# Separate features from the target and make a reproducible 80/20 split.
X = no_shows.drop(columns=['No-show'])
y = no_shows['No-show']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-attach the target as the last column of each part.
train_dataset = pd.concat([X_train, y_train], axis=1)
test_dataset = pd.concat([X_test, y_test], axis=1)

train_dataset.to_csv('train_dataset.csv', index=False)
test_dataset.to_csv('test_dataset.csv', index=False)

# Second copy of the same data under the *_dvc.csv names.
train_dataset.to_csv('train_dataset_dvc.csv', index=False)
test_dataset.to_csv('test_dataset_dvc.csv', index=False)

print("Quiting create_data.py")
05 - Biblioteki DL 2022-04-24 20:51:38 +02:00			`# -*- coding: utf-8 -*-`

			`import pandas as pd`
			`from datetime import datetime`
			`# from torch.utils.data import random_split`
			`from sklearn.model_selection import train_test_split`


			`def to_datetime(string):`
			`    return datetime.strptime(string.replace('T', ' ').replace('Z', ''), '%Y-%m-%d %H:%M:%S')`


			`# Data preproccesing`
			`no_shows=pd.read_csv('KaggleV2-May-2016.csv')`

			`# Usunięcie negatywnego wieku`
			`no_shows = no_shows.drop(no_shows[no_shows["Age"] < 0].index)`

			`# Usunięcie kolumn PatientId oraz AppointmentID`
			`no_shows.drop(["PatientId", "AppointmentID"], inplace=True, axis=1)`

			`# Zmiena wartości kolumny No-show z Yes/No na wartość boolowską`
			`no_shows["No-show"] = no_shows["No-show"].map({'Yes': 1, 'No': 0})`

			`# Zmiena wartości kolumny Gender z Male/Female na wartość boolowską`
			`no_shows["Gender"] = no_shows["Gender"].map({'M': 1, 'F': 0})`

			`# Normalizacja kolumny Age`
			`no_shows["Age"]=(no_shows["Age"]-no_shows["Age"].min())/(no_shows["Age"].max()-no_shows["Age"].min())`

			`# ScheduledDay - AppointmentDay -> czas miedzy ScheduledDay i AppointmentDay`
			`no_shows["AppointmentDay"] = no_shows["AppointmentDay"].apply(lambda x: to_datetime(x))`
			`no_shows["ScheduledDay"] = no_shows["ScheduledDay"].apply(lambda x: to_datetime(x))`

			`no_shows['DaysSinceSchedule'] = no_shows.apply(lambda row: (row.AppointmentDay - row.ScheduledDay).days + 1, axis=1)`

			`no_shows.drop(["ScheduledDay", "AppointmentDay"], inplace=True, axis=1)`

			`no_shows.insert(2, 'DaysSinceSchedule', no_shows.pop('DaysSinceSchedule'))`

			`# Usuniecie kolumny Neighbourhood`
			`no_shows.drop(['Neighbourhood'], inplace=True, axis=1)`

			`X = no_shows.drop(columns=['No-show'])`
			`y = no_shows['No-show']`
			`X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)`

			`# test_size = int(0.2 * len(no_shows))`
			`# train_size = len(no_shows) - test_size`
			`# train_dataset, test_dataset = random_split(no_shows, [train_size, test_size])`

			`# train_dataset = pd.DataFrame(train_dataset.numpy())`
			`# test_dataset = pd.DataFrame(test_dataset.numpy())`

			`train_dataset = pd.concat([X_train, y_train], axis=1)`
			`test_dataset = pd.concat([X_test, y_test], axis=1)`

			`train_dataset.to_csv('train_dataset.csv', index=False)`
			`test_dataset.to_csv('test_dataset.csv', index=False)`

DVC 2022-06-05 09:09:13 +02:00			`train_dataset.to_csv('train_dataset_dvc.csv', index=False)`
			`test_dataset.to_csv('test_dataset_dvc.csv', index=False)`

05 - Biblioteki DL 2022-04-24 20:51:38 +02:00			`print("Quiting create_data.py")`