# Prepare the "mstz/liver" dataset: split it into train/dev/test sets,
# min-max scale the numerical features, drop incomplete rows, and write each
# split to disk.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from datasets import load_dataset

dataset = load_dataset("mstz/liver")['train']
dataset = dataset.to_pandas()

# 80/20 train/test split, then a further 80/20 split of the remaining training
# rows into train/validation. The fixed seed keeps the splits reproducible.
train, test = train_test_split(dataset, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below modify independent
# frames rather than (possibly) views of `dataset`.
train = train.copy()
val = val.copy()
test = test.copy()

# NOTE(review): the spellings below ('direct_ribilubin',
# 'alamine_aminotransferasi') presumably mirror the dataset's own column
# names -- verify against the dataset before "correcting" them.
numerical_features = [
    'age',
    'total_bilirubin',
    'direct_ribilubin',
    'alkaline_phosphotase',
    'alamine_aminotransferasi',
    'aspartate_aminotransferase',
    'total_proteins',
    'albumin',
    'albumin_to_globulin_ratio',
]

# Fit the scaler on the training split only, then reuse the learned min/max
# for the validation and test splits. Re-fitting on each split would scale
# every split differently and leak held-out statistics into preprocessing.
scaler = MinMaxScaler()
train[numerical_features] = scaler.fit_transform(train[numerical_features])
val[numerical_features] = scaler.transform(val[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])

# Drop rows with missing values in any column (done after scaling, as before).
train.dropna(inplace=True)
test.dropna(inplace=True)
val.dropna(inplace=True)

# The validation split is written out as 'dev.data'.
train.to_csv('train.data')
test.to_csv('test.data')
val.to_csv('dev.data')