2024-03-27 12:21:52 +01:00
|
|
|
# Necessary imports
|
|
|
|
import pandas as pd
|
2024-04-03 19:57:37 +02:00
|
|
|
import kaggle
|
2024-03-27 12:21:52 +01:00
|
|
|
import sys
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
|
2024-04-03 19:57:37 +02:00
|
|
|
# Fetch the raw dataset from Kaggle.
# The API client is authenticated first, then the dataset archive is
# downloaded into ./datasets and unpacked there (unzip=True).
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'uciml/breast-cancer-wisconsin-data',
    path='./datasets',
    unzip=True,
)
|
|
|
|
|
2024-03-27 12:21:52 +01:00
|
|
|
# Read the downloaded CSV with the `id` column as the row index and discard
# the 'Unnamed: 32' column in the same step.
# NOTE(review): 'Unnamed: 32' is presumably an empty artifact column of the
# CSV export - confirm against the raw file.
df = (
    pd.read_csv('./datasets/data.csv', index_col='id')
    .drop(columns=['Unnamed: 32'])
)
|
|
|
|
|
|
|
|
# Optional row limit: when a first command-line argument is present it is
# parsed as an integer CUTOFF and only the rows df.iloc[:CUTOFF] are kept.
# int() raises ValueError if the argument is not an integer literal.
# A negative CUTOFF follows normal slice semantics (drops rows from the end).
if len(sys.argv) > 1:
    CUTOFF = int(sys.argv[1])
    # Take an explicit copy: later steps assign columns on `df`, and doing
    # that on a bare slice can raise pandas' SettingWithCopyWarning.
    df = df.iloc[:CUTOFF].copy()
|
2024-03-27 12:21:52 +01:00
|
|
|
|
|
|
|
# Report how many missing values each column contains.
print(df.isna().sum())

# Preview the first five rows of the dataset.
print(df.head(n=5))
|
|
|
|
|
2024-04-03 19:57:37 +02:00
|
|
|
# Encode the target column numerically: 'M' -> 1, 'B' -> 0
# (presumably malignant / benign - confirm against the dataset description).
# Series.map turns any value that is not a key of the mapping into NaN.
diagnosis_codes = {'M': 1, 'B': 0}
df['diagnosis'] = df['diagnosis'].map(diagnosis_codes)
|
|
|
|
|
2024-03-27 12:21:52 +01:00
|
|
|
# Split the dataset into training (80%), validation (10%) and test (10%)
# sets, then min-max normalize the feature columns.
#
# The split is done BEFORE scaling and the scaler is fitted on the training
# rows only. Fitting it on the whole dataset would let the min/max of the
# validation and test rows leak into the preprocessing of the training data.
# Consequence: validation/test (and full-dataset) values may fall slightly
# outside [0, 1] when they exceed the range seen in training.
df_train, df_val_test = train_test_split(df, test_size=0.2, random_state=1234)
df_val, df_test = train_test_split(df_val_test, test_size=0.5, random_state=1234)

# Work on independent copies so the column assignments below never write
# into (or warn about) a slice of `df`.
df_train = df_train.copy()
df_val = df_val.copy()
df_test = df_test.copy()

# Every column except the first one is scaled; the first column is left
# as-is (assumes it is the `diagnosis` target - verify the column order).
feature_columns = df.columns[1:]

scaler = MinMaxScaler()
df_train[feature_columns] = scaler.fit_transform(df_train[feature_columns])
df_val[feature_columns] = scaler.transform(df_val[feature_columns])
df_test[feature_columns] = scaler.transform(df_test[feature_columns])

# Keep the full dataset on the same scale as the splits by reusing the
# scaler fitted on the training data.
df[feature_columns] = scaler.transform(df[feature_columns])
|
|
|
|
|
|
|
|
# Print the number of rows and columns of the full dataset and of each split.
all_rows, all_cols = df.shape
print(f"Cały zbiór: {all_rows} wierszy, {all_cols} kolumn")

train_rows, train_cols = df_train.shape
print(f"Zbiór treningowy: {train_rows} wierszy, {train_cols} kolumn")

val_rows, val_cols = df_val.shape
print(f"Zbiór walidacyjny: {val_rows} wierszy, {val_cols} kolumn")

test_rows, test_cols = df_test.shape
print(f"Zbiór testowy: {test_rows} wierszy, {test_cols} kolumn")
|
|
|
|
|
|
|
|
# Print descriptive statistics (DataFrame.describe) for the full dataset and
# for each split, each preceded by its header line.
for stats_header, stats_frame in (
    ("Statystyki całego zbioru:", df),
    ("Statystyki zbioru treningowego:", df_train),
    ("Statystyki zbioru walidacyjnego:", df_val),
    ("Statystyki zbioru testowego:", df_test),
):
    print(stats_header)
    print(stats_frame.describe())
|
|
|
|
|
|
|
|
# Print how many rows fall into each `diagnosis` class, for the full dataset
# and for each split (dict insertion order gives the print order).
target_reports = {
    "Rozkład zmiennej docelowej w całym zbiorze:": df,
    "Rozkład zmiennej docelowej w zbiorze treningowym:": df_train,
    "Rozkład zmiennej docelowej w zbiorze walidacyjnym:": df_val,
    "Rozkład zmiennej docelowej w zbiorze testowym:": df_test,
}
for target_header, target_frame in target_reports.items():
    print(target_header)
    print(target_frame['diagnosis'].value_counts())
|
|
|
|
|
|
|
|
# Write the processed full dataset and the three splits to CSV files.
# Note: './datasets/data.csv' is the same path the raw data was read from,
# so the raw file is overwritten with the processed version.
output_files = {
    './datasets/data.csv': df,
    './datasets/train.csv': df_train,
    './datasets/val.csv': df_val,
    './datasets/test.csv': df_test,
}
for output_path, output_frame in output_files.items():
    output_frame.to_csv(output_path)
|