ium_464863/download_dataset.py

# Necessary imports
import pandas as pd
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset, using the `id` column as the row index
df = pd.read_csv('./datasets/data.csv', index_col='id')

# Drop the columns that are not necessary.
# errors='ignore' keeps the script re-runnable: the processed frame is written
# back to ./datasets/data.csv at the end of this script without this column,
# so a second run would otherwise fail here with a KeyError.
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

# Check for CUTOFF parameter: when a first command-line argument is given,
# keep only the first CUTOFF rows of the dataset
if len(sys.argv) > 1:
    CUTOFF = int(sys.argv[1])
    df = df.iloc[:CUTOFF]

# Check for missing values (prints the number of nulls per column)
print(df.isnull().sum())

# Print the first 5 rows of the dataset
print(df.head())

# Convert the diagnosis column to binary (M -> 1.0, B -> 0.0).
# Skip the mapping when the column is already numeric: this script overwrites
# ./datasets/data.csv with the encoded values, so on a re-run mapping the
# letters again would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map({'M': 1.0, 'B': 0.0})

# Normalize the dataset: min-max scale every column except the target.
# Columns are selected by name so the target is never rescaled, regardless of
# where `diagnosis` sits in the frame.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/validation/test split further down, so validation/test statistics
# leak into the training features - consider fitting on the training split
# only.
feature_columns = df.columns.drop('diagnosis')
scaler = MinMaxScaler()
df[feature_columns] = scaler.fit_transform(df[feature_columns])

# Split the dataset into training, validation and testing sets:
# 20% of the rows are held out first, then that part is halved into the
# validation and testing sets (both splits use the same seed).
df_train, df_val_test = train_test_split(
    df, test_size=0.2, random_state=1234
)
df_val, df_test = train_test_split(
    df_val_test, test_size=0.5, random_state=1234
)

# Datasets information: row and column counts of every set
for set_label, subset in (
    ("Cały zbiór", df),
    ("Zbiór treningowy", df_train),
    ("Zbiór walidacyjny", df_val),
    ("Zbiór testowy", df_test),
):
    row_count, column_count = subset.shape
    print(f"{set_label}: {row_count} wierszy, {column_count} kolumn")

# Report table: each entry pairs a dataset with the header printed above its
# descriptive statistics and the header printed above its target distribution.
report_sections = (
    (
        df,
        "Statystyki całego zbioru:",
        "Rozkład zmiennej docelowej w całym zbiorze:",
    ),
    (
        df_train,
        "Statystyki zbioru treningowego:",
        "Rozkład zmiennej docelowej w zbiorze treningowym:",
    ),
    (
        df_val,
        "Statystyki zbioru walidacyjnego:",
        "Rozkład zmiennej docelowej w zbiorze walidacyjnym:",
    ),
    (
        df_test,
        "Statystyki zbioru testowego:",
        "Rozkład zmiennej docelowej w zbiorze testowym:",
    ),
)

# Datasets statistics
for subset, stats_header, _ in report_sections:
    print(stats_header)
    print(subset.describe())

# Distribution of the target variable
for subset, _, target_header in report_sections:
    print(target_header)
    print(subset['diagnosis'].value_counts())

# Save the datasets: the processed full set is written over the input file,
# and each split goes to its own CSV file
for csv_path, subset in (
    ('./datasets/data.csv', df),
    ('./datasets/train.csv', df_train),
    ('./datasets/val.csv', df_val),
    ('./datasets/test.csv', df_test),
):
    subset.to_csv(csv_path)
IUM_04 - add python scripts for downloading dataset and extract statistics and other information about datasets 2024-03-27 12:21:52 +01:00			`# Necessary imports`
			`import pandas as pd`
			`import sys`
			`from sklearn.model_selection import train_test_split`
			`from sklearn.preprocessing import MinMaxScaler`

			`# Load the dataset`
			`df = pd.read_csv('./datasets/data.csv', index_col='id')`

			`# Drop the columns that are not necessary`
			`df = df.drop(columns=['Unnamed: 32'])`

			`# Check for CUTOFF parameter`
			`if len(sys.argv) > 1:`
			`CUTOFF = int(sys.argv[1])`
IUM_04 - add Dockerfile, fix download dataset script 2024-03-29 14:48:13 +01:00			`df = df.iloc[:CUTOFF]`
IUM_04 - add python scripts for downloading dataset and extract statistics and other information about datasets 2024-03-27 12:21:52 +01:00
			`# Check for missing values`
			`print(df.isnull().sum())`

			`# Print the first 5 rows of the dataset`
			`print(df.head())`

IUM_04 - update Jenkinsfile, update Dockerfile, add requirements.txt file 2024-04-03 19:57:37 +02:00			`# Convert the diagnosis column to binary`
IUM_05 - add train and prediction scripts, update Docker env, update Jenkinsfile 2024-04-04 09:06:39 +02:00			`df['diagnosis'] = df['diagnosis'].map({'M': 1.0, 'B': 0.0})`
IUM_04 - update Jenkinsfile, update Dockerfile, add requirements.txt file 2024-04-03 19:57:37 +02:00
IUM_04 - add python scripts for downloading dataset and extract statistics and other information about datasets 2024-03-27 12:21:52 +01:00			`# Normalize the dataset`
			`scaler = MinMaxScaler()`
			`df[df.columns[1:]] = scaler.fit_transform(df[df.columns[1:]])`

			`# Split the dataset into training, validation and testing sets`
			`df_train, df_val_test = train_test_split(df, test_size=0.2, random_state=1234)`
			`df_val, df_test = train_test_split(df_val_test, test_size=0.5, random_state=1234)`

			`# Datasets information`
			`print(f"Cały zbiór: {df.shape[0]} wierszy, {df.shape[1]} kolumn")`
			`print(f"Zbiór treningowy: {df_train.shape[0]} wierszy, {df_train.shape[1]} kolumn")`
			`print(f"Zbiór walidacyjny: {df_val.shape[0]} wierszy, {df_val.shape[1]} kolumn")`
			`print(f"Zbiór testowy: {df_test.shape[0]} wierszy, {df_test.shape[1]} kolumn")`

			`# Datasets statistics`
			`print(f"Statystyki całego zbioru:")`
			`print(df.describe())`

			`print(f"Statystyki zbioru treningowego:")`
			`print(df_train.describe())`

			`print(f"Statystyki zbioru walidacyjnego:")`
			`print(df_val.describe())`

			`print(f"Statystyki zbioru testowego:")`
			`print(df_test.describe())`

			`# Distribution of the target variable`
			`print(f"Rozkład zmiennej docelowej w całym zbiorze:")`
			`print(df['diagnosis'].value_counts())`

			`print(f"Rozkład zmiennej docelowej w zbiorze treningowym:")`
			`print(df_train['diagnosis'].value_counts())`

			`print(f"Rozkład zmiennej docelowej w zbiorze walidacyjnym:")`
			`print(df_val['diagnosis'].value_counts())`

			`print(f"Rozkład zmiennej docelowej w zbiorze testowym:")`
			`print(df_test['diagnosis'].value_counts())`

			`# Save the datasets`
			`df.to_csv('./datasets/data.csv')`
			`df_train.to_csv('./datasets/train.csv')`
			`df_val.to_csv('./datasets/val.csv')`
			`df_test.to_csv('./datasets/test.csv')`