IUM_04 - add Python scripts for downloading the dataset and extracting statistics and other information about the datasets
This commit is contained in:
parent
b1c9d8c590
commit
a0b0d0e0ca
207
IUM_02.ipynb
207
IUM_02.ipynb
File diff suppressed because one or more lines are too long
73
download_dataset.py
Normal file
73
download_dataset.py
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
# Necessary imports
|
||||||
|
import pandas as pd
|
||||||
|
import sys
|
||||||
|
import kaggle
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
|
|
||||||
|
# Script body: download the Kaggle breast-cancer dataset, clean and normalize
# it, split it into train/validation/test parts, print summary information
# and save every part as CSV under ./datasets.
#
# Usage: python download_dataset.py [CUTOFF]

# Download the dataset from Kaggle and unpack it into ./datasets
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'uciml/breast-cancer-wisconsin-data', path='./datasets', unzip=True
)

# Load the dataset, using the 'id' column as the index
df = pd.read_csv('./datasets/data.csv', index_col='id')

# Drop the column that is not necessary.
# errors='ignore' keeps the script working when the column is absent
# (presumably it is an empty artefact column of the source CSV - confirm).
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

# Optional CUTOFF parameter: first command-line argument, parsed as int
if len(sys.argv) > 1:
    CUTOFF = int(sys.argv[1])
    # NOTE(review): this keeps CUTOFF - 1 rows, not CUTOFF. Presumably it
    # mirrors `head -n CUTOFF` on a CSV that includes a header line - confirm.
    # A CUTOFF of 0 or below turns into a negative slice bound, which trims
    # rows from the end instead of limiting from the start.
    df = df.iloc[:CUTOFF - 1]

# Check for missing values (count of nulls per column)
print(df.isnull().sum())

# Print the first 5 rows of the dataset
print(df.head())

# Normalize every column except the first one with min-max scaling.
# Assumes the first column is the 'diagnosis' target - TODO confirm.
# NOTE(review): the scaler is fitted on the full dataset before the split, so
# validation/test rows influence the scaling; fit on the training part only if
# that leakage matters for the experiment.
scaler = MinMaxScaler()
feature_columns = df.columns[1:]
df[feature_columns] = scaler.fit_transform(df[feature_columns])

# Split the dataset: 80% training, then the remaining 20% is halved into
# validation and testing sets. The fixed random_state makes it reproducible.
df_train, df_val_test = train_test_split(df, test_size=0.2, random_state=1234)
df_val, df_test = train_test_split(df_val_test, test_size=0.5, random_state=1234)

# Each entry: (frame, nominative label, genitive label, locative label).
# The three label forms reproduce the grammatical cases of the Polish messages.
splits = [
    (df, "Cały zbiór", "całego zbioru", "całym zbiorze"),
    (df_train, "Zbiór treningowy", "zbioru treningowego", "zbiorze treningowym"),
    (df_val, "Zbiór walidacyjny", "zbioru walidacyjnego", "zbiorze walidacyjnym"),
    (df_test, "Zbiór testowy", "zbioru testowego", "zbiorze testowym"),
]

# Datasets information (row and column counts)
for frame, nominative, _, _ in splits:
    print(f"{nominative}: {frame.shape[0]} wierszy, {frame.shape[1]} kolumn")

# Datasets statistics
for frame, _, genitive, _ in splits:
    print(f"Statystyki {genitive}:")
    print(frame.describe())

# Distribution of the target variable
for frame, _, _, locative in splits:
    print(f"Rozkład zmiennej docelowej w {locative}:")
    print(frame['diagnosis'].value_counts())

# Save the datasets; data.csv is overwritten with the processed frame
df.to_csv('./datasets/data.csv')
df_train.to_csv('./datasets/train.csv')
df_val.to_csv('./datasets/val.csv')
df_test.to_csv('./datasets/test.csv')
|
36
get_stats.py
Normal file
36
get_stats.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
# Necessary imports
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Script body: load the full dataset and its train/validation/test parts from
# ./datasets and write their sizes, descriptive statistics and target
# distribution to stats.txt.

# Display settings: no column limit, so describe() output is not truncated
pd.set_option('display.max_columns', None)

# Load the datasets, using the 'id' column as the index
df = pd.read_csv('./datasets/data.csv', index_col='id')
df_train = pd.read_csv('./datasets/train.csv', index_col='id')
df_val = pd.read_csv('./datasets/val.csv', index_col='id')
df_test = pd.read_csv('./datasets/test.csv', index_col='id')

# Each entry: (frame, nominative label, genitive label, locative label).
# The three label forms reproduce the grammatical cases of the Polish messages.
splits = [
    (df, "Cały zbiór", "całego zbioru", "całym zbiorze"),
    (df_train, "Zbiór treningowy", "zbioru treningowego", "zbiorze treningowym"),
    (df_val, "Zbiór walidacyjny", "zbioru walidacyjnego", "zbiorze walidacyjnym"),
    (df_test, "Zbiór testowy", "zbioru testowego", "zbiorze testowym"),
]

# Save statistics and target distribution to a stats.txt file.
# The encoding is explicit because the report contains Polish diacritics;
# relying on the locale default can fail or produce non-portable output.
with open('stats.txt', 'w', encoding='utf-8') as f:
    # Row and column counts
    for frame, nominative, _, _ in splits:
        f.write(f"{nominative}: {frame.shape[0]} wierszy, {frame.shape[1]} kolumn\n")

    # Descriptive statistics
    for frame, _, genitive, _ in splits:
        f.write(f"\nStatystyki {genitive}:\n")
        f.write(f"{frame.describe()}\n")

    # Distribution of the target variable
    for frame, _, _, locative in splits:
        f.write(f"\nRozkład zmiennej docelowej w {locative}:\n")
        f.write(f"{frame['diagnosis'].value_counts()}\n")
|
Loading…
Reference in New Issue
Block a user