"""Download, unpack and preprocess the ``openpowerlifting.csv`` dataset."""

import os
import zipfile

import numpy as np
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Let pandas print up to 100 columns so the statistics printed below are not elided.
pd.set_option('display.max_columns', 100)

DATA_DIRECTORY = './data'
CSV_NAME = DATA_DIRECTORY + '/openpowerlifting.csv'


def download_data_from_kaggle() -> None:
    """Download the ``dansbecker/powerlifting-database`` dataset into ``DATA_DIRECTORY``.

    Calls ``KaggleApi.authenticate()`` before requesting the files.
    """
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files('dansbecker/powerlifting-database', path=DATA_DIRECTORY)


def extract_data_from_zip() -> None:
    """Extract every ``.zip`` archive located directly in ``DATA_DIRECTORY``.

    Archives are unpacked into ``DATA_DIRECTORY`` itself and a confirmation
    line is printed for each archive processed.
    """
    for file_name in os.listdir(DATA_DIRECTORY):
        if not file_name.endswith(".zip"):
            continue
        file_path = os.path.join(DATA_DIRECTORY, file_name)
        with zipfile.ZipFile(file_path, "r") as zip_ref:
            zip_ref.extractall(DATA_DIRECTORY)
        print(f"The file {file_name} has been unzipped.")


def process_data(csv_name: str) -> pd.DataFrame:
    """Load, clean, split, summarise and normalise the dataset stored in *csv_name*.

    Steps, in order:

    1. Read the CSV, drop the ``Squat4Kg``/``Bench4Kg``/``Deadlift4Kg`` columns
       and every row containing a missing value.
    2. Clip negative numeric values to 0.
    3. Unless a ``Set`` column already exists, split the rows 80/10/10 into
       train/dev/test (stratified on ``Division``) and record the subset name
       in a new ``Set`` column.
    4. Print summary statistics and the class frequency distribution.
    5. Min-max scale the numeric columns, excluding the last two columns of
       the frame, to the range 0.0 - 1.0.

    Args:
        csv_name: Path of the CSV file to read.

    Returns:
        The processed DataFrame, including the ``Set`` column.

    Raises:
        KeyError: If one of the columns to drop, or ``Division``, is missing.
    """
    # Read in the data and drop the specified columns.
    data = pd.read_csv(csv_name)
    data.drop(columns=["Squat4Kg", "Bench4Kg", "Deadlift4Kg"], inplace=True)
    data.dropna(inplace=True)

    # Replace negative values with 0 in every numeric column.
    numeric_cols = data.select_dtypes(include=np.number).columns
    data[numeric_cols] = data[numeric_cols].clip(lower=0)

    # Split into train/dev/test unless the subset marker column is already there.
    if "Set" not in data.columns:
        data_train, data_devtest = train_test_split(
            data, test_size=0.2, random_state=42, stratify=data["Division"]
        )
        data_dev, data_test = train_test_split(
            data_devtest, test_size=0.5, random_state=42, stratify=data_devtest["Division"]
        )
        # ``assign`` returns new frames, so nothing is written into the
        # objects handed back by ``train_test_split``.
        data = pd.concat(
            [
                data_train.assign(Set="train"),
                data_dev.assign(Set="dev"),
                data_test.assign(Set="test"),
            ],
            ignore_index=True,
        )

    # Collect and print statistics for the data.
    # mean/std/median are undefined for text columns, so they are limited to
    # numeric ones; min/max keep covering every column.
    print("Data Set Statistics:")
    print(f"Size: {len(data)}")
    print("Avg values:")
    print(data.mean(numeric_only=True))
    print("Min values:")
    print(data.min())
    print("Max values:")
    print(data.max())
    print("Standard deviations:")
    print(data.std(numeric_only=True))
    print("Median values:")
    print(data.median(numeric_only=True))

    # Compute the frequency distribution of examples for individual classes.
    # NOTE(review): falls back to the stratification column when there is no
    # "Class" column -- confirm "Division" is the intended class label.
    label_column = "Class" if "Class" in data.columns else "Division"
    print("\nFrequency distribution of examples for individual classes:")
    print(data[label_column].value_counts())

    # Normalize the data to the range of 0.0 - 1.0. The last two columns are
    # left out, and only numeric columns can be scaled.
    # NOTE(review): the scaler is fitted on all subsets together, not on the
    # train subset alone -- confirm this is intended.
    feature_cols = list(data.iloc[:, :-2].select_dtypes(include=np.number).columns)
    if feature_cols:
        scaled = MinMaxScaler().fit_transform(data[feature_cols])
        data[feature_cols] = pd.DataFrame(scaled, index=data.index, columns=feature_cols)

    # Clear the collection of artifacts (e.g. blank lines, examples with invalid values).
    data.dropna(inplace=True)

    # Clear the scaled columns from any remaining negative values.
    if feature_cols:
        data[feature_cols] = data[feature_cols].clip(lower=0)

    return data


if __name__ == "__main__":
    # The download and extraction steps are disabled in the original script;
    # presumably they are enabled once to fetch the data -- confirm before use.
    # download_data_from_kaggle()
    # extract_data_from_zip()
    process_data(CSV_NAME)