import os

from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Let pandas display up to 100 columns when a DataFrame/Series is printed.
pd.set_option('display.max_columns', 100)


# Directory the Kaggle download is written to and archives are extracted into.
DATA_DIRECTORY = './data'

# Path of the CSV file handed to process_data() at the bottom of this file.
CSV_NAME = DATA_DIRECTORY + '/openpowerlifting.csv'
def download_data_from_kaggle():
    """Download the 'dansbecker/powerlifting-database' dataset files.

    Creates a Kaggle API client, authenticates it, and asks it to download
    the dataset files into ``DATA_DIRECTORY``. Returns nothing.
    """
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    kaggle_client.dataset_download_files(
        'dansbecker/powerlifting-database',
        path=DATA_DIRECTORY,
    )
def extract_data_from_zip():
    """Unpack every ``.zip`` archive located directly in ``DATA_DIRECTORY``.

    Each archive is extracted into ``DATA_DIRECTORY`` itself and a
    confirmation line is printed per archive. Returns nothing.
    """
    for entry_name in os.listdir(DATA_DIRECTORY):
        # Skip anything that is not a zip archive.
        if not entry_name.endswith(".zip"):
            continue

        archive_path = os.path.join(DATA_DIRECTORY, entry_name)
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extractall(DATA_DIRECTORY)
            print(f"The file {entry_name} has been unzipped.")
def process_data(csv_name):
    """Load, clean, split, summarize and normalize the powerlifting data.

    The CSV is read, the ``Squat4Kg``/``Bench4Kg``/``Deadlift4Kg`` columns and
    every row with a missing value are dropped, and negative numeric values
    are clipped to 0. Unless the data already carries a ``Set`` column, the
    rows are split 80/10/10 into train/dev/test (stratified by ``Division``)
    and labelled in a new ``Set`` column. Summary statistics are printed, and
    the numeric columns among all but the last two columns are min-max scaled
    to the range 0.0 - 1.0.

    Args:
        csv_name: Path of the CSV file to read.

    Returns:
        The processed ``pandas.DataFrame``.

    Raises:
        KeyError: If a column this function relies on (the three dropped
            columns, ``Division`` when splitting, or ``Class``) is missing.
        ValueError: Propagated from ``train_test_split`` when a stratified
            split cannot be produced for the given ``Division`` values.
    """
    # Read in the data and drop the specified columns
    data = pd.read_csv(csv_name)
    data = data.drop(columns=["Squat4Kg", "Bench4Kg", "Deadlift4Kg"])
    data = data.dropna()

    # Remove negative values (numeric columns only; strings cannot be clipped)
    numeric_cols = data.select_dtypes(include=np.number).columns
    if len(numeric_cols) > 0:
        data[numeric_cols] = data[numeric_cols].clip(lower=0)

    # Split the data into train, dev, and test sets if not already done.
    # The split is recorded in the "Set" column, so that is the marker to test.
    if "Set" not in data.columns:
        data_train, data_devtest = train_test_split(
            data, test_size=0.2, random_state=42, stratify=data["Division"]
        )
        data_dev, data_test = train_test_split(
            data_devtest,
            test_size=0.5,
            random_state=42,
            stratify=data_devtest["Division"],
        )
        # assign() builds new frames, avoiding writes into the split results
        # (which pandas may flag with SettingWithCopyWarning).
        data = pd.concat(
            [
                data_train.assign(Set="train"),
                data_dev.assign(Set="dev"),
                data_test.assign(Set="test"),
            ],
            ignore_index=True,
        )

    # Collect and print statistics for the data
    print("Data Set Statistics:")
    print(f"Size: {len(data)}")
    print("Avg values:")
    # numeric_only=True: mean/std/median are undefined for string columns and
    # raise TypeError on recent pandas versions when they are included.
    print(data.mean(numeric_only=True))
    print("Min values:")
    print(data.min())
    print("Max values:")
    print(data.max())
    print("Standard deviations:")
    print(data.std(numeric_only=True))
    print("Median values:")
    print(data.median(numeric_only=True))

    # Compute the frequency distribution of examples for individual classes
    # NOTE(review): this requires a "Class" column and raises KeyError when
    # the CSV has none - confirm which column is meant to hold the classes.
    print("\nFrequency distribution of examples for individual classes:")
    print(data["Class"].value_counts())

    # Normalize the data to the range of 0.0 - 1.0.
    # The original positional window (all but the last two columns) is kept,
    # but restricted to numeric columns because MinMaxScaler cannot process
    # strings. Presumably the last two columns are meant to stay unscaled
    # (e.g. a target and the "Set" marker) - TODO confirm.
    numeric_names = set(data.select_dtypes(include=np.number).columns)
    scale_cols = [col for col in data.columns[:-2] if col in numeric_names]
    if scale_cols:
        scaler = MinMaxScaler()
        data[scale_cols] = scaler.fit_transform(data[scale_cols])

    # Clear the collection of artifacts (e.g. blank lines, examples with invalid values)
    data = data.dropna()

    # Clear the scaled columns from negative values
    if scale_cols:
        data[scale_cols] = data[scale_cols].clip(lower=0)

    return data
# Setup steps, currently disabled (commented out): fetch the dataset from
# Kaggle and unpack the downloaded archive(s) into DATA_DIRECTORY.
# download_data_from_kaggle()
# extract_data_from_zip()
# NOTE: there is no `if __name__ == "__main__":` guard, so this call runs
# whenever the module is imported as well as when it is executed as a script.
process_data(CSV_NAME)