diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..1990df4
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,3 @@
+FROM python:latest
+# create-dataset.py imports numpy, pandas and scikit-learn; kaggle is needed for the download step.
+RUN pip install --no-cache-dir kaggle numpy pandas scikit-learn
diff --git a/main.py b/create-dataset.py
similarity index 95%
rename from main.py
rename to create-dataset.py
index 9761aa9..473f601 100644
--- a/main.py
+++ b/create-dataset.py
@@ -1,76 +1,98 @@
-import os
-
-from kaggle.api.kaggle_api_extended import KaggleApi
-import zipfile
-from sklearn.model_selection import train_test_split
-import pandas as pd
-import numpy as np
-from sklearn.preprocessing import MinMaxScaler
-pd.set_option('display.max_columns', 100)
-
-
-DATA_DIRECTORY = './data'
-
-CSV_NAME = DATA_DIRECTORY + '/openpowerlifting.csv'
-def download_data_from_kaggle():
-    api = KaggleApi()
-    api.authenticate()
-    api.dataset_download_files('dansbecker/powerlifting-database', path=DATA_DIRECTORY)
-def extract_data_from_zip():
-    for file_name in os.listdir(DATA_DIRECTORY):
-        if file_name.endswith(".zip"):
-            file_path = os.path.join(DATA_DIRECTORY, file_name)
-            with zipfile.ZipFile(file_path, "r") as zip_ref:
-                zip_ref.extractall(DATA_DIRECTORY)
-            print(f"The file {file_name} has been unzipped.")
-def process_data(csv_name):
-    # Read in the data and drop the specified columns
-    data = pd.read_csv(csv_name)
-    data.drop(columns=["Squat4Kg", "Bench4Kg", "Deadlift4Kg"], inplace=True)
-    data.dropna(inplace=True)
-
-    # Remove negative values
-    numeric_cols = data.select_dtypes(include=np.number).columns
-    data[numeric_cols] = data[numeric_cols].apply(lambda x: x.clip(lower=0)).dropna()
-
-    # Split the data into train, dev, and test sets if not already done
-    if "train" not in data.columns or "dev" not in data.columns or "test" not in data.columns:
-        data_train, data_devtest = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Division"])
-        data_dev, data_test = train_test_split(data_devtest, test_size=0.5, random_state=42, stratify=data_devtest["Division"])
-        data_train["Set"] = "train"
-        data_dev["Set"] = "dev"
-        data_test["Set"] = "test"
-        data = pd.concat([data_train, data_dev, data_test], ignore_index=True)
-
-    # Collect and print statistics for the data and its subsets
-    print("Data Set Statistics:")
-    print("Size: {}".format(len(data)))
-    print("Avg values:")
-    print(data.mean())
-    print("Min values:")
-    print(data.min())
-    print("Max values:")
-    print(data.max())
-    print("Standard deviations:")
-    print(data.std())
-    print("Median values:")
-    print(data.median())
-
-    # Compute the frequency distribution of examples for individual classes
-    print("\nFrequency distribution of examples for individual classes:")
-    print(data["Class"].value_counts())
-
-    # Normalize the data to the range of 0.0 - 1.0
-    scaler = MinMaxScaler()
-    data.iloc[:, :-2] = scaler.fit_transform(data.iloc[:, :-2])
-
-    # Clear the collection of artifacts (e.g. blank lines, examples with invalid values)
-    data.dropna(inplace=True)
-
-    # Clear the remaining columns from negative and empty values
-    data[data.columns[:-2]] = data[data.columns[:-2]].apply(lambda x: x.clip(lower=0))
-
-    return data
-# download_data_from_kaggle()
-# extract_data_from_zip()
-process_data(CSV_NAME)
\ No newline at end of file
+"""Download, unpack and preprocess the OpenPowerlifting dataset from Kaggle."""
+import os
+import zipfile
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+
+pd.set_option('display.max_columns', 100)
+
+DATA_DIRECTORY = './ium_z434686/'
+CSV_NAME = os.path.join(DATA_DIRECTORY, 'openpowerlifting.csv')
+# The split is stratified on this column, so it is also reported as the class.
+CLASS_COLUMN = 'Division'
+
+
+def download_data_from_kaggle():
+    """Download the powerlifting dataset archive into DATA_DIRECTORY."""
+    # Imported here so that process_data() can run without the kaggle package.
+    from kaggle.api.kaggle_api_extended import KaggleApi
+
+    api = KaggleApi()
+    api.authenticate()
+    api.dataset_download_files('dansbecker/powerlifting-database', path=DATA_DIRECTORY)
+
+
+def extract_data_from_zip():
+    """Unpack every .zip archive found in DATA_DIRECTORY into that directory."""
+    for file_name in os.listdir(DATA_DIRECTORY):
+        if file_name.endswith(".zip"):
+            file_path = os.path.join(DATA_DIRECTORY, file_name)
+            with zipfile.ZipFile(file_path, "r") as zip_ref:
+                zip_ref.extractall(DATA_DIRECTORY)
+            print(f"The file {file_name} has been unzipped.")
+
+
+def process_data(csv_name):
+    """Clean, split, summarise and normalise the dataset stored in csv_name.
+
+    Prints summary statistics along the way and returns the processed
+    DataFrame, which carries a "Set" column marking the split of each row.
+    """
+    # Read in the data and drop the specified columns
+    data = pd.read_csv(csv_name)
+    data.drop(columns=["Squat4Kg", "Bench4Kg", "Deadlift4Kg"], inplace=True)
+    data.dropna(inplace=True)
+
+    # Clamp negative numeric values to 0 (clip changes values, it drops no rows)
+    numeric_cols = data.select_dtypes(include=np.number).columns
+    data[numeric_cols] = data[numeric_cols].clip(lower=0)
+
+    # Split the data into train (80%), dev (10%) and test (10%) if not already done
+    if "Set" not in data.columns:
+        data_train, data_devtest = train_test_split(
+            data, test_size=0.2, random_state=42, stratify=data[CLASS_COLUMN])
+        data_dev, data_test = train_test_split(
+            data_devtest, test_size=0.5, random_state=42, stratify=data_devtest[CLASS_COLUMN])
+        # assign() returns new frames, so the split results are never mutated
+        data = pd.concat(
+            [
+                data_train.assign(Set="train"),
+                data_dev.assign(Set="dev"),
+                data_test.assign(Set="test"),
+            ],
+            ignore_index=True,
+        )
+
+    # Collect and print statistics for the numeric columns of the data set
+    print("Data Set Statistics:")
+    print("Size: {}".format(len(data)))
+    print("Avg values:")
+    print(data.mean(numeric_only=True))
+    print("Min values:")
+    print(data.min(numeric_only=True))
+    print("Max values:")
+    print(data.max(numeric_only=True))
+    print("Standard deviations:")
+    print(data.std(numeric_only=True))
+    print("Median values:")
+    print(data.median(numeric_only=True))
+
+    # Compute the frequency distribution of examples for individual classes
+    print("\nFrequency distribution of examples for individual classes:")
+    print(data[CLASS_COLUMN].value_counts())
+
+    # Normalize to 0.0 - 1.0: numeric columns only, last two columns left out as before
+    scale_cols = [col for col in data.columns[:-2] if col in numeric_cols]
+    if scale_cols:
+        data[scale_cols] = MinMaxScaler().fit_transform(data[scale_cols])
+
+    return data
+
+
+if __name__ == "__main__":
+    # download_data_from_kaggle()
+    # extract_data_from_zip()
+    process_data(CSV_NAME)