From a9e10b6186559f03d3522374a784ee2555f6bac1 Mon Sep 17 00:00:00 2001
From: mikaleta <thekalkam@gmail.com>
Date: Thu, 20 Apr 2023 20:03:42 +0200
Subject: [PATCH] Change data path for Jenkins, rename main.py to create-dataset.py, add Dockerfile

---
 Dockerfile                   |   4 +
 main.py => create-dataset.py | 150 +++++++++++++++++------------------
 2 files changed, 79 insertions(+), 75 deletions(-)
 create mode 100644 Dockerfile
 rename main.py => create-dataset.py (95%)

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..1990df4
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,4 @@
+FROM python:latest
+RUN apt-get update && apt-get install -y
+# create-dataset.py imports kaggle, numpy, pandas and sklearn (scikit-learn).
+RUN pip install --no-cache-dir kaggle numpy pandas scikit-learn
\ No newline at end of file
diff --git a/main.py b/create-dataset.py
similarity index 95%
rename from main.py
rename to create-dataset.py
index 9761aa9..473f601 100644
--- a/main.py
+++ b/create-dataset.py
@@ -1,76 +1,76 @@
-import os
-
-from kaggle.api.kaggle_api_extended import KaggleApi
-import zipfile
-from sklearn.model_selection import train_test_split
-import pandas as pd
-import numpy as np
-from sklearn.preprocessing import MinMaxScaler
-pd.set_option('display.max_columns', 100)
-
-
-DATA_DIRECTORY = './data'
-
-CSV_NAME = DATA_DIRECTORY + '/openpowerlifting.csv'
-def download_data_from_kaggle():
-    api = KaggleApi()
-    api.authenticate()
-    api.dataset_download_files('dansbecker/powerlifting-database', path=DATA_DIRECTORY)
-def extract_data_from_zip():
-    for file_name in os.listdir(DATA_DIRECTORY):
-        if file_name.endswith(".zip"):
-            file_path = os.path.join(DATA_DIRECTORY, file_name)
-            with zipfile.ZipFile(file_path, "r") as zip_ref:
-                zip_ref.extractall(DATA_DIRECTORY)
-                print(f"The file {file_name} has been unzipped.")
-def process_data(csv_name):
-    # Read in the data and drop the specified columns
-    data = pd.read_csv(csv_name)
-    data.drop(columns=["Squat4Kg", "Bench4Kg", "Deadlift4Kg"], inplace=True)
-    data.dropna(inplace=True)
-
-    # Remove negative values
-    numeric_cols = data.select_dtypes(include=np.number).columns
-    data[numeric_cols] = data[numeric_cols].apply(lambda x: x.clip(lower=0)).dropna()
-
-    # Split the data into train, dev, and test sets if not already done
-    if "train" not in data.columns or "dev" not in data.columns or "test" not in data.columns:
-        data_train, data_devtest = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Division"])
-        data_dev, data_test = train_test_split(data_devtest, test_size=0.5, random_state=42, stratify=data_devtest["Division"])
-        data_train["Set"] = "train"
-        data_dev["Set"] = "dev"
-        data_test["Set"] = "test"
-        data = pd.concat([data_train, data_dev, data_test], ignore_index=True)
-
-    # Collect and print statistics for the data and its subsets
-    print("Data Set Statistics:")
-    print("Size: {}".format(len(data)))
-    print("Avg values:")
-    print(data.mean())
-    print("Min values:")
-    print(data.min())
-    print("Max values:")
-    print(data.max())
-    print("Standard deviations:")
-    print(data.std())
-    print("Median values:")
-    print(data.median())
-
-    # Compute the frequency distribution of examples for individual classes
-    print("\nFrequency distribution of examples for individual classes:")
-    print(data["Class"].value_counts())
-
-    # Normalize the data to the range of 0.0 - 1.0
-    scaler = MinMaxScaler()
-    data.iloc[:, :-2] = scaler.fit_transform(data.iloc[:, :-2])
-
-    # Clear the collection of artifacts (e.g. blank lines, examples with invalid values)
-    data.dropna(inplace=True)
-
-    # Clear the remaining columns from negative and empty values
-    data[data.columns[:-2]] = data[data.columns[:-2]].apply(lambda x: x.clip(lower=0))
-
-    return data
-# download_data_from_kaggle()
-# extract_data_from_zip()
+import os
+
+from kaggle.api.kaggle_api_extended import KaggleApi
+import zipfile
+from sklearn.model_selection import train_test_split
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+pd.set_option('display.max_columns', 100)
+
+
+DATA_DIRECTORY = './ium_z434686/'
+# os.path.join keeps a single separator after DATA_DIRECTORY's trailing slash.
+CSV_NAME = os.path.join(DATA_DIRECTORY, 'openpowerlifting.csv')
+def download_data_from_kaggle():
+    """Authenticate with the Kaggle API and download the dataset files into DATA_DIRECTORY."""
+    (client := KaggleApi()).authenticate()
+    client.dataset_download_files('dansbecker/powerlifting-database', path=DATA_DIRECTORY)
+def extract_data_from_zip():
+    """Unpack every ``.zip`` file listed in DATA_DIRECTORY into that same directory."""
+    archives = [name for name in os.listdir(DATA_DIRECTORY) if name.endswith(".zip")]
+    for name in archives:
+        with zipfile.ZipFile(os.path.join(DATA_DIRECTORY, name), "r") as archive:
+            archive.extractall(DATA_DIRECTORY)
+            print(f"The file {name} has been unzipped.")
+def process_data(csv_name):
+    """Clean, split, summarise and min-max scale the CSV at *csv_name*; return the DataFrame.
+
+    Rows are tagged train/dev/test (80/10/10, stratified on "Division") in a "Set" column.
+    """
+    data = pd.read_csv(csv_name)
+    data.drop(columns=["Squat4Kg", "Bench4Kg", "Deadlift4Kg"], inplace=True)
+    data.dropna(inplace=True)
+
+    # Remove negative values
+    numeric_cols = data.select_dtypes(include=np.number).columns
+    data[numeric_cols] = data[numeric_cols].clip(lower=0)
+
+    # Split into train, dev, and test sets unless the rows already carry a "Set" label;
+    # "Set" is the column written below, so its presence marks an earlier split.
+    if "Set" not in data.columns:
+        data_train, data_devtest = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Division"])
+        data_dev, data_test = train_test_split(data_devtest, test_size=0.5, random_state=42, stratify=data_devtest["Division"])
+        # assign() labels copies of the splits, which avoids SettingWithCopyWarning.
+        parts = [data_train.assign(Set="train"), data_dev.assign(Set="dev"), data_test.assign(Set="test")]
+        data = pd.concat(parts, ignore_index=True)
+
+    # Statistics cover numeric columns only: pandas >= 2.0 raises TypeError when
+    # mean/std/median are asked to reduce text columns such as "Set".
+    numeric = data.select_dtypes(include=np.number)
+    print("Data Set Statistics:")
+    print("Size: {}".format(len(data)))
+    summaries = [("Avg values:", numeric.mean), ("Min values:", numeric.min), ("Max values:", numeric.max),
+                 ("Standard deviations:", numeric.std), ("Median values:", numeric.median)]
+    for title, compute in summaries:
+        print(title)
+        print(compute())
+
+    # Class frequencies; fall back to the stratification column when there is no "Class" column.
+    label_col = "Class" if "Class" in data.columns else "Division"
+    print("\nFrequency distribution of examples for individual classes:")
+    print(data[label_col].value_counts())
+
+    # Normalize to the range 0.0 - 1.0. The last two columns are left out; text columns
+    # are skipped too, because MinMaxScaler cannot convert them to floats.
+    scale_cols = data.iloc[:, :-2].select_dtypes(include=np.number).columns
+    if len(scale_cols) > 0:
+        # Scale, then clear any remaining negative values.
+        data[scale_cols] = np.clip(MinMaxScaler().fit_transform(data[scale_cols]), 0, None)
+
+    # Clear the collection of artifacts (e.g. blank lines, examples with invalid values)
+    data.dropna(inplace=True)
+    return data
+# download_data_from_kaggle()
+# extract_data_from_zip()
 process_data(CSV_NAME)
\ No newline at end of file