change path to jenkins

mikaleta 2023-04-20 20:03:42 +02:00
parent cd93aaa75e
commit a9e10b6186
2 changed files with 79 additions and 75 deletions

Dockerfile Normal file

@@ -0,0 +1,4 @@
+FROM python:latest
+RUN apt-get update && apt-get install -y
+RUN pip install pandas
+RUN pip install scikit-learn
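Because the image builds from the unpinned python:latest tag and installs pandas and scikit-learn without version constraints, the environment can drift between builds. A quick import check run inside the container (a minimal sketch, not part of this commit; the file name smoke_test.py is an assumption) can confirm the dependencies before the Jenkins pipeline uses the image:

# smoke_test.py - confirm the image's dependencies import and report their versions
import pandas as pd
import sklearn

print(f"pandas {pd.__version__}")
print(f"scikit-learn {sklearn.__version__}")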


@@ -1,76 +1,76 @@
 import os
 from kaggle.api.kaggle_api_extended import KaggleApi
 import zipfile
 from sklearn.model_selection import train_test_split
 import pandas as pd
 import numpy as np
 from sklearn.preprocessing import MinMaxScaler

 pd.set_option('display.max_columns', 100)
-DATA_DIRECTORY = './data'
+DATA_DIRECTORY = './ium_z434686/'
 CSV_NAME = DATA_DIRECTORY + '/openpowerlifting.csv'

 def download_data_from_kaggle():
     api = KaggleApi()
     api.authenticate()
     api.dataset_download_files('dansbecker/powerlifting-database', path=DATA_DIRECTORY)

 def extract_data_from_zip():
     for file_name in os.listdir(DATA_DIRECTORY):
         if file_name.endswith(".zip"):
             file_path = os.path.join(DATA_DIRECTORY, file_name)
             with zipfile.ZipFile(file_path, "r") as zip_ref:
                 zip_ref.extractall(DATA_DIRECTORY)
             print(f"The file {file_name} has been unzipped.")

 def process_data(csv_name):
     # Read in the data and drop the specified columns
     data = pd.read_csv(csv_name)
     data.drop(columns=["Squat4Kg", "Bench4Kg", "Deadlift4Kg"], inplace=True)
     data.dropna(inplace=True)

     # Remove negative values
     numeric_cols = data.select_dtypes(include=np.number).columns
     data[numeric_cols] = data[numeric_cols].apply(lambda x: x.clip(lower=0)).dropna()

     # Split the data into train, dev, and test sets if not already done
     if "train" not in data.columns or "dev" not in data.columns or "test" not in data.columns:
         data_train, data_devtest = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Division"])
         data_dev, data_test = train_test_split(data_devtest, test_size=0.5, random_state=42, stratify=data_devtest["Division"])
         data_train["Set"] = "train"
         data_dev["Set"] = "dev"
         data_test["Set"] = "test"
         data = pd.concat([data_train, data_dev, data_test], ignore_index=True)

     # Collect and print statistics for the data and its subsets
     print("Data Set Statistics:")
     print("Size: {}".format(len(data)))
     print("Avg values:")
     print(data.mean())
     print("Min values:")
     print(data.min())
     print("Max values:")
     print(data.max())
     print("Standard deviations:")
     print(data.std())
     print("Median values:")
     print(data.median())

     # Compute the frequency distribution of examples for individual classes
     print("\nFrequency distribution of examples for individual classes:")
     print(data["Class"].value_counts())

     # Normalize the data to the range of 0.0 - 1.0
     scaler = MinMaxScaler()
     data.iloc[:, :-2] = scaler.fit_transform(data.iloc[:, :-2])

     # Clear the collection of artifacts (e.g. blank lines, examples with invalid values)
     data.dropna(inplace=True)

     # Clear the remaining columns from negative and empty values
     data[data.columns[:-2]] = data[data.columns[:-2]].apply(lambda x: x.clip(lower=0))
     return data

 # download_data_from_kaggle()
 # extract_data_from_zip()
 process_data(CSV_NAME)
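For reference, the 80/10/10 stratified split and min-max scaling that process_data applies can be exercised in isolation on a toy frame (a minimal sketch under assumed names; the LiftKg column and class labels below are made up, only the split/scale pattern comes from the script above):

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Toy data with a class column to stratify on, mirroring "Division" above.
toy = pd.DataFrame({
    "LiftKg": list(range(100)),
    "Division": ["Open", "Junior"] * 50,
})

# 80% train, then the remaining 20% split evenly into dev and test,
# stratified the same way process_data does it.
train, devtest = train_test_split(toy, test_size=0.2, random_state=42, stratify=toy["Division"])
dev, test = train_test_split(devtest, test_size=0.5, random_state=42, stratify=devtest["Division"])
print(len(train), len(dev), len(test))  # 80 10 10

# Scale the numeric column into the 0.0 - 1.0 range, as MinMaxScaler is used above.
scaler = MinMaxScaler()
toy[["LiftKg"]] = scaler.fit_transform(toy[["LiftKg"]])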