change paths to jenkins env
parent a9e10b6186
commit 83a9fa44dc
@@ -1,76 +1,34 @@
 import os
-from kaggle.api.kaggle_api_extended import KaggleApi
-import zipfile
 from sklearn.model_selection import train_test_split
 import pandas as pd
 import numpy as np
 from sklearn.preprocessing import MinMaxScaler
 
 pd.set_option('display.max_columns', 100)
 
 
-DATA_DIRECTORY = './ium_z434686/'
+DATA_DIRECTORY = './ium_z434686'
 
 CSV_NAME = DATA_DIRECTORY + '/openpowerlifting.csv'
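The path change itself is small: the trailing slash comes off DATA_DIRECTORY because CSV_NAME already supplies its own '/', so the old value expanded to './ium_z434686//openpowerlifting.csv'. A minimal slash-proof alternative, shown purely as an illustration (the commit keeps plain string concatenation), is to build the path with os.path.join:

    import os

    DATA_DIRECTORY = './ium_z434686'
    # os.path.join inserts the separator itself, so the constant may carry
    # a trailing slash or not without producing a doubled '//' in the path.
    CSV_NAME = os.path.join(DATA_DIRECTORY, 'openpowerlifting.csv')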
-def download_data_from_kaggle():
-    api = KaggleApi()
-    api.authenticate()
-    api.dataset_download_files('dansbecker/powerlifting-database', path=DATA_DIRECTORY)
-def extract_data_from_zip():
-    for file_name in os.listdir(DATA_DIRECTORY):
-        if file_name.endswith(".zip"):
-            file_path = os.path.join(DATA_DIRECTORY, file_name)
-            with zipfile.ZipFile(file_path, "r") as zip_ref:
-                zip_ref.extractall(DATA_DIRECTORY)
-                print(f"The file {file_name} has been unzipped.")
 
 def process_data(csv_name):
-    # Read in the data and drop the specified columns
-    data = pd.read_csv(csv_name)
-    data.drop(columns=["Squat4Kg", "Bench4Kg", "Deadlift4Kg"], inplace=True)
-    data.dropna(inplace=True)
+    CUTOFF = int(os.environ['CUTOFF'])
 
-    # Remove negative values
-    numeric_cols = data.select_dtypes(include=np.number).columns
-    data[numeric_cols] = data[numeric_cols].apply(lambda x: x.clip(lower=0)).dropna()
+    powerlifting_data = pd.read_csv(csv_name,
+                                    engine='python',
+                                    encoding='ISO-8859-1',
+                                    sep=',')
+    powerlifting_data.dropna()
+    powerlifting_data.drop(columns=["Squat4Kg", "Bench4Kg", "Deadlift4Kg"], inplace=True)
 
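The CUTOFF line is what the commit message points at: the sample size now comes from an environment variable that the Jenkins job is expected to export. os.environ['CUTOFF'] raises a bare KeyError when the variable is missing, so a local run outside Jenkins dies unhelpfully. A sketch of a more forgiving lookup, where the default of 500 is an assumption for illustration and not a value from the commit:

    import os

    # Fall back to a default when Jenkins does not provide CUTOFF, and fail
    # with a clear message when the value is present but not a number.
    raw_cutoff = os.environ.get('CUTOFF', '500')  # default is illustrative only
    try:
        CUTOFF = int(raw_cutoff)
    except ValueError:
        raise SystemExit(f"CUTOFF must be an integer, got {raw_cutoff!r}")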
-    # Split the data into train, dev, and test sets if not already done
-    if "train" not in data.columns or "dev" not in data.columns or "test" not in data.columns:
-        data_train, data_devtest = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Division"])
-        data_dev, data_test = train_test_split(data_devtest, test_size=0.5, random_state=42, stratify=data_devtest["Division"])
-        data_train["Set"] = "train"
-        data_dev["Set"] = "dev"
-        data_test["Set"] = "test"
-        data = pd.concat([data_train, data_dev, data_test], ignore_index=True)
+    powerlifting_data.sample(CUTOFF)
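Two of the new calls discard their results: dropna() and sample() return new DataFrames rather than modifying powerlifting_data in place, so as committed neither the row cleaning nor the CUTOFF subsample actually takes effect (only the drop(columns=..., inplace=True) does). If the intent is what the names suggest, the results need to be assigned back; a sketch that drops into process_data:

    # dropna() and sample() are not in-place; bind the results back so the
    # cleaning and the CUTOFF subsample are applied before the split.
    powerlifting_data = powerlifting_data.dropna()
    # random_state=1 is an assumption, added so the subsample matches the
    # seed used by the train_test_split calls below.
    powerlifting_data = powerlifting_data.sample(CUTOFF, random_state=1)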
 
-    # Collect and print statistics for the data and its subsets
-    print("Data Set Statistics:")
-    print("Size: {}".format(len(data)))
-    print("Avg values:")
-    print(data.mean())
-    print("Min values:")
-    print(data.min())
-    print("Max values:")
-    print(data.max())
-    print("Standard deviations:")
-    print(data.std())
-    print("Median values:")
-    print(data.median())
+    X, Y = powerlifting_data, powerlifting_data
 
-    # Compute the frequency distribution of examples for individual classes
-    print("\nFrequency distribution of examples for individual classes:")
-    print(data["Class"].value_counts())
+    X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=1)
+    X_dev, X_test, Y_dev, Y_test = train_test_split(X_temp, Y_temp, test_size=0.3, random_state=1)
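Chaining two splits with test_size=0.3 gives roughly 70% train, 21% dev, and 9% test: the first call keeps 0.7 of the rows for training, and the second divides the remaining 0.3 into 0.7 × 0.3 = 0.21 (dev) and 0.3 × 0.3 = 0.09 (test). Note also that X and Y are the same full DataFrame, so this is an aligned row split rather than a feature/label separation; the same proportions can be had from a single frame:

    # First split:  70% train | 30% temp
    # Second split: temp -> 70% dev (0.21 of total) | 30% test (0.09 of total)
    X_train, X_temp = train_test_split(powerlifting_data, test_size=0.3, random_state=1)
    X_dev, X_test = train_test_split(X_temp, test_size=0.3, random_state=1)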
 
-    # Normalize the data to the range of 0.0 - 1.0
-    scaler = MinMaxScaler()
-    data.iloc[:, :-2] = scaler.fit_transform(data.iloc[:, :-2])
+    X_train.to_csv('X_train.csv', index=False)
+    X_dev.to_csv('X_dev.csv', index=False)
+    X_test.to_csv('X_test.csv', index=False)
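With the normalization block gone, the MinMaxScaler import above is left unused. The three to_csv calls write to the current working directory, which under Jenkins is whatever workspace the job runs in. If the split files are meant to live next to the input data instead, one option (an illustration, not part of the commit) is to route them through DATA_DIRECTORY:

    import os

    # Write the split files into the data directory rather than the CWD,
    # so they land in a predictable place regardless of where Jenkins
    # launches the script from.
    X_train.to_csv(os.path.join(DATA_DIRECTORY, 'X_train.csv'), index=False)
    X_dev.to_csv(os.path.join(DATA_DIRECTORY, 'X_dev.csv'), index=False)
    X_test.to_csv(os.path.join(DATA_DIRECTORY, 'X_test.csv'), index=False)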
 
-    # Clear the collection of artifacts (e.g. blank lines, examples with invalid values)
-    data.dropna(inplace=True)
 
-    # Clear the remaining columns from negative and empty values
-    data[data.columns[:-2]] = data[data.columns[:-2]].apply(lambda x: x.clip(lower=0))
 
-    return data
 # download_data_from_kaggle()
 # extract_data_from_zip()
 process_data(CSV_NAME)