diff --git a/Jenkinsfile b/Jenkinsfile
index 2cf5c35..dee4ba2 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,26 +1,67 @@
 pipeline {
-    agent any
+    agent any
+    // Define the parameters that can be supplied when the job is triggered
+    parameters {
+        string(
+            defaultValue: 'mikaleta',
+            description: 'Kaggle username',
+            name: 'KAGGLE_USERNAME',
+            trim: false
+        )
+        password(
+            defaultValue: '',
+            description: 'Kaggle token taken from the kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
+            name: 'KAGGLE_KEY'
+        )
+        string(
+            defaultValue: '500',
+            description: 'CUTOFF',
+            name: 'CUTOFF',
+            trim: false
+        )
+    }
     stages {
-        stage('Clone repository') {
-            steps {
-                checkout([$class: 'GitSCM', branches: [[name: '*/master']],
-                    doGenerateSubmoduleConfigurations: false,
-                    extensions: [], submoduleCfg: [],
-                    userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s486867/ium_s486867']]])
-            }
-        }
-
-        stage('Process data') {
-            steps {
-                sh './process_data.sh'
-            }
-        }
+        stage('clear_before') {
+            steps {
+                sh 'rm -rf *'
+            }
+        }
 
-        stage('Archive artifacts') {
-            steps {
-                archiveArtifacts artifacts: 'results.txt', onlyIfSuccessful: true
+        stage('Clone Git') {
+            steps {
+                sh 'git clone https://git.wmi.amu.edu.pl/s486867/ium_z486867'
+            }
+        }
+
+        stage('Build') {
+            steps {
+                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
+                         "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
+                    sh 'kaggle datasets download -d dansbecker/powerlifting-database'
+                    sh 'unzip powerlifting-database.zip -d ./ium_z486867'
             }
-        }
-    }
-}
+            }
+        }
+        stage('Docker') {
+            agent {
+                dockerfile {
+                    filename 'Dockerfile'
+                    dir 'ium_z486867'
+                    reuseNode true
+                }
+            }
+            steps {
+                sh 'python ./ium_z486867/create-dataset.py'
+                archiveArtifacts 'X_test.csv'
+                archiveArtifacts 'X_dev.csv'
+                archiveArtifacts 'X_train.csv'
+            }
+        }
+        stage('clear_after') {
+            steps {
+                sh 'rm -rf *'
+            }
+        }
+    }
+}
\ No newline at end of file
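The 'Build' stage works because the Kaggle client also reads its credentials from the KAGGLE_USERNAME and KAGGLE_KEY environment variables when no ~/.kaggle/kaggle.json is present, which is the mechanism described in the kaggle-api README linked from the parameter description. Below is a minimal Python sketch of that environment-based authentication; the credential values are placeholders, not real ones:

    import os

    # Placeholder credentials, assumed for illustration; substitute real ones.
    os.environ.setdefault("KAGGLE_USERNAME", "your-username")
    os.environ.setdefault("KAGGLE_KEY", "your-api-token")

    # The kaggle package tries to authenticate as soon as it is imported,
    # so the variables must be set before this import.
    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()
    api.authenticate()  # falls back to the env vars when kaggle.json is absent
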
diff --git a/main.py b/main.py
index f28b7a2..9761aa9 100644
--- a/main.py
+++ b/main.py
@@ -1,33 +1,76 @@
+import os
+
 from kaggle.api.kaggle_api_extended import KaggleApi
 import zipfile
 from sklearn.model_selection import train_test_split
 import pandas as pd
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
 
 pd.set_option('display.max_columns', 100)
 
-api = KaggleApi()
-api.authenticate()
-api.dataset_download_files('shivamb/netflix-shows', path='./data')
-with zipfile.ZipFile('./data/netflix-shows.zip', 'r') as zip_ref:
-    zip_ref.extractall('./data')
-netflix = pd.read_csv('./data/netflix_titles.csv')
+DATA_DIRECTORY = './data'
 
-netflix.dropna(inplace=True)
+CSV_NAME = DATA_DIRECTORY + '/openpowerlifting.csv'
+
+
+def download_data_from_kaggle():
+    api = KaggleApi()
+    api.authenticate()
+    api.dataset_download_files('dansbecker/powerlifting-database', path=DATA_DIRECTORY)
+
+
+def extract_data_from_zip():
+    for file_name in os.listdir(DATA_DIRECTORY):
+        if file_name.endswith(".zip"):
+            file_path = os.path.join(DATA_DIRECTORY, file_name)
+            with zipfile.ZipFile(file_path, "r") as zip_ref:
+                zip_ref.extractall(DATA_DIRECTORY)
+            print(f"The file {file_name} has been unzipped.")
+
+
+def process_data(csv_name):
+    # Read in the data and drop the sparse fourth-attempt columns
+    data = pd.read_csv(csv_name)
+    data.drop(columns=["Squat4Kg", "Bench4Kg", "Deadlift4Kg"], inplace=True)
+    data.dropna(inplace=True)
 
-random_seed = 42
-train_data, test_data = train_test_split(netflix, test_size=0.2, random_state=random_seed)
-train_data, dev_data = train_test_split(train_data, test_size=0.25, random_state=random_seed)
+    # Remove negative values by clipping the numeric columns at zero
+    numeric_cols = data.select_dtypes(include=np.number).columns
+    data[numeric_cols] = data[numeric_cols].clip(lower=0)
 
-train_stats = train_data.describe(include='all')
-print(f"\nTraining set statistics:\n{train_stats}")
-dev_stats = dev_data.describe(include='all')
-print(f"\nDevelopment set statistics:\n{dev_stats}")
-test_stats = test_data.describe(include='all')
-print(f"\nTest set statistics:\n{test_stats}")
+    # Split the data into train, dev, and test sets if not already done
+    if "Set" not in data.columns:
+        data_train, data_devtest = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Division"])
+        data_dev, data_test = train_test_split(data_devtest, test_size=0.5, random_state=42, stratify=data_devtest["Division"])
+        data_train["Set"] = "train"
+        data_dev["Set"] = "dev"
+        data_test["Set"] = "test"
+        data = pd.concat([data_train, data_dev, data_test], ignore_index=True)
 
-train_class_dist = train_data["type"].value_counts()
-print(f"\nTraining set class distribution:\n{train_class_dist}")
-dev_class_dist = dev_data["type"].value_counts()
-print(f"\nDevelopment set class distribution:\n{dev_class_dist}")
-test_class_dist = test_data["type"].value_counts()
-print(f"\nTest set class distribution:\n{test_class_dist}")
+    # Collect and print statistics for the full data set
+    print("Data Set Statistics:")
+    print("Size: {}".format(len(data)))
+    print("Avg values:")
+    print(data.mean(numeric_only=True))
+    print("Min values:")
+    print(data.min(numeric_only=True))
+    print("Max values:")
+    print(data.max(numeric_only=True))
+    print("Standard deviations:")
+    print(data.std(numeric_only=True))
+    print("Median values:")
+    print(data.median(numeric_only=True))
+
+    # Compute the frequency distribution of examples for the individual classes
+    print("\nFrequency distribution of examples for individual classes:")
+    print(data["Division"].value_counts())
+
+    # Normalize the numeric columns to the range 0.0 - 1.0
+    scaler = MinMaxScaler()
+    data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
+
+    # Clear the collection of artifacts (e.g. blank lines, examples with invalid values)
+    data.dropna(inplace=True)
+
+    # Clear the remaining numeric columns of negative values
+    data[numeric_cols] = data[numeric_cols].clip(lower=0)
+
+    return data
+
+
+# download_data_from_kaggle()
+# extract_data_from_zip()
+process_data(CSV_NAME)
\ No newline at end of file
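Because process_data() first holds out 20% of the rows and then splits that hold-out in half, the resulting train/dev/test proportions are 80/10/10. A short sketch, assuming the script above has been imported, that verifies this through the Set marker column:

    data = process_data(CSV_NAME)
    print(data["Set"].value_counts(normalize=True))
    # Expected, approximately: train 0.8, dev 0.1, test 0.1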
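For reference, MinMaxScaler rescales each column independently as x' = (x - min) / (max - min), which maps the numeric columns onto the 0.0 - 1.0 range named in the comment. A self-contained toy example with invented values:

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    col = np.array([[100.0], [250.0], [400.0]])       # toy column, values invented
    print(MinMaxScaler().fit_transform(col).ravel())  # -> [0.  0.5 1. ]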