First jenkinsfile
This commit is contained in:
parent
485673f131
commit
52f3e10bb1
79
Jenkinsfile
vendored
79
Jenkinsfile
vendored
@ -1,26 +1,67 @@
|
|||||||
// Declarative pipeline: wipes the workspace, clones the project repository,
// downloads the Kaggle powerlifting dataset, builds the dataset splits inside
// a container built from the repository's Dockerfile, archives the resulting
// CSV files and finally wipes the workspace again.
pipeline {
    agent any
    // Define the parameters that can be supplied when the job is triggered.
    parameters {
        string(
            defaultValue: 'mikaleta',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        password(
            defaultValue: '',
            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            name: 'KAGGLE_KEY'
        )
        string(
            defaultValue: '500',
            description: 'CUTOFF',
            name: 'CUTOFF',
            trim: false
        )
    }
    stages {
        stage('clear_before') {
            steps {
                // Start from an empty workspace so the plain `git clone` below
                // does not fail on an already existing directory.
                sh 'rm -rf *'
            }
        }
        stage('Clone Git') {
            steps {
                sh 'git clone https://git.wmi.amu.edu.pl/s486867/ium_z486867'
            }
        }
        stage('Build') {
            steps {
                // Export the job parameters as environment variables for the
                // kaggle command-line client invoked below.
                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                         "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
                    sh 'kaggle datasets download -d dansbecker/powerlifting-database'
                    // The archive is named after the dataset slug downloaded
                    // above; the previous name (video-game-sales-with-ratings.zip)
                    // belonged to a different dataset and never exists here.
                    // -o overwrites without prompting, so the job cannot hang.
                    sh 'unzip -o powerlifting-database.zip -d ./ium_z486867'
                }
            }
        }
        stage('Docker') {
            // Build the image from the cloned repository's Dockerfile and run
            // the steps in it on the same node, sharing the workspace.
            agent {
                dockerfile {
                    filename 'Dockerfile'
                    dir 'ium_z486867'
                    reuseNode true
                }
            }
            steps {
                sh 'python ./ium_z486867/create-dataset.py'
                archiveArtifacts 'X_test.csv'
                archiveArtifacts 'X_dev.csv'
                archiveArtifacts 'X_train.csv'
            }
        }
        stage('clear_after') {
            steps {
                sh 'rm -rf *'
            }
        }
    }
}
|
87
main.py
87
main.py
@ -1,33 +1,76 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
from kaggle.api.kaggle_api_extended import KaggleApi
|
from kaggle.api.kaggle_api_extended import KaggleApi
|
||||||
import zipfile
|
import zipfile
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
pd.set_option('display.max_columns', 100)
|
pd.set_option('display.max_columns', 100)
|
||||||
|
|
||||||
api = KaggleApi()
|
|
||||||
api.authenticate()
|
|
||||||
api.dataset_download_files('shivamb/netflix-shows', path='./data')
|
|
||||||
with zipfile.ZipFile('./data/netflix-shows.zip', 'r') as zip_ref:
|
|
||||||
zip_ref.extractall('./data')
|
|
||||||
|
|
||||||
netflix = pd.read_csv('./data/netflix_titles.csv')
|
DATA_DIRECTORY = './data'
|
||||||
|
|
||||||
netflix.dropna(inplace=True)
|
CSV_NAME = DATA_DIRECTORY + '/openpowerlifting.csv'
|
||||||
|
def download_data_from_kaggle():
    """Download the ``dansbecker/powerlifting-database`` dataset files.

    Creates a ``KaggleApi`` client, authenticates it and asks it to place the
    dataset files under ``DATA_DIRECTORY``. Returns nothing.
    """
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    kaggle_client.dataset_download_files(
        'dansbecker/powerlifting-database',
        path=DATA_DIRECTORY,
    )
|
||||||
|
def extract_data_from_zip():
    """Unpack every ``.zip`` archive located directly in ``DATA_DIRECTORY``.

    Each archive is extracted into ``DATA_DIRECTORY`` itself and a
    confirmation line is printed per archive. Entries whose name does not end
    with ``.zip`` are ignored.
    """
    zip_names = (
        entry for entry in os.listdir(DATA_DIRECTORY) if entry.endswith(".zip")
    )
    for file_name in zip_names:
        archive_path = os.path.join(DATA_DIRECTORY, file_name)
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extractall(DATA_DIRECTORY)
        print(f"The file {file_name} has been unzipped.")
|
||||||
|
def process_data(csv_name):
    """Load, clean, split, summarise and normalise the dataset in *csv_name*.

    Steps, in order:

    1. Read the CSV, drop the ``Squat4Kg``/``Bench4Kg``/``Deadlift4Kg``
       columns and every row containing a missing value.
    2. Clamp negative values in numeric columns to 0.
    3. Unless a ``Set`` column already exists, split the rows 80/10/10 into
       train/dev/test (stratified on ``Division``, fixed seed 42) and record
       the membership in a new ``Set`` column.
    4. Print size, mean, min, max, standard deviation, median and the class
       frequency distribution.
    5. Min-max scale the numeric columns among all-but-the-last-two columns
       to the 0.0 - 1.0 range.

    Args:
        csv_name: Path of the CSV file to read.

    Returns:
        The processed ``pandas.DataFrame``.

    Raises:
        KeyError: If one of the dropped columns, or ``Division`` when a split
            is required, is missing from the CSV.
    """
    data = pd.read_csv(csv_name)
    data = data.drop(columns=["Squat4Kg", "Bench4Kg", "Deadlift4Kg"]).dropna()

    # Remove negative values. Rows with missing values are already gone, so
    # no second dropna is needed here.
    numeric_cols = data.select_dtypes(include=np.number).columns
    data[numeric_cols] = data[numeric_cols].clip(lower=0)

    # Split the data into train, dev, and test sets if not already done.
    # The marker written below is the "Set" column, so that is what is tested
    # (the previous test looked for columns named train/dev/test, which this
    # function never creates).
    if "Set" not in data.columns:
        train_part, devtest_part = train_test_split(
            data, test_size=0.2, random_state=42, stratify=data["Division"]
        )
        dev_part, test_part = train_test_split(
            devtest_part,
            test_size=0.5,
            random_state=42,
            stratify=devtest_part["Division"],
        )
        # assign() returns new frames, avoiding chained-assignment warnings
        # on the objects handed back by train_test_split.
        data = pd.concat(
            [
                train_part.assign(Set="train"),
                dev_part.assign(Set="dev"),
                test_part.assign(Set="test"),
            ],
            ignore_index=True,
        )

    # Collect and print statistics for the data and its subsets.
    # mean/std/median are restricted to numeric columns: the frame contains
    # at least the string column "Set", on which they are undefined.
    print("Data Set Statistics:")
    print("Size: {}".format(len(data)))
    print("Avg values:")
    print(data.mean(numeric_only=True))
    print("Min values:")
    print(data.min())
    print("Max values:")
    print(data.max())
    print("Standard deviations:")
    print(data.std(numeric_only=True))
    print("Median values:")
    print(data.median(numeric_only=True))

    # Compute the frequency distribution of examples for individual classes.
    # NOTE(review): "Class" was indexed unconditionally before; when it is
    # absent we fall back to "Division", the column used for stratification
    # above - confirm this is the intended label column.
    class_col = "Class" if "Class" in data.columns else "Division"
    print("\nFrequency distribution of examples for individual classes:")
    print(data[class_col].value_counts())

    # Normalize the data to the range of 0.0 - 1.0. As before, the last two
    # columns are left untouched; additionally only numeric columns are
    # scaled, because MinMaxScaler cannot handle string columns.
    feature_cols = data[data.columns[:-2]].select_dtypes(include=np.number).columns
    if len(feature_cols) > 0:
        scaler = MinMaxScaler()
        data[feature_cols] = scaler.fit_transform(data[feature_cols])

    # Clear the collection of artifacts (e.g. blank lines, examples with
    # invalid values).
    data.dropna(inplace=True)

    # Clear the remaining columns from negative and empty values.
    if len(feature_cols) > 0:
        data[feature_cols] = data[feature_cols].clip(lower=0)

    return data
|
||||||
|
# download_data_from_kaggle()
|
||||||
|
# extract_data_from_zip()
|
||||||
|
process_data(CSV_NAME)
|
Loading…
Reference in New Issue
Block a user