Dockerfile
This commit is contained in:
parent
16d7603009
commit
5f21137f2d
9
Dockerfile
Normal file
9
Dockerfile
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# Image for the Jenkins `dockerfile true` agent that runs create-dataset.py.
FROM ubuntu:latest

# apt-get (not apt) has the stable command-line interface intended for
# scripts. DEBIAN_FRONTEND=noninteractive keeps package configuration from
# prompting during the build, and the package lists are removed afterwards so
# they are not baked into the layer.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y python3-pip unzip \
    && rm -rf /var/lib/apt/lists/*

# Install system-wide instead of with --user: --user places the packages in
# the build user's home directory, where a container started under a different
# uid cannot import them.
# NOTE(review): presumably Jenkins starts this container with its own uid —
# confirm against the agent configuration.
# NOTE(review): base images that enforce PEP 668 reject a system-wide
# `pip install`; pin the base image tag or switch to a virtual environment if
# the build starts failing with "externally-managed-environment".
RUN pip install --no-cache-dir kaggle pandas numpy scikit-learn

WORKDIR /app

COPY ./create-dataset.py ./
|
16
Jenkinsfile
vendored
16
Jenkinsfile
vendored
@ -1,5 +1,7 @@
|
|||||||
pipeline {
|
pipeline {
|
||||||
agent any
|
agent {
|
||||||
|
dockerfile true
|
||||||
|
}
|
||||||
|
|
||||||
parameters {
|
parameters {
|
||||||
string (
|
string (
|
||||||
@ -16,24 +18,18 @@ pipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
stages {
|
stages {
|
||||||
stage('Clone Repository') {
|
stage('Run create-dataset script') {
|
||||||
steps {
|
|
||||||
git branch: 'main', url: 'https://git.wmi.amu.edu.pl/s464913/ium_464913.git'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
stage('Download Dataset') {
|
|
||||||
steps {
|
steps {
|
||||||
script {
|
script {
|
||||||
withEnv (["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
|
withEnv (["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
|
||||||
sh 'chmod +x download_dataset.sh'
|
sh 'python create-dataset.py'
|
||||||
sh './download_dataset.sh'
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stage('Archive Artifacts') {
|
stage('Archive Artifacts') {
|
||||||
steps {
|
steps {
|
||||||
archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
|
archiveArtifacts artifacts: '/data/*', onlyIfSuccessful: true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
120
create-dataset.py
Normal file
120
create-dataset.py
Normal file
@ -0,0 +1,120 @@
|
|||||||
|
import os
|
||||||
|
import kaggle
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
|
||||||
|
def download_kaggle_dataset():
    """Download the ``mlg-ulb/creditcardfraud`` dataset from Kaggle.

    Authenticates the Kaggle API client, then downloads the dataset archive
    into the current directory (``./``) and unzips it there.
    """
    client = kaggle.api
    client.authenticate()
    client.dataset_download_files(
        "mlg-ulb/creditcardfraud",
        path="./",
        unzip=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def load_data(name):
    """Read the CSV at ``name`` and return its contents as a DataFrame.

    Args:
        name: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(name)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_data(df):
    """Standardize the ``Amount`` column of ``df`` in place.

    The column is overwritten with the output of a freshly fitted
    ``StandardScaler``; no other column is touched.

    Args:
        df: DataFrame containing an ``Amount`` column.

    Returns:
        The same ``df`` object, with ``Amount`` replaced.
    """
    # The scaler works on 2-D input, so present the column as (n_rows, 1).
    amount_column = df["Amount"].values.reshape(-1, 1)
    df["Amount"] = StandardScaler().fit_transform(amount_column)
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def create_undersample_data(df, random_state=None):
    """Build a class-balanced subset of ``df`` by undersampling ``Class == 0``.

    Every row with ``Class == 1`` is kept, and an equal number of rows with
    ``Class == 0`` is drawn at random without replacement.

    Rows are selected by position, so the result is correct for any index
    (non-default, shuffled, or filtered), not only a default ``RangeIndex``.

    Args:
        df: DataFrame with a ``Class`` column holding 0/1 labels.
        random_state: Optional seed for the sampling. ``None`` (the default)
            draws from NumPy's global random state.

    Returns:
        A tuple ``(undersample_data, X_undersample, y_undersample)``:
        the balanced rows (minority rows first, then the sampled majority
        rows), the same rows without the ``Class`` column, and a
        single-column DataFrame holding only ``Class``.

    Raises:
        ValueError: From NumPy, if there are fewer ``Class == 0`` rows than
            ``Class == 1`` rows (cannot sample without replacement).
    """
    labels = df["Class"].to_numpy()

    # Integer positions (not index labels) of each class, suitable for .iloc.
    fraud_positions = np.flatnonzero(labels == 1)
    normal_positions = np.flatnonzero(labels == 0)

    # Fall back to the module-level generator when no seed is given, so
    # callers that seed NumPy globally keep getting the same draws.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    sampled_normal_positions = rng.choice(
        normal_positions, size=fraud_positions.size, replace=False
    )

    selected_positions = np.concatenate([fraud_positions, sampled_normal_positions])
    undersample_data = df.iloc[selected_positions, :]

    is_target = undersample_data.columns == "Class"
    X_undersample = undersample_data.loc[:, ~is_target]
    y_undersample = undersample_data.loc[:, is_target]

    return undersample_data, X_undersample, y_undersample
|
||||||
|
|
||||||
|
|
||||||
|
def split_undersample_data(X_undersample, y_undersample):
    """Split the undersampled features and labels into train and test parts.

    Uses ``train_test_split`` with ``test_size=0.3`` and ``random_state=0``.

    Args:
        X_undersample: Feature rows of the undersampled dataset.
        y_undersample: Matching labels.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X_undersample,
        y_undersample,
        test_size=0.3,
        random_state=0,
    )
    features_train, features_test, labels_train, labels_test = parts
    return (features_train, features_test, labels_train, labels_test)
|
||||||
|
|
||||||
|
|
||||||
|
def save_undersample_data(
    undersample_data,
    X_train_undersample,
    X_test_undersample,
    y_train_undersample,
    y_test_undersample,
    output_dir="/data",
):
    """Write the undersampled dataset and its train/test splits as CSV files.

    The output directory is created if it does not exist, so the function
    does not depend on a caller having prepared it.

    Args:
        undersample_data: Full undersampled DataFrame.
        X_train_undersample: Training features.
        X_test_undersample: Test features.
        y_train_undersample: Training labels.
        y_test_undersample: Test labels.
        output_dir: Directory to write into. Defaults to ``/data``.

    Raises:
        OSError: If ``output_dir`` cannot be created or written to.
    """
    os.makedirs(output_dir, exist_ok=True)

    outputs = {
        "undersample_data.csv": undersample_data,
        "X_train_undersample.csv": X_train_undersample,
        "X_test_undersample.csv": X_test_undersample,
        "y_train_undersample.csv": y_train_undersample,
        "y_test_undersample.csv": y_test_undersample,
    }
    for file_name, frame in outputs.items():
        frame.to_csv(os.path.join(output_dir, file_name), index=False)
|
||||||
|
|
||||||
|
|
||||||
|
def split_whole_data(df):
    """Split the whole dataset into train and test features and labels.

    Every column except ``Class`` is treated as a feature; ``Class`` is kept
    as a single-column DataFrame of labels. The split uses
    ``train_test_split`` with ``test_size=0.3`` and ``random_state=0``.

    Args:
        df: DataFrame to split.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    is_target = df.columns == "Class"
    features = df.iloc[:, ~is_target]
    labels = df.iloc[:, is_target]

    parts = train_test_split(features, labels, test_size=0.3, random_state=0)
    features_train, features_test, labels_train, labels_test = parts
    return features_train, features_test, labels_train, labels_test
|
||||||
|
|
||||||
|
|
||||||
|
def save_whole_data(df, X_train, X_test, y_train, y_test, output_dir="/data"):
    """Write the whole dataset and its train/test splits as CSV files.

    The output directory is created if it does not exist, so the function
    does not depend on a caller having prepared it.

    Args:
        df: Full DataFrame, written as ``creditcard.csv``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        output_dir: Directory to write into. Defaults to ``/data``.

    Raises:
        OSError: If ``output_dir`` cannot be created or written to.
    """
    os.makedirs(output_dir, exist_ok=True)

    outputs = {
        "creditcard.csv": df,
        "X_train.csv": X_train,
        "X_test.csv": X_test,
        "y_train.csv": y_train,
        "y_test.csv": y_test,
    }
    for file_name, frame in outputs.items():
        frame.to_csv(os.path.join(output_dir, file_name), index=False)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Download the dataset and write the full and undersampled splits.

    Steps: download and unzip the Kaggle dataset, load ``creditcard.csv``,
    standardize ``Amount``, build and save the undersampled dataset with its
    train/test split, then split and save the whole dataset.
    """
    download_kaggle_dataset()

    # The save_* helpers write under the absolute "/data" directory, so that
    # is the directory that must exist; a relative "data" would not help them.
    os.makedirs("/data", exist_ok=True)

    df = load_data("creditcard.csv")
    df = normalize_data(df)

    undersample_data, X_undersample, y_undersample = create_undersample_data(df)
    X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = (
        split_undersample_data(X_undersample, y_undersample)
    )
    save_undersample_data(
        undersample_data,
        X_train_undersample,
        X_test_undersample,
        y_train_undersample,
        y_test_undersample,
    )

    X_train, X_test, y_train, y_test = split_whole_data(df)
    # save_whole_data takes the full frame as its first argument; omitting it
    # raises TypeError for a missing positional argument.
    save_whole_data(df, X_train, X_test, y_train, y_test)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
@ -1,7 +1,5 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# Install the Kaggle API
|
|
||||||
pip install kaggle
|
|
||||||
# Download the dataset from Kaggle
|
# Download the dataset from Kaggle
|
||||||
kaggle datasets download -d mlg-ulb/creditcardfraud
|
kaggle datasets download -d mlg-ulb/creditcardfraud
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user