commit d0f0f7390c (parent 5f863f13b1)

Dockerfile
@@ -1,9 +1,5 @@
 FROM ubuntu:latest
 
-RUN apt update && apt install -y python3-pip unzip
+RUN apt update && apt install -y python3-pip
 
-RUN pip install kaggle pandas numpy scikit-learn
-
-WORKDIR /app
-
-COPY ./create-dataset.py ./
+RUN pip install pandas numpy scikit-learn

IUM_2.ipynb
@@ -4,14 +4,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## IUM 2"
+    "## IUM 2\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Installation of packages"
+    "### Installation of packages\n"
    ]
   },
   {
@@ -67,7 +67,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Importing libraries"
+    "### Importing libraries\n"
    ]
   },
   {
@@ -90,7 +90,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Downloading a dataset"
+    "### Downloading a dataset\n"
    ]
   },
   {
@@ -114,7 +114,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Uncompress a file"
+    "### Uncompress a file\n"
    ]
   },
   {
@@ -139,7 +139,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Load the data"
+    "### Load the data\n"
    ]
   },
   {
@@ -148,15 +148,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.read_csv('creditcard.csv')\n",
-    "pd.set_option('display.max_columns', None)"
+    "df = pd.read_csv(\"creditcard.csv\")\n",
+    "pd.set_option(\"display.max_columns\", None)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Check missing values"
+    "### Check missing values\n"
    ]
   },
   {
@@ -214,7 +214,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Size of the dataset"
+    "### Size of the dataset\n"
    ]
   },
   {
@@ -275,7 +275,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Normalising the data"
+    "### Normalising the data\n"
    ]
   },
   {
@@ -286,14 +286,14 @@
    "source": [
     "scaler = StandardScaler()\n",
     "\n",
-    "df['Amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))"
+    "df[\"Amount\"] = scaler.fit_transform(df[\"Amount\"].values.reshape(-1, 1))"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Summary statistics"
+    "### Summary statistics\n"
    ]
   },
   {
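For context: the reshape in the hunk above is needed because scikit-learn transformers expect a 2-D array of shape (n_samples, n_features), while df["Amount"] is a 1-D Series. A minimal, self-contained sketch of the same normalisation step (the toy values are illustrative, not from the dataset):

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({"Amount": [10.0, 250.0, 3.5, 99.99]})  # toy data

    scaler = StandardScaler()
    # reshape(-1, 1) turns the 1-D Series into an (n_samples, 1) column,
    # which is the shape fit_transform() accepts.
    df["Amount"] = scaler.fit_transform(df["Amount"].values.reshape(-1, 1))
    print(df["Amount"].mean(), df["Amount"].std())  # ~0 mean, ~unit variance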
@@ -717,7 +717,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Distribution of legitimate and fraudulent transactions"
+    "### Distribution of legitimate and fraudulent transactions\n"
    ]
   },
   {
@@ -740,7 +740,7 @@
     }
    ],
    "source": [
-    "df['Class'].value_counts()"
+    "df[\"Class\"].value_counts()"
    ]
   },
   {
@@ -748,7 +748,8 @@
    "metadata": {},
    "source": [
     "### Undersampling the data\n",
-    "We will employ undersampling as one class significantly dominates the other."
+    "\n",
+    "We will employ undersampling as one class significantly dominates the other.\n"
    ]
   },
   {
@@ -774,15 +775,15 @@
     "# Undersample dataset\n",
     "undersample_data = df.iloc[undersample_indice, :]\n",
     "\n",
-    "X_undersample = undersample_data.iloc[:, undersample_data.columns != 'Class']\n",
-    "y_undersample = undersample_data.iloc[:, undersample_data.columns == 'Class']"
+    "X_undersample = undersample_data.iloc[:, undersample_data.columns != \"Class\"]\n",
+    "y_undersample = undersample_data.iloc[:, undersample_data.columns == \"Class\"]"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Size of undersampled dataset"
+    "### Size of undersampled dataset\n"
    ]
   },
   {
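The cell above uses undersample_indice, whose construction falls outside the hunks captured here. As a hypothetical sketch of the usual recipe (all names besides df and "Class" are assumptions, not taken from this commit): keep every fraudulent row, sample an equal number of legitimate rows at random, and combine the two index sets:

    import numpy as np

    # Hypothetical reconstruction -- the real cell is not part of this diff.
    fraud_indices = np.array(df[df["Class"] == 1].index)
    normal_indices = np.array(df[df["Class"] == 0].index)

    # Draw as many legitimate rows as there are fraudulent ones.
    random_normal_indices = np.random.choice(
        normal_indices, len(fraud_indices), replace=False
    )

    undersample_indice = np.concatenate([fraud_indices, random_normal_indices])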
@@ -843,7 +844,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Summary statistics of the undersampled dataset"
+    "### Summary statistics of the undersampled dataset\n"
    ]
   },
   {
@@ -1257,7 +1258,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Distribution of legitimate and fraudulent transactions in an undersampled dataset"
+    "### Distribution of legitimate and fraudulent transactions in an undersampled dataset\n"
    ]
   },
   {
@@ -1280,14 +1281,14 @@
     }
    ],
    "source": [
-    "undersample_data['Class'].value_counts()"
+    "undersample_data[\"Class\"].value_counts()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Splitting whole data into training and test datasets"
+    "### Splitting whole data into training and test datasets\n"
    ]
   },
   {
@@ -1296,8 +1297,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "X = df.iloc[:, df.columns != 'Class']\n",
-    "y = df.iloc[:, df.columns == 'Class']\n",
+    "X = df.iloc[:, df.columns != \"Class\"]\n",
+    "y = df.iloc[:, df.columns == \"Class\"]\n",
     "\n",
     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)"
    ]
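Since Class is heavily imbalanced, a plain random 70/30 split can leave the test set with very few fraud rows. A hedged variant, not used in this notebook, that preserves the class ratio on both sides via scikit-learn's stratify parameter:

    from sklearn.model_selection import train_test_split

    # stratify=y keeps the fraud/legitimate proportion identical
    # in the training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y
    )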
@@ -1306,7 +1307,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Statistical measures of the training dataset of whole data"
+    "### Statistical measures of the training dataset of whole data\n"
    ]
   },
   {
@@ -1810,14 +1811,14 @@
     }
    ],
    "source": [
-    "pd.concat([X_train, y_train], axis=1)['Class'].value_counts()"
+    "pd.concat([X_train, y_train], axis=1)[\"Class\"].value_counts()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Statistical measures of the test dataset of whole data"
+    "### Statistical measures of the test dataset of whole data\n"
    ]
   },
   {
@@ -2311,14 +2312,14 @@
     }
    ],
    "source": [
-    "pd.concat([X_test, y_test], axis=1)['Class'].value_counts()"
+    "pd.concat([X_test, y_test], axis=1)[\"Class\"].value_counts()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Splitting undersampled data into training and test datasets"
+    "### Splitting undersampled data into training and test datasets\n"
    ]
   },
   {
@@ -2327,14 +2328,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample, y_undersample, test_size = 0.3, random_state = 0)"
+    "X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = (\n",
+    "    train_test_split(X_undersample, y_undersample, test_size=0.3, random_state=0)\n",
+    ")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Statistical measures of the training dataset of undersampled data"
+    "### Statistical measures of the training dataset of undersampled data\n"
    ]
   },
   {
@@ -2818,14 +2821,14 @@
     }
    ],
    "source": [
-    "pd.concat([X_train_undersample, y_train_undersample], axis=1)['Class'].value_counts()"
+    "pd.concat([X_train_undersample, y_train_undersample], axis=1)[\"Class\"].value_counts()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Statistical measures of the test dataset of undersampled data"
+    "### Statistical measures of the test dataset of undersampled data\n"
    ]
   },
   {
@@ -3309,7 +3312,7 @@
     }
    ],
    "source": [
-    "pd.concat([X_test_undersample, y_test_undersample], axis=1)['Class'].value_counts()"
+    "pd.concat([X_test_undersample, y_test_undersample], axis=1)[\"Class\"].value_counts()"
    ]
   }
  ],

Jenkinsfile
@@ -25,6 +25,7 @@ pipeline {
         stage('Download dataset') {
             steps {
                 withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
+                    sh 'pip install kaggle'
                     sh 'kaggle datasets download -d mlg-ulb/creditcardfraud'
                     sh 'unzip -o creditcardfraud.zip'
                     sh 'rm creditcardfraud.zip'

create-dataset.py
@@ -88,11 +88,8 @@ def save_whole_data(df, X_train, X_test, y_train, y_test):
 
 
 def main():
-    # download_kaggle_dataset()
     os.makedirs("data", exist_ok=True)
 
-    os.system
-
     df = load_data("creditcard.csv")
     df = normalize_data(df)
 
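The deleted comment refers to a download_kaggle_dataset() helper whose body does not appear in this diff; the download now happens in the Jenkins 'Download dataset' stage instead. For context, a hypothetical sketch of such a helper using the kaggle package (only the function name comes from the source; the body is an assumption):

    import kaggle

    def download_kaggle_dataset():
        # Reads KAGGLE_USERNAME/KAGGLE_KEY -- the same variables the
        # Jenkinsfile exports -- then downloads and unzips in place.
        kaggle.api.authenticate()
        kaggle.api.dataset_download_files(
            "mlg-ulb/creditcardfraud", path=".", unzip=True
        )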

dataset-stats.py (new file)
@@ -0,0 +1,119 @@
+import os
+import pandas as pd
+
+
+def write_to_file(file_name):
+    df = pd.read_csv("data/creditcard.csv")
+    pd.set_option("display.max_columns", None)
+
+    undersample_data = pd.read_csv("data/undersample_data.csv")
+    X_test_undersample = pd.read_csv("data/X_test_undersample.csv")
+    y_test_undersample = pd.read_csv("data/y_test_undersample.csv")
+    X_train_undersample = pd.read_csv("data/X_train_undersample.csv")
+    y_train_undersample = pd.read_csv("data/y_train_undersample.csv")
+
+    X_test = pd.read_csv("data/X_test.csv")
+    y_test = pd.read_csv("data/y_test.csv")
+    X_train = pd.read_csv("data/X_train.csv")
+    y_train = pd.read_csv("data/y_train.csv")
+
+    with open("stats_data/" + file_name, "w") as f:
+        f.write("Check missing values\n")
+        f.write(str(df.isnull().sum()))
+
+        f.write("\n\n")
+
+        f.write("Size of the dataset\n")
+        f.write(str(df.info()))
+
+        f.write("\n\n")
+
+        f.write("Summary statistics\n")
+        f.write(str(df.describe()))
+
+        f.write("\n\n")
+
+        f.write("Distribution of legitimate and fraudulent transactions\n")
+        f.write(str(df["Class"].value_counts()))
+
+        f.write("\n\n")
+
+        f.write("Size of undersampled dataset\n")
+        f.write(str(undersample_data.info()))
+
+        f.write("\n\n")
+
+        f.write("Summary statistics of the undersampled dataset\n")
+        f.write(str(undersample_data.describe()))
+
+        f.write("\n\n")
+
+        f.write(
+            "Distribution of legitimate and fraudulent transactions in an undersampled dataset\n"
+        )
+        f.write(str(undersample_data["Class"].value_counts()))
+
+        f.write("\n\n")
+
+        f.write("Statistical measures of the training dataset of whole data\n")
+        f.write(str(pd.concat([X_train, y_train], axis=1).info()))
+        f.write("\n")
+        f.write(str(pd.concat([X_train, y_train], axis=1).describe()))
+        f.write("\n")
+        f.write(str(pd.concat([X_train, y_train], axis=1)["Class"].value_counts()))
+
+        f.write("\n\n")
+
+        f.write("Statistical measures of the test dataset of whole data\n")
+        f.write(str(pd.concat([X_test, y_test], axis=1).info()))
+        f.write("\n")
+        f.write(str(pd.concat([X_test, y_test], axis=1).describe()))
+        f.write("\n")
+        f.write(str(pd.concat([X_test, y_test], axis=1)["Class"].value_counts()))
+
+        f.write("\n\n")
+
+        f.write("Statistical measures of the training dataset of undersampled data\n")
+        f.write(
+            str(pd.concat([X_train_undersample, y_train_undersample], axis=1).info())
+        )
+        f.write("\n")
+        f.write(
+            str(
+                pd.concat([X_train_undersample, y_train_undersample], axis=1).describe()
+            )
+        )
+        f.write("\n")
+        f.write(
+            str(
+                pd.concat([X_train_undersample, y_train_undersample], axis=1)[
+                    "Class"
+                ].value_counts()
+            )
+        )
+
+        f.write("\n\n")
+
+        f.write("Statistical measures of the test dataset of undersampled data\n")
+        f.write(str(pd.concat([X_test_undersample, y_test_undersample], axis=1).info()))
+        f.write("\n")
+        f.write(
+            str(pd.concat([X_test_undersample, y_test_undersample], axis=1).describe())
+        )
+        f.write("\n")
+        f.write(
+            str(
+                pd.concat([X_test_undersample, y_test_undersample], axis=1)[
+                    "Class"
+                ].value_counts()
+            )
+        )
+
+
+def main():
+    os.makedirs("stats_data", exist_ok=True)
+    write_to_file("stats.txt")
+
+
+if __name__ == "__main__":
+    main()
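One caveat when reading the generated report: DataFrame.info() prints its table to stdout and returns None, so each f.write(str(...info())) call above stores the literal string "None" in the file while the actual table goes to the console. A minimal sketch of capturing the output in the file instead, using the buf argument that pandas provides for this purpose:

    import pandas as pd

    df = pd.read_csv("data/creditcard.csv")

    with open("stats_data/stats.txt", "w") as f:
        f.write("Size of the dataset\n")
        df.info(buf=f)  # info() writes into buf instead of stdout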

stats/Dockerfile (new file)
@@ -0,0 +1,5 @@
+FROM ubuntu:latest
+
+RUN apt update && apt install -y python3-pip
+
+RUN pip install pandas

stats/Jenkinsfile
@@ -1,5 +1,7 @@
 pipeline {
-    agent any
+    agent docker {
+        image 'mateusz887/ium:latest'
+    }
 
     parameters {
         buildSelector(
@@ -17,13 +19,13 @@ pipeline {
         }
         stage('Copy Artifacts') {
             steps {
-                copyArtifacts filter: 'data/*', projectName: 'z-s464913-create-dataset', selector: buildParameter('BUILD_SELECTOR')
+                copyArtifacts filter: 'data/*', projectName: 'z-s464913-create-dataset-1', selector: buildParameter('BUILD_SELECTOR')
             }
         }
         stage('Generate Report') {
             steps {
-                sh 'chmod +x stats.sh'
-                sh './stats.sh'
+                sh 'chmod +x dataset_stats.py'
+                sh './dataset_stats.py'
             }
         }
         stage('Archive Artifacts') {