diff --git a/Dockerfile b/Dockerfile
index 69895c1..8def072 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,5 @@
 FROM ubuntu:latest
 
-RUN apt update && apt install -y python3-pip unzip
+RUN apt update && apt install -y python3-pip
 
-RUN pip install kaggle pandas numpy scikit-learn
-
-WORKDIR /app
-
-COPY ./create-dataset.py ./
\ No newline at end of file
+RUN pip install pandas numpy scikit-learn
\ No newline at end of file
diff --git a/IUM_2.ipynb b/IUM_2.ipynb
index 2cedc12..6910212 100644
--- a/IUM_2.ipynb
+++ b/IUM_2.ipynb
@@ -4,14 +4,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## IUM 2"
+    "## IUM 2\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Installation of packages"
+    "### Installation of packages\n"
    ]
   },
   {
@@ -67,7 +67,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Importing libraries"
+    "### Importing libraries\n"
    ]
   },
   {
@@ -90,7 +90,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Downloading a dataset"
+    "### Downloading a dataset\n"
    ]
   },
   {
@@ -114,7 +114,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Uncompress a file"
+    "### Uncompress a file\n"
    ]
   },
   {
@@ -139,7 +139,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Load the data"
+    "### Load the data\n"
    ]
   },
   {
@@ -148,15 +148,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.read_csv('creditcard.csv')\n",
-    "pd.set_option('display.max_columns', None)"
+    "df = pd.read_csv(\"creditcard.csv\")\n",
+    "pd.set_option(\"display.max_columns\", None)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Check missing values"
+    "### Check missing values\n"
    ]
   },
   {
@@ -214,7 +214,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Size of the dataset"
+    "### Size of the dataset\n"
    ]
   },
   {
@@ -275,7 +275,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Normalising the data"
+    "### Normalising the data\n"
    ]
   },
   {
@@ -286,14 +286,14 @@
    "source": [
     "scaler = StandardScaler()\n",
     "\n",
-    "df['Amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))"
+    "df[\"Amount\"] = scaler.fit_transform(df[\"Amount\"].values.reshape(-1, 1))"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Summary statistics"
+    "### Summary statistics\n"
    ]
   },
   {
@@ -717,7 +717,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Distribution of legitimate and fraudulent transactions"
+    "### Distribution of legitimate and fraudulent transactions\n"
    ]
   },
   {
@@ -740,7 +740,7 @@
     }
    ],
    "source": [
-    "df['Class'].value_counts()"
+    "df[\"Class\"].value_counts()"
    ]
   },
   {
@@ -748,7 +748,8 @@
    "metadata": {},
    "source": [
     "### Undersampling the data\n",
-    "We will employ undersampling as one class significantly dominates the other."
+ "\n", + "We will employ undersampling as one class significantly dominates the other.\n" ] }, { @@ -774,15 +775,15 @@ "# Undersample dataset\n", "undersample_data = df.iloc[undersample_indice, :]\n", "\n", - "X_undersample = undersample_data.iloc[:, undersample_data.columns != 'Class']\n", - "y_undersample = undersample_data.iloc[:, undersample_data.columns == 'Class']" + "X_undersample = undersample_data.iloc[:, undersample_data.columns != \"Class\"]\n", + "y_undersample = undersample_data.iloc[:, undersample_data.columns == \"Class\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Size of undersampled dataset" + "### Size of undersampled dataset\n" ] }, { @@ -843,7 +844,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Summary statistics of the undersampled dataset" + "### Summary statistics of the undersampled dataset\n" ] }, { @@ -1257,7 +1258,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Distribution of legitimate and fraudulent transactions in an undersampled dataset" + "### Distribution of legitimate and fraudulent transactions in an undersampled dataset\n" ] }, { @@ -1280,14 +1281,14 @@ } ], "source": [ - "undersample_data['Class'].value_counts()" + "undersample_data[\"Class\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Splitting whole data into training and test datasets" + "### Splitting whole data into training and test datasets\n" ] }, { @@ -1296,8 +1297,8 @@ "metadata": {}, "outputs": [], "source": [ - "X = df.iloc[:, df.columns != 'Class']\n", - "y = df.iloc[:, df.columns == 'Class']\n", + "X = df.iloc[:, df.columns != \"Class\"]\n", + "y = df.iloc[:, df.columns == \"Class\"]\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)" ] @@ -1306,7 +1307,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Statistical measures of the training dataset of whole data" + "### Statistical measures of the training dataset of whole data\n" ] }, { @@ -1810,14 +1811,14 @@ } ], "source": [ - "pd.concat([X_train, y_train], axis=1)['Class'].value_counts()" + "pd.concat([X_train, y_train], axis=1)[\"Class\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Statistical measures of the test dataset of whole data" + "### Statistical measures of the test dataset of whole data\n" ] }, { @@ -2311,14 +2312,14 @@ } ], "source": [ - "pd.concat([X_test, y_test], axis=1)['Class'].value_counts()" + "pd.concat([X_test, y_test], axis=1)[\"Class\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Splitting undersampled data into training and test datasets" + "### Splitting undersampled data into training and test datasets\n" ] }, { @@ -2327,14 +2328,16 @@ "metadata": {}, "outputs": [], "source": [ - "X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample, y_undersample, test_size = 0.3, random_state = 0)" + "X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = (\n", + " train_test_split(X_undersample, y_undersample, test_size=0.3, random_state=0)\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Statistical measures of the training dataset of undersampled data" + "### Statistical measures of the training dataset of undersampled data\n" ] }, { @@ -2818,14 +2821,14 @@ } ], "source": [ - "pd.concat([X_train_undersample, y_train_undersample], axis=1)['Class'].value_counts()" + 
"pd.concat([X_train_undersample, y_train_undersample], axis=1)[\"Class\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Statistical measures of the test dataset of undersampled data" + "### Statistical measures of the test dataset of undersampled data\n" ] }, { @@ -3309,7 +3312,7 @@ } ], "source": [ - "pd.concat([X_test_undersample, y_test_undersample], axis=1)['Class'].value_counts()" + "pd.concat([X_test_undersample, y_test_undersample], axis=1)[\"Class\"].value_counts()" ] } ], diff --git a/Jenkinsfile b/Jenkinsfile index 57f76e2..7d93729 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -25,6 +25,7 @@ pipeline { stage('Download dataset') { steps { withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) { + sh 'pip install kaggle' sh 'kaggle datasets download -d mlg-ulb/creditcardfraud' sh 'unzip -o creditcardfraud.zip' sh 'rm creditcardfraud.zip' diff --git a/create-dataset.py b/create-dataset.py index 5ad482c..eb10e44 100644 --- a/create-dataset.py +++ b/create-dataset.py @@ -88,11 +88,8 @@ def save_whole_data(df, X_train, X_test, y_train, y_test): def main(): - # download_kaggle_dataset() os.makedirs("data", exist_ok=True) - os.system - df = load_data("creditcard.csv") df = normalize_data(df) diff --git a/dataset-stats.py b/dataset-stats.py new file mode 100644 index 0000000..41cfe57 --- /dev/null +++ b/dataset-stats.py @@ -0,0 +1,119 @@ +import os +import pandas as pd + + +def write_to_file(file_name): + df = pd.read_csv("data/creditcard.csv") + pd.set_option("display.max_columns", None) + + undersample_data = pd.read_csv("data/undersample_data.csv") + X_test_undersample = pd.read_csv("data/X_test_undersample.csv") + y_test_undersample = pd.read_csv("data/y_test_undersample.csv") + X_train_undersample = pd.read_csv("data/X_train_undersample.csv") + y_train_undersample = pd.read_csv("data/y_train_undersample.csv") + + X_test = pd.read_csv("data/X_test.csv") + y_test = pd.read_csv("data/y_test.csv") + X_train = pd.read_csv("data/X_train.csv") + y_train = pd.read_csv("data/y_train.csv") + + with open("stats_data/" + file_name, "w") as f: + f.write("Check missing values\n") + f.write(str(df.isnull().sum())) + + f.write("\n\n") + + f.write("Size of the dataset\n") + f.write(str(df.info())) + + f.write("\n\n") + + f.write("Summary statistics\n") + f.write(str(df.describe())) + + f.write("\n\n") + + f.write("Distribution of legitimate and fraudulent transactions\n") + f.write(str(df["Class"].value_counts())) + + f.write("\n\n") + + f.write("Size of undersampled dataset\n") + f.write(str(undersample_data.info())) + + f.write("\n\n") + + f.write("Summary statistics of the undersampled dataset\n") + f.write(str(undersample_data.describe())) + + f.write("\n\n") + + f.write( + "Distribution of legitimate and fraudulent transactions in an undersampled dataset\n" + ) + f.write(str(undersample_data["Class"].value_counts())) + + f.write("\n\n") + + f.write("Statistical measures of the training dataset of whole data\n") + f.write(str(pd.concat([X_train, y_train], axis=1).info())) + f.write("\n") + f.write(str(pd.concat([X_train, y_train], axis=1).describe())) + f.write("\n") + f.write(str(pd.concat([X_train, y_train], axis=1)["Class"].value_counts())) + + f.write("\n\n") + + f.write("Statistical measures of the test dataset of whole data\n") + f.write(str(pd.concat([X_test, y_test], axis=1).info())) + f.write("\n") + f.write(str(pd.concat([X_test, y_test], axis=1).describe())) + f.write("\n") + f.write(str(pd.concat([X_test, y_test], 
axis=1)["Class"].value_counts())) + + f.write("\n\n") + + f.write("Statistical measures of the training dataset of undersampled data\n") + f.write( + str(pd.concat([X_train_undersample, y_train_undersample], axis=1).info()) + ) + f.write("\n") + f.write( + str( + pd.concat([X_train_undersample, y_train_undersample], axis=1).describe() + ) + ) + f.write("\n") + f.write( + str( + pd.concat([X_train_undersample, y_train_undersample], axis=1)[ + "Class" + ].value_counts() + ) + ) + + f.write("\n\n") + + f.write("Statistical measures of the test dataset of undersampled data\n") + f.write(str(pd.concat([X_test_undersample, y_test_undersample], axis=1).info())) + f.write("\n") + f.write( + str(pd.concat([X_test_undersample, y_test_undersample], axis=1).describe()) + ) + f.write("\n") + f.write( + str( + pd.concat([X_test_undersample, y_test_undersample], axis=1)[ + "Class" + ].value_counts() + ) + ) + + +def main(): + os.makedirs("stats_data", exist_ok=True) + write_to_file("stats.txt") + + +if __name__ == "__main__": + main() diff --git a/stats.sh b/dataset-stats.sh similarity index 100% rename from stats.sh rename to dataset-stats.sh diff --git a/stats/Dockerfile b/stats/Dockerfile new file mode 100644 index 0000000..3dc72dd --- /dev/null +++ b/stats/Dockerfile @@ -0,0 +1,5 @@ +FROM ubuntu:latest + +RUN apt update && apt install -y python3-pip + +RUN pip install pandas diff --git a/stats/Jenkinsfile b/stats/Jenkinsfile index 6610ecd..5559a9d 100644 --- a/stats/Jenkinsfile +++ b/stats/Jenkinsfile @@ -1,5 +1,7 @@ pipeline { - agent any + agent docker { + image 'mateusz887/ium:latest' + } parameters { buildSelector( @@ -17,13 +19,13 @@ pipeline { } stage('Copy Artifacts') { steps { - copyArtifacts filter: 'data/*', projectName: 'z-s464913-create-dataset', selector: buildParameter('BUILD_SELECTOR') + copyArtifacts filter: 'data/*', projectName: 'z-s464913-create-dataset-1', selector: buildParameter('BUILD_SELECTOR') } } stage('Generate Report') { steps { - sh 'chmod +x stats.sh' - sh './stats.sh' + sh 'chmod +x dataset_stats.py' + sh './dataset_stats.py' } } stage('Archive Artifacts') {