{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## IUM 2\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Installation of packages\n" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: kaggle in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (1.6.6)\n", "Requirement already satisfied: six>=1.10 in c:\\users\\skype\\appdata\\roaming\\python\\python312\\site-packages (from kaggle) (1.16.0)\n", "Requirement already satisfied: certifi in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (2024.2.2)\n", "Requirement already satisfied: python-dateutil in c:\\users\\skype\\appdata\\roaming\\python\\python312\\site-packages (from kaggle) (2.9.0.post0)\n", "Requirement already satisfied: requests in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (2.31.0)\n", "Requirement already satisfied: tqdm in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (4.66.2)\n", "Requirement already satisfied: python-slugify in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (8.0.4)\n", "Requirement already satisfied: urllib3 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (2.2.1)\n", "Requirement already satisfied: bleach in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (6.1.0)\n", "Requirement already satisfied: webencodings in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from bleach->kaggle) (0.5.1)\n", "Requirement already satisfied: text-unidecode>=1.3 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from python-slugify->kaggle) (1.3)\n", 
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from requests->kaggle) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from requests->kaggle) (3.6)\n", "Requirement already satisfied: colorama in c:\\users\\skype\\appdata\\roaming\\python\\python312\\site-packages (from tqdm->kaggle) (0.4.6)\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: pandas in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (2.2.1)\n", "Requirement already satisfied: numpy<2,>=1.26.0 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (1.26.3)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\skype\\appdata\\roaming\\python\\python312\\site-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n", "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\skype\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: numpy in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (1.26.3)\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: scikit-learn in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (1.4.1.post1)\n", "Requirement already satisfied: numpy<2.0,>=1.19.5 in 
c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn) (1.26.3)\n", "Requirement already satisfied: scipy>=1.6.0 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn) (1.12.0)\n", "Requirement already satisfied: joblib>=1.2.0 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn) (1.3.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn) (3.3.0)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# Versions are pinned to the ones recorded in this cell's output so that a\n", "# fresh environment reproduces the same library behaviour.\n", "%pip install kaggle==1.6.6\n", "%pip install pandas==2.2.1\n", "%pip install numpy==1.26.3\n", "%pip install scikit-learn==1.4.1.post1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Importing libraries\n" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "# To preprocess the data\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "# To split the data\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Downloading a dataset\n" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "creditcardfraud.zip: Skipping, found more recently modified local copy (use --force to force download)\n" ] } ], "source": [ "!kaggle datasets download -d mlg-ulb/creditcardfraud" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Uncompress a file\n" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Archive: creditcardfraud.zip\n", " inflating: creditcard.csv \n" ] } ], "source": [ "!unzip -o creditcardfraud.zip" ] }, { "cell_type": "markdown", "metadata": 
{}, "source": [ "### Load the data\n" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "# Show every column whenever a frame is displayed instead of eliding the middle ones.\n", "pd.set_option(\"display.max_columns\", None)\n", "\n", "# Read the transactions from the CSV extracted from the downloaded archive.\n", "df = pd.read_csv(\"creditcard.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check missing values\n" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Time 0\n", "V1 0\n", "V2 0\n", "V3 0\n", "V4 0\n", "V5 0\n", "V6 0\n", "V7 0\n", "V8 0\n", "V9 0\n", "V10 0\n", "V11 0\n", "V12 0\n", "V13 0\n", "V14 0\n", "V15 0\n", "V16 0\n", "V17 0\n", "V18 0\n", "V19 0\n", "V20 0\n", "V21 0\n", "V22 0\n", "V23 0\n", "V24 0\n", "V25 0\n", "V26 0\n", "V27 0\n", "V28 0\n", "Amount 0\n", "Class 0\n", "dtype: int64" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Count the missing entries in each column.\n", "df.isna().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Size of the dataset\n" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 284807 entries, 0 to 284806\n", "Data columns (total 31 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Time 284807 non-null float64\n", " 1 V1 284807 non-null float64\n", " 2 V2 284807 non-null float64\n", " 3 V3 284807 non-null float64\n", " 4 V4 284807 non-null float64\n", " 5 V5 284807 non-null float64\n", " 6 V6 284807 non-null float64\n", " 7 V7 284807 non-null float64\n", " 8 V8 284807 non-null float64\n", " 9 V9 284807 non-null float64\n", " 10 V10 284807 non-null float64\n", " 11 V11 284807 non-null float64\n", " 12 V12 284807 non-null float64\n", " 13 V13 284807 non-null float64\n", " 14 V14 284807 non-null float64\n", " 15 V15 284807 non-null float64\n", " 16 V16 284807 non-null float64\n", " 17 V17 284807 non-null float64\n", " 18 V18 284807 non-null float64\n", " 19 V19 284807 non-null float64\n", " 20 
V20 284807 non-null float64\n", " 21 V21 284807 non-null float64\n", " 22 V22 284807 non-null float64\n", " 23 V23 284807 non-null float64\n", " 24 V24 284807 non-null float64\n", " 25 V25 284807 non-null float64\n", " 26 V26 284807 non-null float64\n", " 27 V27 284807 non-null float64\n", " 28 V28 284807 non-null float64\n", " 29 Amount 284807 non-null float64\n", " 30 Class 284807 non-null int64 \n", "dtypes: float64(30), int64(1)\n", "memory usage: 67.4 MB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Normalising the data\n" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "# Standardise Amount to zero mean and unit variance. StandardScaler expects a\n", "# 2-D input, hence the reshape to a single column.\n", "# NOTE(review): the scaler is fitted on the full data set before the train/test\n", "# split further down, so test-set statistics influence it - confirm this is intended.\n", "scaler = StandardScaler()\n", "amount_column = df[\"Amount\"].to_numpy().reshape(-1, 1)\n", "df[\"Amount\"] = scaler.fit_transform(amount_column)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Summary statistics\n" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
count284807.0000002.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+05284807.000000
mean94813.8595751.168375e-153.416908e-16-1.379537e-152.074095e-159.604066e-161.487313e-15-5.556467e-161.213481e-16-2.406331e-152.239053e-151.673327e-15-1.247012e-158.190001e-161.207294e-154.887456e-151.437716e-15-3.772171e-169.564149e-161.039917e-156.406204e-161.654067e-16-3.568593e-162.578648e-164.473266e-155.340915e-161.683437e-15-3.660091e-16-1.227390e-162.913952e-170.001727
std47488.1459551.958696e+001.651309e+001.516255e+001.415869e+001.380247e+001.332271e+001.237094e+001.194353e+001.098632e+001.088850e+001.020713e+009.992014e-019.952742e-019.585956e-019.153160e-018.762529e-018.493371e-018.381762e-018.140405e-017.709250e-017.345240e-017.257016e-016.244603e-016.056471e-015.212781e-014.822270e-014.036325e-013.300833e-011.000002e+000.041527
min0.000000-5.640751e+01-7.271573e+01-4.832559e+01-5.683171e+00-1.137433e+02-2.616051e+01-4.355724e+01-7.321672e+01-1.343407e+01-2.458826e+01-4.797473e+00-1.868371e+01-5.791881e+00-1.921433e+01-4.498945e+00-1.412985e+01-2.516280e+01-9.498746e+00-7.213527e+00-5.449772e+01-3.483038e+01-1.093314e+01-4.480774e+01-2.836627e+00-1.029540e+01-2.604551e+00-2.256568e+01-1.543008e+01-3.532294e-010.000000
25%54201.500000-9.203734e-01-5.985499e-01-8.903648e-01-8.486401e-01-6.915971e-01-7.682956e-01-5.540759e-01-2.086297e-01-6.430976e-01-5.354257e-01-7.624942e-01-4.055715e-01-6.485393e-01-4.255740e-01-5.828843e-01-4.680368e-01-4.837483e-01-4.988498e-01-4.562989e-01-2.117214e-01-2.283949e-01-5.423504e-01-1.618463e-01-3.545861e-01-3.171451e-01-3.269839e-01-7.083953e-02-5.295979e-02-3.308401e-010.000000
50%84692.0000001.810880e-026.548556e-021.798463e-01-1.984653e-02-5.433583e-02-2.741871e-014.010308e-022.235804e-02-5.142873e-02-9.291738e-02-3.275735e-021.400326e-01-1.356806e-025.060132e-024.807155e-026.641332e-02-6.567575e-02-3.636312e-033.734823e-03-6.248109e-02-2.945017e-026.781943e-03-1.119293e-024.097606e-021.659350e-02-5.213911e-021.342146e-031.124383e-02-2.652715e-010.000000
75%139320.5000001.315642e+008.037239e-011.027196e+007.433413e-016.119264e-013.985649e-015.704361e-013.273459e-015.971390e-014.539234e-017.395934e-016.182380e-016.625050e-014.931498e-016.488208e-015.232963e-013.996750e-015.008067e-014.589494e-011.330408e-011.863772e-015.285536e-011.476421e-014.395266e-013.507156e-012.409522e-019.104512e-027.827995e-02-4.471707e-020.000000
max172792.0000002.454930e+002.205773e+019.382558e+001.687534e+013.480167e+017.330163e+011.205895e+022.000721e+011.559499e+012.374514e+011.201891e+017.848392e+007.126883e+001.052677e+018.877742e+001.731511e+019.253526e+005.041069e+005.591971e+003.942090e+012.720284e+011.050309e+012.252841e+014.584549e+007.519589e+003.517346e+003.161220e+013.384781e+011.023622e+021.000000
\n", "
" ], "text/plain": [ " Time V1 V2 V3 V4 \\\n", "count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", "mean 94813.859575 1.168375e-15 3.416908e-16 -1.379537e-15 2.074095e-15 \n", "std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 \n", "min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 \n", "25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 \n", "50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 \n", "75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 \n", "max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 \n", "\n", " V5 V6 V7 V8 V9 \\\n", "count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", "mean 9.604066e-16 1.487313e-15 -5.556467e-16 1.213481e-16 -2.406331e-15 \n", "std 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 \n", "min -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 \n", "25% -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 \n", "50% -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 \n", "75% 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 \n", "max 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 \n", "\n", " V10 V11 V12 V13 V14 \\\n", "count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", "mean 2.239053e-15 1.673327e-15 -1.247012e-15 8.190001e-16 1.207294e-15 \n", "std 1.088850e+00 1.020713e+00 9.992014e-01 9.952742e-01 9.585956e-01 \n", "min -2.458826e+01 -4.797473e+00 -1.868371e+01 -5.791881e+00 -1.921433e+01 \n", "25% -5.354257e-01 -7.624942e-01 -4.055715e-01 -6.485393e-01 -4.255740e-01 \n", "50% -9.291738e-02 -3.275735e-02 1.400326e-01 -1.356806e-02 5.060132e-02 \n", "75% 4.539234e-01 7.395934e-01 6.182380e-01 6.625050e-01 4.931498e-01 \n", "max 2.374514e+01 1.201891e+01 7.848392e+00 7.126883e+00 1.052677e+01 \n", "\n", " V15 V16 V17 V18 V19 \\\n", 
"count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", "mean 4.887456e-15 1.437716e-15 -3.772171e-16 9.564149e-16 1.039917e-15 \n", "std 9.153160e-01 8.762529e-01 8.493371e-01 8.381762e-01 8.140405e-01 \n", "min -4.498945e+00 -1.412985e+01 -2.516280e+01 -9.498746e+00 -7.213527e+00 \n", "25% -5.828843e-01 -4.680368e-01 -4.837483e-01 -4.988498e-01 -4.562989e-01 \n", "50% 4.807155e-02 6.641332e-02 -6.567575e-02 -3.636312e-03 3.734823e-03 \n", "75% 6.488208e-01 5.232963e-01 3.996750e-01 5.008067e-01 4.589494e-01 \n", "max 8.877742e+00 1.731511e+01 9.253526e+00 5.041069e+00 5.591971e+00 \n", "\n", " V20 V21 V22 V23 V24 \\\n", "count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", "mean 6.406204e-16 1.654067e-16 -3.568593e-16 2.578648e-16 4.473266e-15 \n", "std 7.709250e-01 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 \n", "min -5.449772e+01 -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 \n", "25% -2.117214e-01 -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 \n", "50% -6.248109e-02 -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 \n", "75% 1.330408e-01 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 \n", "max 3.942090e+01 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 \n", "\n", " V25 V26 V27 V28 Amount \\\n", "count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", "mean 5.340915e-16 1.683437e-15 -3.660091e-16 -1.227390e-16 2.913952e-17 \n", "std 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 1.000002e+00 \n", "min -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 -3.532294e-01 \n", "25% -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 -3.308401e-01 \n", "50% 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 -2.652715e-01 \n", "75% 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 -4.471707e-02 \n", "max 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 1.023622e+02 \n", "\n", " Class \n", "count 284807.000000 \n", "mean 0.001727 \n", "std 
0.041527 \n", "min 0.000000 \n", "25% 0.000000 \n", "50% 0.000000 \n", "75% 0.000000 \n", "max 1.000000 " ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Distribution of legitimate and fraudulent transactions\n" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Class\n", "0 284315\n", "1 492\n", "Name: count, dtype: int64" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"Class\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Undersampling the data\n", "\n", "We will employ undersampling as one class significantly dominates the other.\n" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "# Index labels of the minority (fraud) class, and how many there are.\n", "fraud_indices = df.index[df[\"Class\"] == 1].to_numpy()\n", "fraud_count = len(fraud_indices)\n", "\n", "# Index labels of the majority (legitimate) class.\n", "normal_indices = df.index[df[\"Class\"] == 0]\n", "\n", "# Draw as many legitimate transactions as there are frauds, without replacement.\n", "# A seeded generator keeps the sample (and everything derived from it)\n", "# reproducible when the notebook is restarted and re-run.\n", "undersample_rng = np.random.default_rng(0)\n", "random_normal_indices = undersample_rng.choice(\n", "    normal_indices.to_numpy(), size=fraud_count, replace=False\n", ")\n", "\n", "# Combine the labels of both classes.\n", "undersample_indice = np.concatenate([fraud_indices, random_normal_indices])\n", "\n", "# The arrays above hold index *labels*, so select rows with .loc. Positional\n", "# .iloc only picks the same rows while df keeps its default RangeIndex.\n", "undersample_data = df.loc[undersample_indice, :]\n", "\n", "# Features are every column except the target; the target stays a one-column frame.\n", "X_undersample = undersample_data.drop(columns=\"Class\")\n", "y_undersample = undersample_data[[\"Class\"]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Size of undersampled dataset\n" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "\n", "Index: 984 entries, 541 to 141412\n", "Data columns (total 31 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Time 984 non-null float64\n", " 1 V1 984 non-null float64\n", " 2 V2 984 non-null float64\n", " 3 V3 984 non-null float64\n", " 4 V4 984 non-null float64\n", " 5 V5 984 non-null float64\n", " 6 V6 984 non-null float64\n", " 7 V7 984 non-null float64\n", " 8 V8 984 non-null float64\n", " 9 V9 984 non-null float64\n", " 10 V10 984 non-null float64\n", " 11 V11 984 non-null float64\n", " 12 V12 984 non-null float64\n", " 13 V13 984 non-null float64\n", " 14 V14 984 non-null float64\n", " 15 V15 984 non-null float64\n", " 16 V16 984 non-null float64\n", " 17 V17 984 non-null float64\n", " 18 V18 984 non-null float64\n", " 19 V19 984 non-null float64\n", " 20 V20 984 non-null float64\n", " 21 V21 984 non-null float64\n", " 22 V22 984 non-null float64\n", " 23 V23 984 non-null float64\n", " 24 V24 984 non-null float64\n", " 25 V25 984 non-null float64\n", " 26 V26 984 non-null float64\n", " 27 V27 984 non-null float64\n", " 28 V28 984 non-null float64\n", " 29 Amount 984 non-null float64\n", " 30 Class 984 non-null int64 \n", "dtypes: float64(30), int64(1)\n", "memory usage: 246.0 KB\n" ] } ], "source": [ "undersample_data.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Summary statistics of the undersampled dataset\n" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
count984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000984.000000
mean88501.498984-2.4450791.781022-3.5094062.214004-1.477993-0.713150-2.7874270.279073-1.253108-2.8415001.930697-3.124120-0.026229-3.502384-0.039494-2.097294-3.304208-1.1289500.3436680.1759050.3319110.049631-0.031264-0.0373890.0228120.0276320.0862860.0467380.0396760.500000
std48996.2694455.5123523.7132326.2230013.2310764.2746321.7893505.8561974.8576432.3710554.5630672.7647454.5951031.0543774.6532021.0029113.4656195.9900332.4120321.2909731.1262582.7878841.1670971.1775620.5515180.6775410.4764801.0233320.4791680.8518000.500254
min60.000000-30.552380-15.799625-31.103685-3.863126-22.105532-10.261990-43.557242-41.044261-13.434066-24.588262-2.613374-18.683715-3.223045-19.214325-4.498945-14.129855-25.162799-9.498746-3.681904-7.242879-22.797604-8.887017-19.254328-2.028024-4.781606-1.214960-7.263482-2.735623-0.3532290.000000
25%45531.000000-2.867222-0.155438-5.084967-0.172018-1.700260-1.619179-3.066415-0.204192-2.279453-4.572043-0.187147-5.495221-0.784589-6.721799-0.627097-3.543426-5.302111-1.809496-0.412430-0.187708-0.157259-0.509376-0.240064-0.379825-0.321251-0.281187-0.061809-0.050194-0.3473020.000000
50%83076.500000-0.8232440.957399-1.3819981.287041-0.394605-0.689473-0.6683210.147397-0.694910-0.9484411.170286-0.858094-0.000686-1.110717-0.006070-0.677801-0.513640-0.3830380.2210490.0406300.1554040.080270-0.0303180.0093790.049923-0.0074750.0631000.039464-0.2809840.500000
75%135051.5000000.9194442.7915690.3569114.1753320.6163050.0696200.2650890.8770020.134399-0.0160473.5865020.1903560.6839770.1105410.6729030.2503530.3138410.3349270.9787540.4456160.6427240.6249480.1807350.3656240.3950010.3240590.4571940.2264920.0465391.000000
max172733.0000002.33583322.0577293.47626812.11467214.1039186.4741155.80253720.0072086.81673211.73292612.0189132.5348763.0913283.4424222.4713583.1396566.7393843.7903165.22834211.05900427.2028398.3619855.4662301.2081412.2082092.7452613.0523584.9757928.1461821.000000
\n", "
" ], "text/plain": [ " Time V1 V2 V3 V4 \\\n", "count 984.000000 984.000000 984.000000 984.000000 984.000000 \n", "mean 88501.498984 -2.445079 1.781022 -3.509406 2.214004 \n", "std 48996.269445 5.512352 3.713232 6.223001 3.231076 \n", "min 60.000000 -30.552380 -15.799625 -31.103685 -3.863126 \n", "25% 45531.000000 -2.867222 -0.155438 -5.084967 -0.172018 \n", "50% 83076.500000 -0.823244 0.957399 -1.381998 1.287041 \n", "75% 135051.500000 0.919444 2.791569 0.356911 4.175332 \n", "max 172733.000000 2.335833 22.057729 3.476268 12.114672 \n", "\n", " V5 V6 V7 V8 V9 V10 \\\n", "count 984.000000 984.000000 984.000000 984.000000 984.000000 984.000000 \n", "mean -1.477993 -0.713150 -2.787427 0.279073 -1.253108 -2.841500 \n", "std 4.274632 1.789350 5.856197 4.857643 2.371055 4.563067 \n", "min -22.105532 -10.261990 -43.557242 -41.044261 -13.434066 -24.588262 \n", "25% -1.700260 -1.619179 -3.066415 -0.204192 -2.279453 -4.572043 \n", "50% -0.394605 -0.689473 -0.668321 0.147397 -0.694910 -0.948441 \n", "75% 0.616305 0.069620 0.265089 0.877002 0.134399 -0.016047 \n", "max 14.103918 6.474115 5.802537 20.007208 6.816732 11.732926 \n", "\n", " V11 V12 V13 V14 V15 V16 \\\n", "count 984.000000 984.000000 984.000000 984.000000 984.000000 984.000000 \n", "mean 1.930697 -3.124120 -0.026229 -3.502384 -0.039494 -2.097294 \n", "std 2.764745 4.595103 1.054377 4.653202 1.002911 3.465619 \n", "min -2.613374 -18.683715 -3.223045 -19.214325 -4.498945 -14.129855 \n", "25% -0.187147 -5.495221 -0.784589 -6.721799 -0.627097 -3.543426 \n", "50% 1.170286 -0.858094 -0.000686 -1.110717 -0.006070 -0.677801 \n", "75% 3.586502 0.190356 0.683977 0.110541 0.672903 0.250353 \n", "max 12.018913 2.534876 3.091328 3.442422 2.471358 3.139656 \n", "\n", " V17 V18 V19 V20 V21 V22 \\\n", "count 984.000000 984.000000 984.000000 984.000000 984.000000 984.000000 \n", "mean -3.304208 -1.128950 0.343668 0.175905 0.331911 0.049631 \n", "std 5.990033 2.412032 1.290973 1.126258 2.787884 1.167097 \n", "min -25.162799 
-9.498746 -3.681904 -7.242879 -22.797604 -8.887017 \n", "25% -5.302111 -1.809496 -0.412430 -0.187708 -0.157259 -0.509376 \n", "50% -0.513640 -0.383038 0.221049 0.040630 0.155404 0.080270 \n", "75% 0.313841 0.334927 0.978754 0.445616 0.642724 0.624948 \n", "max 6.739384 3.790316 5.228342 11.059004 27.202839 8.361985 \n", "\n", " V23 V24 V25 V26 V27 V28 \\\n", "count 984.000000 984.000000 984.000000 984.000000 984.000000 984.000000 \n", "mean -0.031264 -0.037389 0.022812 0.027632 0.086286 0.046738 \n", "std 1.177562 0.551518 0.677541 0.476480 1.023332 0.479168 \n", "min -19.254328 -2.028024 -4.781606 -1.214960 -7.263482 -2.735623 \n", "25% -0.240064 -0.379825 -0.321251 -0.281187 -0.061809 -0.050194 \n", "50% -0.030318 0.009379 0.049923 -0.007475 0.063100 0.039464 \n", "75% 0.180735 0.365624 0.395001 0.324059 0.457194 0.226492 \n", "max 5.466230 1.208141 2.208209 2.745261 3.052358 4.975792 \n", "\n", " Amount Class \n", "count 984.000000 984.000000 \n", "mean 0.039676 0.500000 \n", "std 0.851800 0.500254 \n", "min -0.353229 0.000000 \n", "25% -0.347302 0.000000 \n", "50% -0.280984 0.500000 \n", "75% 0.046539 1.000000 \n", "max 8.146182 1.000000 " ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "undersample_data.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Distribution of legitimate and fraudulent transactions in an undersampled dataset\n" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Class\n", "1 492\n", "0 492\n", "Name: count, dtype: int64" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "undersample_data[\"Class\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Splitting whole data into training and test datasets\n" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [
"# Separate the predictors from the target; the target stays a one-column frame.\n", "X = df.drop(columns=\"Class\")\n", "y = df[[\"Class\"]]\n", "\n", "# Hold out 30% of the rows for testing; random_state fixes the shuffle.\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    X, y, test_size=0.3, random_state=0\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Statistical measures of the training dataset of whole data\n" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 199364 entries, 161145 to 117952\n", "Data columns (total 31 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Time 199364 non-null float64\n", " 1 V1 199364 non-null float64\n", " 2 V2 199364 non-null float64\n", " 3 V3 199364 non-null float64\n", " 4 V4 199364 non-null float64\n", " 5 V5 199364 non-null float64\n", " 6 V6 199364 non-null float64\n", " 7 V7 199364 non-null float64\n", " 8 V8 199364 non-null float64\n", " 9 V9 199364 non-null float64\n", " 10 V10 199364 non-null float64\n", " 11 V11 199364 non-null float64\n", " 12 V12 199364 non-null float64\n", " 13 V13 199364 non-null float64\n", " 14 V14 199364 non-null float64\n", " 15 V15 199364 non-null float64\n", " 16 V16 199364 non-null float64\n", " 17 V17 199364 non-null float64\n", " 18 V18 199364 non-null float64\n", " 19 V19 199364 non-null float64\n", " 20 V20 199364 non-null float64\n", " 21 V21 199364 non-null float64\n", " 22 V22 199364 non-null float64\n", " 23 V23 199364 non-null float64\n", " 24 V24 199364 non-null float64\n", " 25 V25 199364 non-null float64\n", " 26 V26 199364 non-null float64\n", " 27 V27 199364 non-null float64\n", " 28 V28 199364 non-null float64\n", " 29 Amount 199364 non-null float64\n", " 30 Class 199364 non-null int64 \n", "dtypes: float64(30), int64(1)\n", "memory usage: 48.7 MB\n" ] } ], "source": [ "pd.concat([X_train, y_train], axis=1).info()" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
count199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000199364.000000
mean94799.4939360.000315-0.002690-0.0015320.000721-0.001494-0.000210-0.000870-0.0019800.0002120.001357-0.001039-0.0015650.0006930.0001370.0003220.0000840.000292-0.0001340.0004900.000430-0.000014-0.000022-0.0002580.0003620.000395-0.000094-0.0000270.0000150.0012710.001731
std47499.8354911.9635541.6573791.5167161.4171381.3687441.3286731.2260181.2123381.1020211.0928011.0200270.9965260.9977180.9569380.9161430.8761310.8521810.8375560.8145060.7702570.7434500.7276250.6291450.6052980.5211750.4818420.4010420.3248490.9839480.041563
min0.000000-46.855047-63.344698-33.680984-5.560118-42.147898-23.496714-43.557242-73.216718-13.434066-24.588262-4.797473-17.769143-5.791881-19.214325-4.498945-14.129855-25.162799-9.498746-7.213527-23.646890-34.830382-10.933144-44.807735-2.822684-10.295397-2.534330-22.565679-11.710896-0.3532290.000000
25%54126.000000-0.921539-0.601213-0.892838-0.848835-0.692874-0.769177-0.554220-0.209086-0.644753-0.535493-0.762852-0.407660-0.648456-0.425122-0.583616-0.467945-0.484055-0.498850-0.456800-0.211662-0.229272-0.544345-0.162021-0.354179-0.316088-0.327327-0.070864-0.052907-0.3306400.000000
50%84633.5000000.0197050.0637840.177888-0.017852-0.055832-0.2743970.0392280.021803-0.049633-0.092069-0.0341350.137912-0.0134160.0511790.0492890.067772-0.065113-0.0032170.004422-0.062889-0.0290450.006744-0.0109150.0409740.018014-0.0522870.0010640.011119-0.2652710.000000
75%139334.2500001.3167070.8024371.0255290.7455660.6093490.3979280.5696380.3270230.5970960.4581290.7381430.6173930.6641480.4939250.6495890.5230950.4010340.5004360.4603670.1328340.1870950.5310170.1475030.4389530.3508020.2410820.0904910.077989-0.0430580.000000
max172792.0000002.45188822.0577299.38255816.71553734.09930923.91783744.05446120.00720815.59499523.74513612.0189137.8483924.56900910.5267665.8256547.0591329.2070595.0410695.57211339.42090427.20283910.50309022.5284124.0228667.5195893.46324612.15240122.62007278.2352721.000000
\n", "
" ], "text/plain": [ " Time V1 V2 V3 \\\n", "count 199364.000000 199364.000000 199364.000000 199364.000000 \n", "mean 94799.493936 0.000315 -0.002690 -0.001532 \n", "std 47499.835491 1.963554 1.657379 1.516716 \n", "min 0.000000 -46.855047 -63.344698 -33.680984 \n", "25% 54126.000000 -0.921539 -0.601213 -0.892838 \n", "50% 84633.500000 0.019705 0.063784 0.177888 \n", "75% 139334.250000 1.316707 0.802437 1.025529 \n", "max 172792.000000 2.451888 22.057729 9.382558 \n", "\n", " V4 V5 V6 V7 \\\n", "count 199364.000000 199364.000000 199364.000000 199364.000000 \n", "mean 0.000721 -0.001494 -0.000210 -0.000870 \n", "std 1.417138 1.368744 1.328673 1.226018 \n", "min -5.560118 -42.147898 -23.496714 -43.557242 \n", "25% -0.848835 -0.692874 -0.769177 -0.554220 \n", "50% -0.017852 -0.055832 -0.274397 0.039228 \n", "75% 0.745566 0.609349 0.397928 0.569638 \n", "max 16.715537 34.099309 23.917837 44.054461 \n", "\n", " V8 V9 V10 V11 \\\n", "count 199364.000000 199364.000000 199364.000000 199364.000000 \n", "mean -0.001980 0.000212 0.001357 -0.001039 \n", "std 1.212338 1.102021 1.092801 1.020027 \n", "min -73.216718 -13.434066 -24.588262 -4.797473 \n", "25% -0.209086 -0.644753 -0.535493 -0.762852 \n", "50% 0.021803 -0.049633 -0.092069 -0.034135 \n", "75% 0.327023 0.597096 0.458129 0.738143 \n", "max 20.007208 15.594995 23.745136 12.018913 \n", "\n", " V12 V13 V14 V15 \\\n", "count 199364.000000 199364.000000 199364.000000 199364.000000 \n", "mean -0.001565 0.000693 0.000137 0.000322 \n", "std 0.996526 0.997718 0.956938 0.916143 \n", "min -17.769143 -5.791881 -19.214325 -4.498945 \n", "25% -0.407660 -0.648456 -0.425122 -0.583616 \n", "50% 0.137912 -0.013416 0.051179 0.049289 \n", "75% 0.617393 0.664148 0.493925 0.649589 \n", "max 7.848392 4.569009 10.526766 5.825654 \n", "\n", " V16 V17 V18 V19 \\\n", "count 199364.000000 199364.000000 199364.000000 199364.000000 \n", "mean 0.000084 0.000292 -0.000134 0.000490 \n", "std 0.876131 0.852181 0.837556 0.814506 \n", "min -14.129855 
-25.162799 -9.498746 -7.213527 \n", "25% -0.467945 -0.484055 -0.498850 -0.456800 \n", "50% 0.067772 -0.065113 -0.003217 0.004422 \n", "75% 0.523095 0.401034 0.500436 0.460367 \n", "max 7.059132 9.207059 5.041069 5.572113 \n", "\n", " V20 V21 V22 V23 \\\n", "count 199364.000000 199364.000000 199364.000000 199364.000000 \n", "mean 0.000430 -0.000014 -0.000022 -0.000258 \n", "std 0.770257 0.743450 0.727625 0.629145 \n", "min -23.646890 -34.830382 -10.933144 -44.807735 \n", "25% -0.211662 -0.229272 -0.544345 -0.162021 \n", "50% -0.062889 -0.029045 0.006744 -0.010915 \n", "75% 0.132834 0.187095 0.531017 0.147503 \n", "max 39.420904 27.202839 10.503090 22.528412 \n", "\n", " V24 V25 V26 V27 \\\n", "count 199364.000000 199364.000000 199364.000000 199364.000000 \n", "mean 0.000362 0.000395 -0.000094 -0.000027 \n", "std 0.605298 0.521175 0.481842 0.401042 \n", "min -2.822684 -10.295397 -2.534330 -22.565679 \n", "25% -0.354179 -0.316088 -0.327327 -0.070864 \n", "50% 0.040974 0.018014 -0.052287 0.001064 \n", "75% 0.438953 0.350802 0.241082 0.090491 \n", "max 4.022866 7.519589 3.463246 12.152401 \n", "\n", " V28 Amount Class \n", "count 199364.000000 199364.000000 199364.000000 \n", "mean 0.000015 0.001271 0.001731 \n", "std 0.324849 0.983948 0.041563 \n", "min -11.710896 -0.353229 0.000000 \n", "25% -0.052907 -0.330640 0.000000 \n", "50% 0.011119 -0.265271 0.000000 \n", "75% 0.077989 -0.043058 0.000000 \n", "max 22.620072 78.235272 1.000000 " ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([X_train, y_train], axis=1).describe()" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Class\n", "0 199019\n", "1 345\n", "Name: count, dtype: int64" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([X_train, y_train], axis=1)[\"Class\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 
Statistical measures of the test dataset of whole data\n" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 85443 entries, 183484 to 240913\n", "Data columns (total 31 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Time 85443 non-null float64\n", " 1 V1 85443 non-null float64\n", " 2 V2 85443 non-null float64\n", " 3 V3 85443 non-null float64\n", " 4 V4 85443 non-null float64\n", " 5 V5 85443 non-null float64\n", " 6 V6 85443 non-null float64\n", " 7 V7 85443 non-null float64\n", " 8 V8 85443 non-null float64\n", " 9 V9 85443 non-null float64\n", " 10 V10 85443 non-null float64\n", " 11 V11 85443 non-null float64\n", " 12 V12 85443 non-null float64\n", " 13 V13 85443 non-null float64\n", " 14 V14 85443 non-null float64\n", " 15 V15 85443 non-null float64\n", " 16 V16 85443 non-null float64\n", " 17 V17 85443 non-null float64\n", " 18 V18 85443 non-null float64\n", " 19 V19 85443 non-null float64\n", " 20 V20 85443 non-null float64\n", " 21 V21 85443 non-null float64\n", " 22 V22 85443 non-null float64\n", " 23 V23 85443 non-null float64\n", " 24 V24 85443 non-null float64\n", " 25 V25 85443 non-null float64\n", " 26 V26 85443 non-null float64\n", " 27 V27 85443 non-null float64\n", " 28 V28 85443 non-null float64\n", " 29 Amount 85443 non-null float64\n", " 30 Class 85443 non-null int64 \n", "dtypes: float64(30), int64(1)\n", "memory usage: 20.9 MB\n" ] } ], "source": [ "pd.concat([X_test, y_test], axis=1).info()" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
count85443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.00000085443.000000
mean94847.378896-0.0007340.0062770.003574-0.0016820.0034860.0004890.0020300.004620-0.000495-0.0031670.0024240.003652-0.001616-0.000319-0.000751-0.000195-0.0006820.000312-0.001144-0.0010040.0000330.0000520.000602-0.000845-0.0009220.0002200.000062-0.000036-0.0029660.001720
std47461.1205481.9473251.6370501.5151821.4129081.4067221.3406361.2625621.1512911.0906911.0795741.0223151.0054130.9895530.9624570.9133880.8765420.8426690.8396260.8129570.7724840.7132660.7211980.6133940.6064640.5215200.4831260.4096160.3419871.0364920.041443
min0.000000-56.407510-72.715728-48.325589-5.683171-113.743307-26.160506-28.215112-50.943369-9.481456-20.949192-4.568390-18.683715-3.888606-18.493773-4.391307-13.303888-22.883999-9.287832-6.938297-54.497720-22.665685-9.499423-32.828995-2.836627-8.696627-2.604551-9.793568-15.430084-0.3532290.000000
25%54354.000000-0.916858-0.591858-0.883828-0.848202-0.688280-0.766664-0.553479-0.207216-0.638926-0.535400-0.761716-0.400087-0.648761-0.426516-0.581015-0.468312-0.483139-0.498660-0.455027-0.211881-0.226184-0.537704-0.161490-0.355671-0.319736-0.326068-0.070797-0.053129-0.3312800.000000
50%84850.0000000.0132380.0701850.185047-0.024109-0.051627-0.2736860.0423430.023782-0.053821-0.094949-0.0291290.144948-0.0138030.0492480.0452910.062957-0.066955-0.0042450.002229-0.061529-0.0306870.006971-0.0117890.0409760.013508-0.0516950.0019840.011561-0.2652710.000000
75%139277.5000001.3132570.8066151.0311550.7377840.6180670.3998640.5724230.3283370.5973880.4431260.7435110.6206940.6578260.4919160.6471170.5236080.3967990.5014550.4552490.1336080.1848460.5236890.1479230.4410930.3506170.2406570.0922240.078900-0.0473560.000000
max172788.0000002.45493015.8769234.07916816.87534434.80166673.301626120.58949418.7488729.27237615.33174211.6692054.4063387.1268837.4395668.87774217.3151129.2535264.7123985.59197138.11720922.5797147.22015820.8033444.5845495.8261593.51734631.61219833.847808102.3622431.000000
\n", "
" ], "text/plain": [ " Time V1 V2 V3 V4 \\\n", "count 85443.000000 85443.000000 85443.000000 85443.000000 85443.000000 \n", "mean 94847.378896 -0.000734 0.006277 0.003574 -0.001682 \n", "std 47461.120548 1.947325 1.637050 1.515182 1.412908 \n", "min 0.000000 -56.407510 -72.715728 -48.325589 -5.683171 \n", "25% 54354.000000 -0.916858 -0.591858 -0.883828 -0.848202 \n", "50% 84850.000000 0.013238 0.070185 0.185047 -0.024109 \n", "75% 139277.500000 1.313257 0.806615 1.031155 0.737784 \n", "max 172788.000000 2.454930 15.876923 4.079168 16.875344 \n", "\n", " V5 V6 V7 V8 V9 \\\n", "count 85443.000000 85443.000000 85443.000000 85443.000000 85443.000000 \n", "mean 0.003486 0.000489 0.002030 0.004620 -0.000495 \n", "std 1.406722 1.340636 1.262562 1.151291 1.090691 \n", "min -113.743307 -26.160506 -28.215112 -50.943369 -9.481456 \n", "25% -0.688280 -0.766664 -0.553479 -0.207216 -0.638926 \n", "50% -0.051627 -0.273686 0.042343 0.023782 -0.053821 \n", "75% 0.618067 0.399864 0.572423 0.328337 0.597388 \n", "max 34.801666 73.301626 120.589494 18.748872 9.272376 \n", "\n", " V10 V11 V12 V13 V14 \\\n", "count 85443.000000 85443.000000 85443.000000 85443.000000 85443.000000 \n", "mean -0.003167 0.002424 0.003652 -0.001616 -0.000319 \n", "std 1.079574 1.022315 1.005413 0.989553 0.962457 \n", "min -20.949192 -4.568390 -18.683715 -3.888606 -18.493773 \n", "25% -0.535400 -0.761716 -0.400087 -0.648761 -0.426516 \n", "50% -0.094949 -0.029129 0.144948 -0.013803 0.049248 \n", "75% 0.443126 0.743511 0.620694 0.657826 0.491916 \n", "max 15.331742 11.669205 4.406338 7.126883 7.439566 \n", "\n", " V15 V16 V17 V18 V19 \\\n", "count 85443.000000 85443.000000 85443.000000 85443.000000 85443.000000 \n", "mean -0.000751 -0.000195 -0.000682 0.000312 -0.001144 \n", "std 0.913388 0.876542 0.842669 0.839626 0.812957 \n", "min -4.391307 -13.303888 -22.883999 -9.287832 -6.938297 \n", "25% -0.581015 -0.468312 -0.483139 -0.498660 -0.455027 \n", "50% 0.045291 0.062957 -0.066955 -0.004245 0.002229 \n", "75% 
0.647117 0.523608 0.396799 0.501455 0.455249 \n", "max 8.877742 17.315112 9.253526 4.712398 5.591971 \n", "\n", " V20 V21 V22 V23 V24 \\\n", "count 85443.000000 85443.000000 85443.000000 85443.000000 85443.000000 \n", "mean -0.001004 0.000033 0.000052 0.000602 -0.000845 \n", "std 0.772484 0.713266 0.721198 0.613394 0.606464 \n", "min -54.497720 -22.665685 -9.499423 -32.828995 -2.836627 \n", "25% -0.211881 -0.226184 -0.537704 -0.161490 -0.355671 \n", "50% -0.061529 -0.030687 0.006971 -0.011789 0.040976 \n", "75% 0.133608 0.184846 0.523689 0.147923 0.441093 \n", "max 38.117209 22.579714 7.220158 20.803344 4.584549 \n", "\n", " V25 V26 V27 V28 Amount \\\n", "count 85443.000000 85443.000000 85443.000000 85443.000000 85443.000000 \n", "mean -0.000922 0.000220 0.000062 -0.000036 -0.002966 \n", "std 0.521520 0.483126 0.409616 0.341987 1.036492 \n", "min -8.696627 -2.604551 -9.793568 -15.430084 -0.353229 \n", "25% -0.319736 -0.326068 -0.070797 -0.053129 -0.331280 \n", "50% 0.013508 -0.051695 0.001984 0.011561 -0.265271 \n", "75% 0.350617 0.240657 0.092224 0.078900 -0.047356 \n", "max 5.826159 3.517346 31.612198 33.847808 102.362243 \n", "\n", " Class \n", "count 85443.000000 \n", "mean 0.001720 \n", "std 0.041443 \n", "min 0.000000 \n", "25% 0.000000 \n", "50% 0.000000 \n", "75% 0.000000 \n", "max 1.000000 " ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([X_test, y_test], axis=1).describe()" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Class\n", "0 85296\n", "1 147\n", "Name: count, dtype: int64" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([X_test, y_test], axis=1)[\"Class\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Splitting undersampled data into training and test datasets\n" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, 
"outputs": [], "source": [ "X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = (\n", " train_test_split(X_undersample, y_undersample, test_size=0.3, random_state=0)\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Statistical measures of the training dataset of undersampled data\n" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 688 entries, 6870 to 208266\n", "Data columns (total 31 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Time 688 non-null float64\n", " 1 V1 688 non-null float64\n", " 2 V2 688 non-null float64\n", " 3 V3 688 non-null float64\n", " 4 V4 688 non-null float64\n", " 5 V5 688 non-null float64\n", " 6 V6 688 non-null float64\n", " 7 V7 688 non-null float64\n", " 8 V8 688 non-null float64\n", " 9 V9 688 non-null float64\n", " 10 V10 688 non-null float64\n", " 11 V11 688 non-null float64\n", " 12 V12 688 non-null float64\n", " 13 V13 688 non-null float64\n", " 14 V14 688 non-null float64\n", " 15 V15 688 non-null float64\n", " 16 V16 688 non-null float64\n", " 17 V17 688 non-null float64\n", " 18 V18 688 non-null float64\n", " 19 V19 688 non-null float64\n", " 20 V20 688 non-null float64\n", " 21 V21 688 non-null float64\n", " 22 V22 688 non-null float64\n", " 23 V23 688 non-null float64\n", " 24 V24 688 non-null float64\n", " 25 V25 688 non-null float64\n", " 26 V26 688 non-null float64\n", " 27 V27 688 non-null float64\n", " 28 V28 688 non-null float64\n", " 29 Amount 688 non-null float64\n", " 30 Class 688 non-null int64 \n", "dtypes: float64(30), int64(1)\n", "memory usage: 172.0 KB\n" ] } ], "source": [ "pd.concat([X_train_undersample, y_train_undersample], axis=1).info()" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
count688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000688.000000
mean88546.635174-2.4436421.748210-3.4906932.161294-1.466909-0.737723-2.7591900.361773-1.222417-2.8081441.937783-3.131850-0.001132-3.568854-0.022936-2.145811-3.365430-1.1372380.3776900.1271570.4464950.012945-0.069031-0.0202030.0317820.0221540.1146840.0415570.0365920.501453
std48529.6617535.3826383.6164266.0203913.1982214.2275531.8295355.4989954.7411542.3365554.4175482.7711374.5607531.0818264.6419600.9816833.4586636.0622162.4626891.2872561.0729602.7493541.1439401.2838820.5494850.6890150.4744110.9231610.4870770.8343600.500362
min117.000000-30.552380-15.799625-31.103685-3.863126-22.105532-10.261990-37.060311-37.353443-11.126624-23.228255-2.613374-18.431131-3.223045-19.214325-4.498945-13.563273-25.162799-9.498746-3.602657-7.242879-16.922016-8.887017-19.254328-2.028024-4.781606-1.214960-7.263482-2.735623-0.3532290.000000
25%45531.000000-2.867222-0.164478-5.049001-0.212543-1.703845-1.691031-3.105154-0.220868-2.205996-4.731895-0.194163-5.643631-0.767631-6.767749-0.562582-3.612856-5.277726-1.816368-0.373523-0.197730-0.142520-0.510247-0.246005-0.373302-0.320463-0.281449-0.061809-0.050983-0.3461130.000000
50%82526.500000-0.8740570.984845-1.4828801.285768-0.400360-0.741307-0.7409520.141389-0.694910-0.9815691.154879-0.8454630.008049-1.1327610.001558-0.750918-0.495063-0.3927430.2464780.0305560.1633230.076684-0.0271430.0143600.046511-0.0262320.0597980.036635-0.2731881.000000
75%135096.7500000.9455822.8509470.3485794.1668570.5998920.0335690.2408430.9199990.196633-0.0010473.6252620.1631040.7440210.0866690.6657360.2198090.3142060.3714810.9787540.4434950.6805970.6291090.1748620.3820760.4060560.3064030.4824880.2355490.0465391.000000
max172573.0000002.33583319.1672393.22897811.92751214.1039186.3559865.80253720.0072086.81673211.73292612.0189132.5348763.0913283.4424222.3641993.1396566.7393843.7903165.2283427.90737827.2028395.7740875.3036071.2081412.2082092.7452613.0523584.9757928.1461821.000000
\n", "
" ], "text/plain": [ " Time V1 V2 V3 V4 \\\n", "count 688.000000 688.000000 688.000000 688.000000 688.000000 \n", "mean 88546.635174 -2.443642 1.748210 -3.490693 2.161294 \n", "std 48529.661753 5.382638 3.616426 6.020391 3.198221 \n", "min 117.000000 -30.552380 -15.799625 -31.103685 -3.863126 \n", "25% 45531.000000 -2.867222 -0.164478 -5.049001 -0.212543 \n", "50% 82526.500000 -0.874057 0.984845 -1.482880 1.285768 \n", "75% 135096.750000 0.945582 2.850947 0.348579 4.166857 \n", "max 172573.000000 2.335833 19.167239 3.228978 11.927512 \n", "\n", " V5 V6 V7 V8 V9 V10 \\\n", "count 688.000000 688.000000 688.000000 688.000000 688.000000 688.000000 \n", "mean -1.466909 -0.737723 -2.759190 0.361773 -1.222417 -2.808144 \n", "std 4.227553 1.829535 5.498995 4.741154 2.336555 4.417548 \n", "min -22.105532 -10.261990 -37.060311 -37.353443 -11.126624 -23.228255 \n", "25% -1.703845 -1.691031 -3.105154 -0.220868 -2.205996 -4.731895 \n", "50% -0.400360 -0.741307 -0.740952 0.141389 -0.694910 -0.981569 \n", "75% 0.599892 0.033569 0.240843 0.919999 0.196633 -0.001047 \n", "max 14.103918 6.355986 5.802537 20.007208 6.816732 11.732926 \n", "\n", " V11 V12 V13 V14 V15 V16 \\\n", "count 688.000000 688.000000 688.000000 688.000000 688.000000 688.000000 \n", "mean 1.937783 -3.131850 -0.001132 -3.568854 -0.022936 -2.145811 \n", "std 2.771137 4.560753 1.081826 4.641960 0.981683 3.458663 \n", "min -2.613374 -18.431131 -3.223045 -19.214325 -4.498945 -13.563273 \n", "25% -0.194163 -5.643631 -0.767631 -6.767749 -0.562582 -3.612856 \n", "50% 1.154879 -0.845463 0.008049 -1.132761 0.001558 -0.750918 \n", "75% 3.625262 0.163104 0.744021 0.086669 0.665736 0.219809 \n", "max 12.018913 2.534876 3.091328 3.442422 2.364199 3.139656 \n", "\n", " V17 V18 V19 V20 V21 V22 \\\n", "count 688.000000 688.000000 688.000000 688.000000 688.000000 688.000000 \n", "mean -3.365430 -1.137238 0.377690 0.127157 0.446495 0.012945 \n", "std 6.062216 2.462689 1.287256 1.072960 2.749354 1.143940 \n", "min -25.162799 
-9.498746 -3.602657 -7.242879 -16.922016 -8.887017 \n", "25% -5.277726 -1.816368 -0.373523 -0.197730 -0.142520 -0.510247 \n", "50% -0.495063 -0.392743 0.246478 0.030556 0.163323 0.076684 \n", "75% 0.314206 0.371481 0.978754 0.443495 0.680597 0.629109 \n", "max 6.739384 3.790316 5.228342 7.907378 27.202839 5.774087 \n", "\n", " V23 V24 V25 V26 V27 V28 \\\n", "count 688.000000 688.000000 688.000000 688.000000 688.000000 688.000000 \n", "mean -0.069031 -0.020203 0.031782 0.022154 0.114684 0.041557 \n", "std 1.283882 0.549485 0.689015 0.474411 0.923161 0.487077 \n", "min -19.254328 -2.028024 -4.781606 -1.214960 -7.263482 -2.735623 \n", "25% -0.246005 -0.373302 -0.320463 -0.281449 -0.061809 -0.050983 \n", "50% -0.027143 0.014360 0.046511 -0.026232 0.059798 0.036635 \n", "75% 0.174862 0.382076 0.406056 0.306403 0.482488 0.235549 \n", "max 5.303607 1.208141 2.208209 2.745261 3.052358 4.975792 \n", "\n", " Amount Class \n", "count 688.000000 688.000000 \n", "mean 0.036592 0.501453 \n", "std 0.834360 0.500362 \n", "min -0.353229 0.000000 \n", "25% -0.346113 0.000000 \n", "50% -0.273188 1.000000 \n", "75% 0.046539 1.000000 \n", "max 8.146182 1.000000 " ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([X_train_undersample, y_train_undersample], axis=1).describe()" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Class\n", "1 345\n", "0 343\n", "Name: count, dtype: int64" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([X_train_undersample, y_train_undersample], axis=1)[\"Class\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Statistical measures of the test dataset of undersampled data\n" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 296 entries, 102782 to 57921\n", 
"Data columns (total 31 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Time 296 non-null float64\n", " 1 V1 296 non-null float64\n", " 2 V2 296 non-null float64\n", " 3 V3 296 non-null float64\n", " 4 V4 296 non-null float64\n", " 5 V5 296 non-null float64\n", " 6 V6 296 non-null float64\n", " 7 V7 296 non-null float64\n", " 8 V8 296 non-null float64\n", " 9 V9 296 non-null float64\n", " 10 V10 296 non-null float64\n", " 11 V11 296 non-null float64\n", " 12 V12 296 non-null float64\n", " 13 V13 296 non-null float64\n", " 14 V14 296 non-null float64\n", " 15 V15 296 non-null float64\n", " 16 V16 296 non-null float64\n", " 17 V17 296 non-null float64\n", " 18 V18 296 non-null float64\n", " 19 V19 296 non-null float64\n", " 20 V20 296 non-null float64\n", " 21 V21 296 non-null float64\n", " 22 V22 296 non-null float64\n", " 23 V23 296 non-null float64\n", " 24 V24 296 non-null float64\n", " 25 V25 296 non-null float64\n", " 26 V26 296 non-null float64\n", " 27 V27 296 non-null float64\n", " 28 V28 296 non-null float64\n", " 29 Amount 296 non-null float64\n", " 30 Class 296 non-null int64 \n", "dtypes: float64(30), int64(1)\n", "memory usage: 74.0 KB\n" ] } ], "source": [ "pd.concat([X_test_undersample, y_test_undersample], axis=1).info()" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
count296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000296.000000
mean88396.587838-2.4484191.857288-3.5529002.336519-1.503755-0.656035-2.8530580.086851-1.324446-2.9190281.914227-3.106154-0.084562-3.347887-0.077981-1.984526-3.161909-1.1096860.2645900.2892120.0655820.1349020.056521-0.0773360.0019630.0403640.0202810.0587810.0468450.496622
std50147.1053265.8120723.9343236.6806603.3084174.3892631.6938936.6220085.1212932.4519144.8915172.7544394.6817220.9869374.6834581.0512963.4849895.8264102.2939101.2983101.2358412.8624631.2169350.8779750.5550900.6507520.4818221.2241660.4608410.8924320.500835
min60.000000-29.876366-8.402154-30.558697-2.956827-21.665654-5.773192-43.557242-41.044261-13.434066-24.588262-2.383066-18.683715-3.076318-17.620634-3.092108-14.129855-22.541652-9.090892-3.681904-5.225849-22.797604-8.887017-5.988806-1.742803-2.079928-1.170476-7.263482-1.931920-0.3532290.000000
25%45977.500000-2.867766-0.130600-5.417818-0.118496-1.667035-1.477544-2.835885-0.168935-2.345829-4.445615-0.144802-5.340188-0.815218-6.363108-0.729637-3.303237-5.358990-1.747789-0.563676-0.165023-0.178103-0.483530-0.212828-0.405811-0.324214-0.270853-0.056831-0.042639-0.3492310.000000
50%84069.000000-0.7409150.941852-1.1399641.340723-0.369227-0.596589-0.5018640.169642-0.696902-0.8755211.267304-0.938658-0.060414-1.059352-0.012904-0.547678-0.527389-0.3189040.1698270.0569980.1300600.081904-0.035614-0.0102320.0688900.0319110.0737020.046030-0.3008340.000000
75%135023.5000000.8795112.7003710.3947654.3053610.6244590.1392440.3067880.8333920.011527-0.0510123.5423360.2347520.6096290.1739160.6853000.3511190.3096360.2373580.9483710.4611800.5686110.6175880.2003280.3176530.3868040.3553820.3954120.1927660.0280481.000000
max172733.0000002.30676922.0577293.47626812.1146729.8805646.4741153.79190719.5877734.8663166.36766111.1524911.7251852.8970442.6542752.4713582.6964756.4436492.5918464.85125511.05900427.2028398.3619855.4662301.0774072.1560421.4588282.7065663.0424065.6636101.000000
\n", "
" ], "text/plain": [ " Time V1 V2 V3 V4 \\\n", "count 296.000000 296.000000 296.000000 296.000000 296.000000 \n", "mean 88396.587838 -2.448419 1.857288 -3.552900 2.336519 \n", "std 50147.105326 5.812072 3.934323 6.680660 3.308417 \n", "min 60.000000 -29.876366 -8.402154 -30.558697 -2.956827 \n", "25% 45977.500000 -2.867766 -0.130600 -5.417818 -0.118496 \n", "50% 84069.000000 -0.740915 0.941852 -1.139964 1.340723 \n", "75% 135023.500000 0.879511 2.700371 0.394765 4.305361 \n", "max 172733.000000 2.306769 22.057729 3.476268 12.114672 \n", "\n", " V5 V6 V7 V8 V9 V10 \\\n", "count 296.000000 296.000000 296.000000 296.000000 296.000000 296.000000 \n", "mean -1.503755 -0.656035 -2.853058 0.086851 -1.324446 -2.919028 \n", "std 4.389263 1.693893 6.622008 5.121293 2.451914 4.891517 \n", "min -21.665654 -5.773192 -43.557242 -41.044261 -13.434066 -24.588262 \n", "25% -1.667035 -1.477544 -2.835885 -0.168935 -2.345829 -4.445615 \n", "50% -0.369227 -0.596589 -0.501864 0.169642 -0.696902 -0.875521 \n", "75% 0.624459 0.139244 0.306788 0.833392 0.011527 -0.051012 \n", "max 9.880564 6.474115 3.791907 19.587773 4.866316 6.367661 \n", "\n", " V11 V12 V13 V14 V15 V16 \\\n", "count 296.000000 296.000000 296.000000 296.000000 296.000000 296.000000 \n", "mean 1.914227 -3.106154 -0.084562 -3.347887 -0.077981 -1.984526 \n", "std 2.754439 4.681722 0.986937 4.683458 1.051296 3.484989 \n", "min -2.383066 -18.683715 -3.076318 -17.620634 -3.092108 -14.129855 \n", "25% -0.144802 -5.340188 -0.815218 -6.363108 -0.729637 -3.303237 \n", "50% 1.267304 -0.938658 -0.060414 -1.059352 -0.012904 -0.547678 \n", "75% 3.542336 0.234752 0.609629 0.173916 0.685300 0.351119 \n", "max 11.152491 1.725185 2.897044 2.654275 2.471358 2.696475 \n", "\n", " V17 V18 V19 V20 V21 V22 \\\n", "count 296.000000 296.000000 296.000000 296.000000 296.000000 296.000000 \n", "mean -3.161909 -1.109686 0.264590 0.289212 0.065582 0.134902 \n", "std 5.826410 2.293910 1.298310 1.235841 2.862463 1.216935 \n", "min -22.541652 -9.090892 
-3.681904 -5.225849 -22.797604 -8.887017 \n", "25% -5.358990 -1.747789 -0.563676 -0.165023 -0.178103 -0.483530 \n", "50% -0.527389 -0.318904 0.169827 0.056998 0.130060 0.081904 \n", "75% 0.309636 0.237358 0.948371 0.461180 0.568611 0.617588 \n", "max 6.443649 2.591846 4.851255 11.059004 27.202839 8.361985 \n", "\n", " V23 V24 V25 V26 V27 V28 \\\n", "count 296.000000 296.000000 296.000000 296.000000 296.000000 296.000000 \n", "mean 0.056521 -0.077336 0.001963 0.040364 0.020281 0.058781 \n", "std 0.877975 0.555090 0.650752 0.481822 1.224166 0.460841 \n", "min -5.988806 -1.742803 -2.079928 -1.170476 -7.263482 -1.931920 \n", "25% -0.212828 -0.405811 -0.324214 -0.270853 -0.056831 -0.042639 \n", "50% -0.035614 -0.010232 0.068890 0.031911 0.073702 0.046030 \n", "75% 0.200328 0.317653 0.386804 0.355382 0.395412 0.192766 \n", "max 5.466230 1.077407 2.156042 1.458828 2.706566 3.042406 \n", "\n", " Amount Class \n", "count 296.000000 296.000000 \n", "mean 0.046845 0.496622 \n", "std 0.892432 0.500835 \n", "min -0.353229 0.000000 \n", "25% -0.349231 0.000000 \n", "50% -0.300834 0.000000 \n", "75% 0.028048 1.000000 \n", "max 5.663610 1.000000 " ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([X_test_undersample, y_test_undersample], axis=1).describe()" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Class\n", "0 149\n", "1 147\n", "Name: count, dtype: int64" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([X_test_undersample, y_test_undersample], axis=1)[\"Class\"].value_counts()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": 
"3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }