ium_464913/IUM_2.ipynb

3280 lines
124 KiB
Plaintext
Raw Normal View History

2024-03-16 14:35:44 +01:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## IUM 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Installation of packages"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: kaggle in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (1.6.6)\n",
"Requirement already satisfied: six>=1.10 in c:\\users\\skype\\appdata\\roaming\\python\\python312\\site-packages (from kaggle) (1.16.0)\n",
"Requirement already satisfied: certifi in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (2024.2.2)\n",
"Requirement already satisfied: python-dateutil in c:\\users\\skype\\appdata\\roaming\\python\\python312\\site-packages (from kaggle) (2.9.0.post0)\n",
"Requirement already satisfied: requests in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (2.31.0)\n",
"Requirement already satisfied: tqdm in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (4.66.2)\n",
"Requirement already satisfied: python-slugify in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (8.0.4)\n",
"Requirement already satisfied: urllib3 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (2.2.1)\n",
"Requirement already satisfied: bleach in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from kaggle) (6.1.0)\n",
"Requirement already satisfied: webencodings in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from bleach->kaggle) (0.5.1)\n",
"Requirement already satisfied: text-unidecode>=1.3 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from python-slugify->kaggle) (1.3)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from requests->kaggle) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from requests->kaggle) (3.6)\n",
"Requirement already satisfied: colorama in c:\\users\\skype\\appdata\\roaming\\python\\python312\\site-packages (from tqdm->kaggle) (0.4.6)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: pandas in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (2.2.1)\n",
"Requirement already satisfied: numpy<2,>=1.26.0 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (1.26.3)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\skype\\appdata\\roaming\\python\\python312\\site-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\skype\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: numpy in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (1.26.3)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: scikit-learn in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (1.4.1.post1)\n",
"Requirement already satisfied: numpy<2.0,>=1.19.5 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn) (1.26.3)\n",
"Requirement already satisfied: scipy>=1.6.0 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn) (1.12.0)\n",
"Requirement already satisfied: joblib>=1.2.0 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn) (1.3.2)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\skype\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn) (3.3.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Versions are pinned to the ones this notebook was last executed with,\n",
"# so that a fresh environment resolves the same packages.\n",
"%pip install kaggle==1.6.6 pandas==2.2.1 numpy==1.26.3 scikit-learn==1.4.1.post1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Importing libraries"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"import zipfile\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# To preprocess the data\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# To split the data\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Downloading a dataset"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"creditcardfraud.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
]
}
],
"source": [
"# Fetch creditcardfraud.zip with the Kaggle CLI. When a more recent local copy\n",
"# exists the CLI skips the download (pass --force to re-download).\n",
"!kaggle datasets download -d mlg-ulb/creditcardfraud"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Uncompress a file"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: creditcardfraud.zip\n",
" inflating: creditcard.csv \n"
]
}
],
"source": [
"# Extract with the standard library rather than shelling out to `unzip`, which is\n",
"# not installed on every platform. Existing files are overwritten, like `unzip -o`.\n",
"with zipfile.ZipFile('creditcardfraud.zip') as archive:\n",
"    archive.extractall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load the data"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# Lift the column cap so wide frames are rendered in full.\n",
"pd.set_option('display.max_columns', None)\n",
"\n",
"# Load the extracted transactions file into a DataFrame.\n",
"df = pd.read_csv(filepath_or_buffer='creditcard.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Size of the dataset"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 284807 entries, 0 to 284806\n",
"Data columns (total 31 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Time 284807 non-null float64\n",
" 1 V1 284807 non-null float64\n",
" 2 V2 284807 non-null float64\n",
" 3 V3 284807 non-null float64\n",
" 4 V4 284807 non-null float64\n",
" 5 V5 284807 non-null float64\n",
" 6 V6 284807 non-null float64\n",
" 7 V7 284807 non-null float64\n",
" 8 V8 284807 non-null float64\n",
" 9 V9 284807 non-null float64\n",
" 10 V10 284807 non-null float64\n",
" 11 V11 284807 non-null float64\n",
" 12 V12 284807 non-null float64\n",
" 13 V13 284807 non-null float64\n",
" 14 V14 284807 non-null float64\n",
" 15 V15 284807 non-null float64\n",
" 16 V16 284807 non-null float64\n",
" 17 V17 284807 non-null float64\n",
" 18 V18 284807 non-null float64\n",
" 19 V19 284807 non-null float64\n",
" 20 V20 284807 non-null float64\n",
" 21 V21 284807 non-null float64\n",
" 22 V22 284807 non-null float64\n",
" 23 V23 284807 non-null float64\n",
" 24 V24 284807 non-null float64\n",
" 25 V25 284807 non-null float64\n",
" 26 V26 284807 non-null float64\n",
" 27 V27 284807 non-null float64\n",
" 28 V28 284807 non-null float64\n",
" 29 Amount 284807 non-null float64\n",
" 30 Class 284807 non-null int64 \n",
"dtypes: float64(30), int64(1)\n",
"memory usage: 67.4 MB\n"
]
}
],
"source": [
"# Row count, per-column dtypes / non-null counts and memory footprint of the full dataset.\n",
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Normalising the data"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Standardise the Amount column, overwriting it in df.\n",
"# NOTE(review): the scaler is fitted on the whole dataset here, before any\n",
"# train/test split is made - confirm that is intended.\n",
"scaler = StandardScaler()\n",
"df['Amount'] = scaler.fit_transform(df[['Amount']].to_numpy())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Summary statistics"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Time</th>\n",
" <th>V1</th>\n",
" <th>V2</th>\n",
" <th>V3</th>\n",
" <th>V4</th>\n",
" <th>V5</th>\n",
" <th>V6</th>\n",
" <th>V7</th>\n",
" <th>V8</th>\n",
" <th>V9</th>\n",
" <th>V10</th>\n",
" <th>V11</th>\n",
" <th>V12</th>\n",
" <th>V13</th>\n",
" <th>V14</th>\n",
" <th>V15</th>\n",
" <th>V16</th>\n",
" <th>V17</th>\n",
" <th>V18</th>\n",
" <th>V19</th>\n",
" <th>V20</th>\n",
" <th>V21</th>\n",
" <th>V22</th>\n",
" <th>V23</th>\n",
" <th>V24</th>\n",
" <th>V25</th>\n",
" <th>V26</th>\n",
" <th>V27</th>\n",
" <th>V28</th>\n",
" <th>Amount</th>\n",
" <th>Class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>284807.000000</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>2.848070e+05</td>\n",
" <td>284807.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>94813.859575</td>\n",
" <td>1.168375e-15</td>\n",
" <td>3.416908e-16</td>\n",
" <td>-1.379537e-15</td>\n",
" <td>2.074095e-15</td>\n",
" <td>9.604066e-16</td>\n",
" <td>1.487313e-15</td>\n",
" <td>-5.556467e-16</td>\n",
" <td>1.213481e-16</td>\n",
" <td>-2.406331e-15</td>\n",
" <td>2.239053e-15</td>\n",
" <td>1.673327e-15</td>\n",
" <td>-1.247012e-15</td>\n",
" <td>8.190001e-16</td>\n",
" <td>1.207294e-15</td>\n",
" <td>4.887456e-15</td>\n",
" <td>1.437716e-15</td>\n",
" <td>-3.772171e-16</td>\n",
" <td>9.564149e-16</td>\n",
" <td>1.039917e-15</td>\n",
" <td>6.406204e-16</td>\n",
" <td>1.654067e-16</td>\n",
" <td>-3.568593e-16</td>\n",
" <td>2.578648e-16</td>\n",
" <td>4.473266e-15</td>\n",
" <td>5.340915e-16</td>\n",
" <td>1.683437e-15</td>\n",
" <td>-3.660091e-16</td>\n",
" <td>-1.227390e-16</td>\n",
" <td>2.913952e-17</td>\n",
" <td>0.001727</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>47488.145955</td>\n",
" <td>1.958696e+00</td>\n",
" <td>1.651309e+00</td>\n",
" <td>1.516255e+00</td>\n",
" <td>1.415869e+00</td>\n",
" <td>1.380247e+00</td>\n",
" <td>1.332271e+00</td>\n",
" <td>1.237094e+00</td>\n",
" <td>1.194353e+00</td>\n",
" <td>1.098632e+00</td>\n",
" <td>1.088850e+00</td>\n",
" <td>1.020713e+00</td>\n",
" <td>9.992014e-01</td>\n",
" <td>9.952742e-01</td>\n",
" <td>9.585956e-01</td>\n",
" <td>9.153160e-01</td>\n",
" <td>8.762529e-01</td>\n",
" <td>8.493371e-01</td>\n",
" <td>8.381762e-01</td>\n",
" <td>8.140405e-01</td>\n",
" <td>7.709250e-01</td>\n",
" <td>7.345240e-01</td>\n",
" <td>7.257016e-01</td>\n",
" <td>6.244603e-01</td>\n",
" <td>6.056471e-01</td>\n",
" <td>5.212781e-01</td>\n",
" <td>4.822270e-01</td>\n",
" <td>4.036325e-01</td>\n",
" <td>3.300833e-01</td>\n",
" <td>1.000002e+00</td>\n",
" <td>0.041527</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000</td>\n",
" <td>-5.640751e+01</td>\n",
" <td>-7.271573e+01</td>\n",
" <td>-4.832559e+01</td>\n",
" <td>-5.683171e+00</td>\n",
" <td>-1.137433e+02</td>\n",
" <td>-2.616051e+01</td>\n",
" <td>-4.355724e+01</td>\n",
" <td>-7.321672e+01</td>\n",
" <td>-1.343407e+01</td>\n",
" <td>-2.458826e+01</td>\n",
" <td>-4.797473e+00</td>\n",
" <td>-1.868371e+01</td>\n",
" <td>-5.791881e+00</td>\n",
" <td>-1.921433e+01</td>\n",
" <td>-4.498945e+00</td>\n",
" <td>-1.412985e+01</td>\n",
" <td>-2.516280e+01</td>\n",
" <td>-9.498746e+00</td>\n",
" <td>-7.213527e+00</td>\n",
" <td>-5.449772e+01</td>\n",
" <td>-3.483038e+01</td>\n",
" <td>-1.093314e+01</td>\n",
" <td>-4.480774e+01</td>\n",
" <td>-2.836627e+00</td>\n",
" <td>-1.029540e+01</td>\n",
" <td>-2.604551e+00</td>\n",
" <td>-2.256568e+01</td>\n",
" <td>-1.543008e+01</td>\n",
" <td>-3.532294e-01</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>54201.500000</td>\n",
" <td>-9.203734e-01</td>\n",
" <td>-5.985499e-01</td>\n",
" <td>-8.903648e-01</td>\n",
" <td>-8.486401e-01</td>\n",
" <td>-6.915971e-01</td>\n",
" <td>-7.682956e-01</td>\n",
" <td>-5.540759e-01</td>\n",
" <td>-2.086297e-01</td>\n",
" <td>-6.430976e-01</td>\n",
" <td>-5.354257e-01</td>\n",
" <td>-7.624942e-01</td>\n",
" <td>-4.055715e-01</td>\n",
" <td>-6.485393e-01</td>\n",
" <td>-4.255740e-01</td>\n",
" <td>-5.828843e-01</td>\n",
" <td>-4.680368e-01</td>\n",
" <td>-4.837483e-01</td>\n",
" <td>-4.988498e-01</td>\n",
" <td>-4.562989e-01</td>\n",
" <td>-2.117214e-01</td>\n",
" <td>-2.283949e-01</td>\n",
" <td>-5.423504e-01</td>\n",
" <td>-1.618463e-01</td>\n",
" <td>-3.545861e-01</td>\n",
" <td>-3.171451e-01</td>\n",
" <td>-3.269839e-01</td>\n",
" <td>-7.083953e-02</td>\n",
" <td>-5.295979e-02</td>\n",
" <td>-3.308401e-01</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>84692.000000</td>\n",
" <td>1.810880e-02</td>\n",
" <td>6.548556e-02</td>\n",
" <td>1.798463e-01</td>\n",
" <td>-1.984653e-02</td>\n",
" <td>-5.433583e-02</td>\n",
" <td>-2.741871e-01</td>\n",
" <td>4.010308e-02</td>\n",
" <td>2.235804e-02</td>\n",
" <td>-5.142873e-02</td>\n",
" <td>-9.291738e-02</td>\n",
" <td>-3.275735e-02</td>\n",
" <td>1.400326e-01</td>\n",
" <td>-1.356806e-02</td>\n",
" <td>5.060132e-02</td>\n",
" <td>4.807155e-02</td>\n",
" <td>6.641332e-02</td>\n",
" <td>-6.567575e-02</td>\n",
" <td>-3.636312e-03</td>\n",
" <td>3.734823e-03</td>\n",
" <td>-6.248109e-02</td>\n",
" <td>-2.945017e-02</td>\n",
" <td>6.781943e-03</td>\n",
" <td>-1.119293e-02</td>\n",
" <td>4.097606e-02</td>\n",
" <td>1.659350e-02</td>\n",
" <td>-5.213911e-02</td>\n",
" <td>1.342146e-03</td>\n",
" <td>1.124383e-02</td>\n",
" <td>-2.652715e-01</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>139320.500000</td>\n",
" <td>1.315642e+00</td>\n",
" <td>8.037239e-01</td>\n",
" <td>1.027196e+00</td>\n",
" <td>7.433413e-01</td>\n",
" <td>6.119264e-01</td>\n",
" <td>3.985649e-01</td>\n",
" <td>5.704361e-01</td>\n",
" <td>3.273459e-01</td>\n",
" <td>5.971390e-01</td>\n",
" <td>4.539234e-01</td>\n",
" <td>7.395934e-01</td>\n",
" <td>6.182380e-01</td>\n",
" <td>6.625050e-01</td>\n",
" <td>4.931498e-01</td>\n",
" <td>6.488208e-01</td>\n",
" <td>5.232963e-01</td>\n",
" <td>3.996750e-01</td>\n",
" <td>5.008067e-01</td>\n",
" <td>4.589494e-01</td>\n",
" <td>1.330408e-01</td>\n",
" <td>1.863772e-01</td>\n",
" <td>5.285536e-01</td>\n",
" <td>1.476421e-01</td>\n",
" <td>4.395266e-01</td>\n",
" <td>3.507156e-01</td>\n",
" <td>2.409522e-01</td>\n",
" <td>9.104512e-02</td>\n",
" <td>7.827995e-02</td>\n",
" <td>-4.471707e-02</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>172792.000000</td>\n",
" <td>2.454930e+00</td>\n",
" <td>2.205773e+01</td>\n",
" <td>9.382558e+00</td>\n",
" <td>1.687534e+01</td>\n",
" <td>3.480167e+01</td>\n",
" <td>7.330163e+01</td>\n",
" <td>1.205895e+02</td>\n",
" <td>2.000721e+01</td>\n",
" <td>1.559499e+01</td>\n",
" <td>2.374514e+01</td>\n",
" <td>1.201891e+01</td>\n",
" <td>7.848392e+00</td>\n",
" <td>7.126883e+00</td>\n",
" <td>1.052677e+01</td>\n",
" <td>8.877742e+00</td>\n",
" <td>1.731511e+01</td>\n",
" <td>9.253526e+00</td>\n",
" <td>5.041069e+00</td>\n",
" <td>5.591971e+00</td>\n",
" <td>3.942090e+01</td>\n",
" <td>2.720284e+01</td>\n",
" <td>1.050309e+01</td>\n",
" <td>2.252841e+01</td>\n",
" <td>4.584549e+00</td>\n",
" <td>7.519589e+00</td>\n",
" <td>3.517346e+00</td>\n",
" <td>3.161220e+01</td>\n",
" <td>3.384781e+01</td>\n",
" <td>1.023622e+02</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Time V1 V2 V3 V4 \\\n",
"count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n",
"mean 94813.859575 1.168375e-15 3.416908e-16 -1.379537e-15 2.074095e-15 \n",
"std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 \n",
"min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 \n",
"25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 \n",
"50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 \n",
"75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 \n",
"max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 \n",
"\n",
" V5 V6 V7 V8 V9 \\\n",
"count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n",
"mean 9.604066e-16 1.487313e-15 -5.556467e-16 1.213481e-16 -2.406331e-15 \n",
"std 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 \n",
"min -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 \n",
"25% -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 \n",
"50% -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 \n",
"75% 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 \n",
"max 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 \n",
"\n",
" V10 V11 V12 V13 V14 \\\n",
"count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n",
"mean 2.239053e-15 1.673327e-15 -1.247012e-15 8.190001e-16 1.207294e-15 \n",
"std 1.088850e+00 1.020713e+00 9.992014e-01 9.952742e-01 9.585956e-01 \n",
"min -2.458826e+01 -4.797473e+00 -1.868371e+01 -5.791881e+00 -1.921433e+01 \n",
"25% -5.354257e-01 -7.624942e-01 -4.055715e-01 -6.485393e-01 -4.255740e-01 \n",
"50% -9.291738e-02 -3.275735e-02 1.400326e-01 -1.356806e-02 5.060132e-02 \n",
"75% 4.539234e-01 7.395934e-01 6.182380e-01 6.625050e-01 4.931498e-01 \n",
"max 2.374514e+01 1.201891e+01 7.848392e+00 7.126883e+00 1.052677e+01 \n",
"\n",
" V15 V16 V17 V18 V19 \\\n",
"count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n",
"mean 4.887456e-15 1.437716e-15 -3.772171e-16 9.564149e-16 1.039917e-15 \n",
"std 9.153160e-01 8.762529e-01 8.493371e-01 8.381762e-01 8.140405e-01 \n",
"min -4.498945e+00 -1.412985e+01 -2.516280e+01 -9.498746e+00 -7.213527e+00 \n",
"25% -5.828843e-01 -4.680368e-01 -4.837483e-01 -4.988498e-01 -4.562989e-01 \n",
"50% 4.807155e-02 6.641332e-02 -6.567575e-02 -3.636312e-03 3.734823e-03 \n",
"75% 6.488208e-01 5.232963e-01 3.996750e-01 5.008067e-01 4.589494e-01 \n",
"max 8.877742e+00 1.731511e+01 9.253526e+00 5.041069e+00 5.591971e+00 \n",
"\n",
" V20 V21 V22 V23 V24 \\\n",
"count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n",
"mean 6.406204e-16 1.654067e-16 -3.568593e-16 2.578648e-16 4.473266e-15 \n",
"std 7.709250e-01 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 \n",
"min -5.449772e+01 -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 \n",
"25% -2.117214e-01 -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 \n",
"50% -6.248109e-02 -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 \n",
"75% 1.330408e-01 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 \n",
"max 3.942090e+01 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 \n",
"\n",
" V25 V26 V27 V28 Amount \\\n",
"count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n",
"mean 5.340915e-16 1.683437e-15 -3.660091e-16 -1.227390e-16 2.913952e-17 \n",
"std 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 1.000002e+00 \n",
"min -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 -3.532294e-01 \n",
"25% -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 -3.308401e-01 \n",
"50% 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 -2.652715e-01 \n",
"75% 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 -4.471707e-02 \n",
"max 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 1.023622e+02 \n",
"\n",
" Class \n",
"count 284807.000000 \n",
"mean 0.001727 \n",
"std 0.041527 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 0.000000 \n",
"75% 0.000000 \n",
"max 1.000000 "
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-column summary statistics (count, mean, std, min, quartiles, max).\n",
"df.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Distribution of legitimate and fraudulent transactions"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Class\n",
"0 284315\n",
"1 492\n",
"Name: count, dtype: int64"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows per class label; the output shows how imbalanced the two classes are.\n",
"df['Class'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Undersampling the data\n",
"Legitimate transactions (284,315) vastly outnumber fraudulent ones (492), so we randomly undersample the majority class to obtain a balanced dataset."
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# Fixed seed so the random undersample is identical on every run.\n",
"RANDOM_SEED = 42\n",
"rng = np.random.default_rng(RANDOM_SEED)\n",
"\n",
"# Index labels of the minority (fraud) class and how many there are\n",
"fraud_indices = df.index[df['Class'] == 1].to_numpy()\n",
"fraud_count = len(fraud_indices)\n",
"\n",
"# Index labels of the majority (legitimate) class\n",
"normal_indices = df.index[df['Class'] == 0]\n",
"\n",
"# Draw as many majority rows as there are minority rows, without replacement\n",
"random_normal_indices = rng.choice(normal_indices.to_numpy(), size=fraud_count, replace=False)\n",
"\n",
"# Combine the labels of both classes\n",
"undersample_indice = np.concatenate([fraud_indices, random_normal_indices])\n",
"\n",
"# These are index labels, not positions, so select with .loc: .iloc only gives\n",
"# the same rows while df keeps its default 0..n-1 index.\n",
"undersample_data = df.loc[undersample_indice, :]\n",
"\n",
"# Features are every column except the label; the label stays a one-column DataFrame\n",
"X_undersample = undersample_data.drop(columns='Class')\n",
"y_undersample = undersample_data[['Class']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Size of undersampled dataset"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 984 entries, 541 to 216408\n",
"Data columns (total 31 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Time 984 non-null float64\n",
" 1 V1 984 non-null float64\n",
" 2 V2 984 non-null float64\n",
" 3 V3 984 non-null float64\n",
" 4 V4 984 non-null float64\n",
" 5 V5 984 non-null float64\n",
" 6 V6 984 non-null float64\n",
" 7 V7 984 non-null float64\n",
" 8 V8 984 non-null float64\n",
" 9 V9 984 non-null float64\n",
" 10 V10 984 non-null float64\n",
" 11 V11 984 non-null float64\n",
" 12 V12 984 non-null float64\n",
" 13 V13 984 non-null float64\n",
" 14 V14 984 non-null float64\n",
" 15 V15 984 non-null float64\n",
" 16 V16 984 non-null float64\n",
" 17 V17 984 non-null float64\n",
" 18 V18 984 non-null float64\n",
" 19 V19 984 non-null float64\n",
" 20 V20 984 non-null float64\n",
" 21 V21 984 non-null float64\n",
" 22 V22 984 non-null float64\n",
" 23 V23 984 non-null float64\n",
" 24 V24 984 non-null float64\n",
" 25 V25 984 non-null float64\n",
" 26 V26 984 non-null float64\n",
" 27 V27 984 non-null float64\n",
" 28 V28 984 non-null float64\n",
" 29 Amount 984 non-null float64\n",
" 30 Class 984 non-null int64 \n",
"dtypes: float64(30), int64(1)\n",
"memory usage: 246.0 KB\n"
]
}
],
"source": [
"# Confirm the size and dtypes of the balanced sample.\n",
"undersample_data.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Summary statistics of the undersampled dataset"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Time</th>\n",
" <th>V1</th>\n",
" <th>V2</th>\n",
" <th>V3</th>\n",
" <th>V4</th>\n",
" <th>V5</th>\n",
" <th>V6</th>\n",
" <th>V7</th>\n",
" <th>V8</th>\n",
" <th>V9</th>\n",
" <th>V10</th>\n",
" <th>V11</th>\n",
" <th>V12</th>\n",
" <th>V13</th>\n",
" <th>V14</th>\n",
" <th>V15</th>\n",
" <th>V16</th>\n",
" <th>V17</th>\n",
" <th>V18</th>\n",
" <th>V19</th>\n",
" <th>V20</th>\n",
" <th>V21</th>\n",
" <th>V22</th>\n",
" <th>V23</th>\n",
" <th>V24</th>\n",
" <th>V25</th>\n",
" <th>V26</th>\n",
" <th>V27</th>\n",
" <th>V28</th>\n",
" <th>Amount</th>\n",
" <th>Class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" <td>984.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>88776.640244</td>\n",
" <td>-2.405835</td>\n",
" <td>1.819622</td>\n",
" <td>-3.511875</td>\n",
" <td>2.253022</td>\n",
" <td>-1.598199</td>\n",
" <td>-0.703490</td>\n",
" <td>-2.771834</td>\n",
" <td>0.290248</td>\n",
" <td>-1.271753</td>\n",
" <td>-2.866398</td>\n",
" <td>1.950912</td>\n",
" <td>-3.085801</td>\n",
" <td>-0.038172</td>\n",
" <td>-3.480604</td>\n",
" <td>-0.047034</td>\n",
" <td>-2.048191</td>\n",
" <td>-3.320723</td>\n",
" <td>-1.121329</td>\n",
" <td>0.338180</td>\n",
" <td>0.193890</td>\n",
" <td>0.363782</td>\n",
" <td>0.002554</td>\n",
" <td>0.005715</td>\n",
" <td>-0.068002</td>\n",
" <td>0.007024</td>\n",
" <td>0.018569</td>\n",
" <td>0.062892</td>\n",
" <td>0.027783</td>\n",
" <td>0.077270</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>48243.944271</td>\n",
" <td>5.526010</td>\n",
" <td>3.753287</td>\n",
" <td>6.215601</td>\n",
" <td>3.188581</td>\n",
" <td>4.284063</td>\n",
" <td>1.787638</td>\n",
" <td>5.918129</td>\n",
" <td>4.878508</td>\n",
" <td>2.321658</td>\n",
" <td>4.526228</td>\n",
" <td>2.737584</td>\n",
" <td>4.620098</td>\n",
" <td>1.046915</td>\n",
" <td>4.656872</td>\n",
" <td>0.957319</td>\n",
" <td>3.494192</td>\n",
" <td>5.975039</td>\n",
" <td>2.410427</td>\n",
" <td>1.275460</td>\n",
" <td>1.197462</td>\n",
" <td>2.801160</td>\n",
" <td>1.171430</td>\n",
" <td>1.286473</td>\n",
" <td>0.572075</td>\n",
" <td>0.671570</td>\n",
" <td>0.488011</td>\n",
" <td>1.023450</td>\n",
" <td>0.425708</td>\n",
" <td>1.322416</td>\n",
" <td>0.500254</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>406.000000</td>\n",
" <td>-30.552380</td>\n",
" <td>-20.984898</td>\n",
" <td>-31.103685</td>\n",
" <td>-3.421874</td>\n",
" <td>-22.105532</td>\n",
" <td>-14.425011</td>\n",
" <td>-43.557242</td>\n",
" <td>-41.044261</td>\n",
" <td>-13.434066</td>\n",
" <td>-24.588262</td>\n",
" <td>-2.279476</td>\n",
" <td>-18.683715</td>\n",
" <td>-3.184202</td>\n",
" <td>-19.214325</td>\n",
" <td>-4.498945</td>\n",
" <td>-14.129855</td>\n",
" <td>-25.162799</td>\n",
" <td>-9.498746</td>\n",
" <td>-3.681904</td>\n",
" <td>-5.244333</td>\n",
" <td>-22.797604</td>\n",
" <td>-8.887017</td>\n",
" <td>-19.254328</td>\n",
" <td>-2.082546</td>\n",
" <td>-4.781606</td>\n",
" <td>-1.407558</td>\n",
" <td>-7.263482</td>\n",
" <td>-1.869290</td>\n",
" <td>-0.353229</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>47898.750000</td>\n",
" <td>-2.896794</td>\n",
" <td>-0.156682</td>\n",
" <td>-5.084967</td>\n",
" <td>-0.100812</td>\n",
" <td>-1.758911</td>\n",
" <td>-1.509571</td>\n",
" <td>-3.060742</td>\n",
" <td>-0.179491</td>\n",
" <td>-2.279453</td>\n",
" <td>-4.593030</td>\n",
" <td>-0.047452</td>\n",
" <td>-5.495221</td>\n",
" <td>-0.761168</td>\n",
" <td>-6.721799</td>\n",
" <td>-0.639308</td>\n",
" <td>-3.543426</td>\n",
" <td>-5.302111</td>\n",
" <td>-1.809496</td>\n",
" <td>-0.441326</td>\n",
" <td>-0.203323</td>\n",
" <td>-0.159503</td>\n",
" <td>-0.523196</td>\n",
" <td>-0.233359</td>\n",
" <td>-0.409463</td>\n",
" <td>-0.331296</td>\n",
" <td>-0.316920</td>\n",
" <td>-0.061141</td>\n",
" <td>-0.053338</td>\n",
" <td>-0.347042</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>80545.000000</td>\n",
" <td>-0.803813</td>\n",
" <td>0.993854</td>\n",
" <td>-1.370281</td>\n",
" <td>1.291576</td>\n",
" <td>-0.425110</td>\n",
" <td>-0.640524</td>\n",
" <td>-0.640804</td>\n",
" <td>0.180271</td>\n",
" <td>-0.666598</td>\n",
" <td>-0.885646</td>\n",
" <td>1.171937</td>\n",
" <td>-0.734549</td>\n",
" <td>0.002739</td>\n",
" <td>-0.986762</td>\n",
" <td>-0.012609</td>\n",
" <td>-0.599078</td>\n",
" <td>-0.466877</td>\n",
" <td>-0.322246</td>\n",
" <td>0.228471</td>\n",
" <td>0.017882</td>\n",
" <td>0.140922</td>\n",
" <td>0.013228</td>\n",
" <td>-0.012981</td>\n",
" <td>-0.001297</td>\n",
" <td>0.035358</td>\n",
" <td>-0.027331</td>\n",
" <td>0.044271</td>\n",
" <td>0.032226</td>\n",
" <td>-0.278285</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>135096.750000</td>\n",
" <td>1.018644</td>\n",
" <td>2.798885</td>\n",
" <td>0.345737</td>\n",
" <td>4.235631</td>\n",
" <td>0.442399</td>\n",
" <td>0.088865</td>\n",
" <td>0.245823</td>\n",
" <td>0.879226</td>\n",
" <td>0.205275</td>\n",
" <td>-0.027386</td>\n",
" <td>3.586130</td>\n",
" <td>0.285771</td>\n",
" <td>0.646090</td>\n",
" <td>0.083101</td>\n",
" <td>0.608586</td>\n",
" <td>0.302161</td>\n",
" <td>0.269949</td>\n",
" <td>0.333976</td>\n",
" <td>0.955802</td>\n",
" <td>0.399500</td>\n",
" <td>0.654990</td>\n",
" <td>0.582570</td>\n",
" <td>0.196764</td>\n",
" <td>0.368293</td>\n",
" <td>0.371463</td>\n",
" <td>0.328812</td>\n",
" <td>0.424067</td>\n",
" <td>0.201361</td>\n",
" <td>0.046029</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>172743.000000</td>\n",
" <td>2.308878</td>\n",
" <td>22.057729</td>\n",
" <td>3.940337</td>\n",
" <td>12.114672</td>\n",
" <td>20.277728</td>\n",
" <td>12.128950</td>\n",
" <td>26.237722</td>\n",
" <td>20.007208</td>\n",
" <td>7.168878</td>\n",
" <td>10.423505</td>\n",
" <td>12.018913</td>\n",
" <td>1.948126</td>\n",
" <td>3.685570</td>\n",
" <td>3.442422</td>\n",
" <td>2.471358</td>\n",
" <td>3.139656</td>\n",
" <td>6.739384</td>\n",
" <td>3.790316</td>\n",
" <td>5.228342</td>\n",
" <td>16.436920</td>\n",
" <td>27.202839</td>\n",
" <td>8.361985</td>\n",
" <td>17.606637</td>\n",
" <td>1.102636</td>\n",
" <td>3.410742</td>\n",
" <td>2.745261</td>\n",
" <td>3.052358</td>\n",
" <td>1.779364</td>\n",
" <td>29.799137</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Time V1 V2 V3 V4 \\\n",
"count 984.000000 984.000000 984.000000 984.000000 984.000000 \n",
"mean 88776.640244 -2.405835 1.819622 -3.511875 2.253022 \n",
"std 48243.944271 5.526010 3.753287 6.215601 3.188581 \n",
"min 406.000000 -30.552380 -20.984898 -31.103685 -3.421874 \n",
"25% 47898.750000 -2.896794 -0.156682 -5.084967 -0.100812 \n",
"50% 80545.000000 -0.803813 0.993854 -1.370281 1.291576 \n",
"75% 135096.750000 1.018644 2.798885 0.345737 4.235631 \n",
"max 172743.000000 2.308878 22.057729 3.940337 12.114672 \n",
"\n",
" V5 V6 V7 V8 V9 V10 \\\n",
"count 984.000000 984.000000 984.000000 984.000000 984.000000 984.000000 \n",
"mean -1.598199 -0.703490 -2.771834 0.290248 -1.271753 -2.866398 \n",
"std 4.284063 1.787638 5.918129 4.878508 2.321658 4.526228 \n",
"min -22.105532 -14.425011 -43.557242 -41.044261 -13.434066 -24.588262 \n",
"25% -1.758911 -1.509571 -3.060742 -0.179491 -2.279453 -4.593030 \n",
"50% -0.425110 -0.640524 -0.640804 0.180271 -0.666598 -0.885646 \n",
"75% 0.442399 0.088865 0.245823 0.879226 0.205275 -0.027386 \n",
"max 20.277728 12.128950 26.237722 20.007208 7.168878 10.423505 \n",
"\n",
" V11 V12 V13 V14 V15 V16 \\\n",
"count 984.000000 984.000000 984.000000 984.000000 984.000000 984.000000 \n",
"mean 1.950912 -3.085801 -0.038172 -3.480604 -0.047034 -2.048191 \n",
"std 2.737584 4.620098 1.046915 4.656872 0.957319 3.494192 \n",
"min -2.279476 -18.683715 -3.184202 -19.214325 -4.498945 -14.129855 \n",
"25% -0.047452 -5.495221 -0.761168 -6.721799 -0.639308 -3.543426 \n",
"50% 1.171937 -0.734549 0.002739 -0.986762 -0.012609 -0.599078 \n",
"75% 3.586130 0.285771 0.646090 0.083101 0.608586 0.302161 \n",
"max 12.018913 1.948126 3.685570 3.442422 2.471358 3.139656 \n",
"\n",
" V17 V18 V19 V20 V21 V22 \\\n",
"count 984.000000 984.000000 984.000000 984.000000 984.000000 984.000000 \n",
"mean -3.320723 -1.121329 0.338180 0.193890 0.363782 0.002554 \n",
"std 5.975039 2.410427 1.275460 1.197462 2.801160 1.171430 \n",
"min -25.162799 -9.498746 -3.681904 -5.244333 -22.797604 -8.887017 \n",
"25% -5.302111 -1.809496 -0.441326 -0.203323 -0.159503 -0.523196 \n",
"50% -0.466877 -0.322246 0.228471 0.017882 0.140922 0.013228 \n",
"75% 0.269949 0.333976 0.955802 0.399500 0.654990 0.582570 \n",
"max 6.739384 3.790316 5.228342 16.436920 27.202839 8.361985 \n",
"\n",
" V23 V24 V25 V26 V27 V28 \\\n",
"count 984.000000 984.000000 984.000000 984.000000 984.000000 984.000000 \n",
"mean 0.005715 -0.068002 0.007024 0.018569 0.062892 0.027783 \n",
"std 1.286473 0.572075 0.671570 0.488011 1.023450 0.425708 \n",
"min -19.254328 -2.082546 -4.781606 -1.407558 -7.263482 -1.869290 \n",
"25% -0.233359 -0.409463 -0.331296 -0.316920 -0.061141 -0.053338 \n",
"50% -0.012981 -0.001297 0.035358 -0.027331 0.044271 0.032226 \n",
"75% 0.196764 0.368293 0.371463 0.328812 0.424067 0.201361 \n",
"max 17.606637 1.102636 3.410742 2.745261 3.052358 1.779364 \n",
"\n",
" Amount Class \n",
"count 984.000000 984.000000 \n",
"mean 0.077270 0.500000 \n",
"std 1.322416 0.500254 \n",
"min -0.353229 0.000000 \n",
"25% -0.347042 0.000000 \n",
"50% -0.278285 0.500000 \n",
"75% 0.046029 1.000000 \n",
"max 29.799137 1.000000 "
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"undersample_data.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Distribution of legitimate and fraudulent transactions in the undersampled dataset"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Class\n",
"1 492\n",
"0 492\n",
"Name: count, dtype: int64"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tally the rows per Class label in the undersampled frame to check class balance.\n",
"undersample_labels = undersample_data['Class']\n",
"undersample_labels.value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Splitting the whole dataset into training and test sets"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Boolean mask over the column labels: True only for the 'Class' target column.\n",
"is_label = df.columns == 'Class'\n",
"\n",
"# Features are every non-label column; the target stays a one-column DataFrame.\n",
"X = df.loc[:, ~is_label]\n",
"y = df.loc[:, is_label]\n",
"\n",
"# Hold out 30% of the rows for testing; random_state=0 fixes the shuffle.\n",
"# NOTE(review): the split is not stratified on Class - confirm that is intended.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.3, random_state=0\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Statistical measures of the training set drawn from the whole dataset"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 199364 entries, 161145 to 117952\n",
"Data columns (total 31 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Time 199364 non-null float64\n",
" 1 V1 199364 non-null float64\n",
" 2 V2 199364 non-null float64\n",