ium_487184/zad_02.ipynb
bartosz.maslanka.consultant 3f7a91f1db add jenkinsfile
2023-04-21 15:03:48 +02:00

1050 lines
36 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "12dba44a",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1d480e94",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"zsh:1: command not found: kaggle\r\n"
]
}
],
"source": [
"# Needs the Kaggle CLI on PATH and API credentials in ~/.kaggle/kaggle.json.\n",
"# `-d` expects an <owner>/<dataset> slug, not a file name.\n",
"# NOTE(review): slug assumed from the CSV name - confirm it is the intended dataset.\n",
"!kaggle datasets download -d elakiricoder/gender-classification-dataset --unzip\n",
"# The archive is expected to hold gender_classification_v7.csv; rename it to the\n",
"# file name that the loading cell reads.\n",
"!mv gender_classification_v7.csv gender_class.csv"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "13a40d88",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>11.8</td>\n",
" <td>6.1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>14.0</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>11.8</td>\n",
" <td>6.3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>14.4</td>\n",
" <td>6.1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>13.5</td>\n",
" <td>5.9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4996</th>\n",
" <td>1</td>\n",
" <td>13.6</td>\n",
" <td>5.1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4997</th>\n",
" <td>1</td>\n",
" <td>11.9</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4998</th>\n",
" <td>1</td>\n",
" <td>12.9</td>\n",
" <td>5.7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4999</th>\n",
" <td>1</td>\n",
" <td>13.2</td>\n",
" <td>6.2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5000</th>\n",
" <td>1</td>\n",
" <td>15.4</td>\n",
" <td>5.4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5001 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 11.8 6.1 1 0 \n",
"1 0 14.0 5.4 0 0 \n",
"2 0 11.8 6.3 1 1 \n",
"3 0 14.4 6.1 0 1 \n",
"4 1 13.5 5.9 0 0 \n",
"... ... ... ... ... ... \n",
"4996 1 13.6 5.1 0 0 \n",
"4997 1 11.9 5.4 0 0 \n",
"4998 1 12.9 5.7 0 0 \n",
"4999 1 13.2 6.2 0 0 \n",
"5000 1 15.4 5.4 1 1 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"0 1 1 Male \n",
"1 1 0 Female \n",
"2 1 1 Male \n",
"3 1 1 Male \n",
"4 0 0 Female \n",
"... ... ... ... \n",
"4996 0 0 Female \n",
"4997 0 0 Female \n",
"4998 0 0 Female \n",
"4999 0 0 Female \n",
"5000 1 1 Male \n",
"\n",
"[5001 rows x 8 columns]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the dataset from a CSV file in the current working directory.\n",
"raw_data = pd.read_csv(\"gender_class.csv\")\n",
"# Bare last expression: rendered as the cell's output.\n",
"raw_data"
]
},
{
"cell_type": "markdown",
"id": "51c05e9a",
"metadata": {},
"source": [
"Wyczyści zbiór z artefaktów (np. puste linie, przykłady z niepoprawnymi wartościami)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "c70571df",
"metadata": {},
"outputs": [],
"source": [
"def clean_data(data):\n",
"    \"\"\"Return a copy of ``data`` without the rows that contain missing values.\n",
"\n",
"    The frame passed in is not modified, so the cell can be re-run and the\n",
"    unfiltered frame stays available to the caller.\n",
"\n",
"    Args:\n",
"        data: pandas DataFrame to clean.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with every row that has a missing value in any\n",
"        column removed; surviving rows keep their original index labels.\n",
"    \"\"\"\n",
"    return data.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "0481b0dd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>11.8</td>\n",
" <td>6.1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>14.0</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>11.8</td>\n",
" <td>6.3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>14.4</td>\n",
" <td>6.1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>13.5</td>\n",
" <td>5.9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4996</th>\n",
" <td>1</td>\n",
" <td>13.6</td>\n",
" <td>5.1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4997</th>\n",
" <td>1</td>\n",
" <td>11.9</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4998</th>\n",
" <td>1</td>\n",
" <td>12.9</td>\n",
" <td>5.7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4999</th>\n",
" <td>1</td>\n",
" <td>13.2</td>\n",
" <td>6.2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5000</th>\n",
" <td>1</td>\n",
" <td>15.4</td>\n",
" <td>5.4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5001 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 11.8 6.1 1 0 \n",
"1 0 14.0 5.4 0 0 \n",
"2 0 11.8 6.3 1 1 \n",
"3 0 14.4 6.1 0 1 \n",
"4 1 13.5 5.9 0 0 \n",
"... ... ... ... ... ... \n",
"4996 1 13.6 5.1 0 0 \n",
"4997 1 11.9 5.4 0 0 \n",
"4998 1 12.9 5.7 0 0 \n",
"4999 1 13.2 6.2 0 0 \n",
"5000 1 15.4 5.4 1 1 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"0 1 1 Male \n",
"1 1 0 Female \n",
"2 1 1 Male \n",
"3 1 1 Male \n",
"4 0 0 Female \n",
"... ... ... ... \n",
"4996 0 0 Female \n",
"4997 0 0 Female \n",
"4998 0 0 Female \n",
"4999 0 0 Female \n",
"5000 1 1 Male \n",
"\n",
"[5001 rows x 8 columns]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Drop rows with missing values; the result is rebound to the same name.\n",
"raw_data = clean_data(raw_data)\n",
"raw_data"
]
},
{
"cell_type": "markdown",
"id": "717fab23",
"metadata": {},
"source": [
"Dokona normalizacji danych w zbiorze (np. normalizacja wartości float do zakresu 0.0 - 1.0)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "7fcacf03",
"metadata": {},
"outputs": [],
"source": [
"def normalize_data(data):\n",
"    \"\"\"Min-max scale every float column of ``data`` to the range 0.0 - 1.0.\n",
"\n",
"    Works on a copy, so the frame passed in keeps its original values.\n",
"    Columns of any other dtype (integer flags, strings) are left as they are.\n",
"\n",
"    Args:\n",
"        data: pandas DataFrame to normalize.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with the same index and columns as ``data`` in which\n",
"        each float column ``c`` is replaced by ``(c - min) / (max - min)``.\n",
"        A float column whose values are all equal has no range to divide by\n",
"        and is mapped to 0.0 instead of the NaN that 0/0 would produce.\n",
"    \"\"\"\n",
"    normalized = data.copy()\n",
"    for col in normalized.columns:\n",
"        column = normalized[col]\n",
"        # is_float_dtype covers every float width, not only float64.\n",
"        if not pd.api.types.is_float_dtype(column):\n",
"            continue\n",
"        low = column.min()\n",
"        span = column.max() - low\n",
"        shifted = column - low\n",
"        # A constant column has span == 0; skip the division to avoid NaN.\n",
"        normalized[col] = shifted if span == 0 else shifted / span\n",
"\n",
"    return normalized"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "bfd844ad",
"metadata": {},
"outputs": [],
"source": [
"# Scale the float columns of the cleaned frame to the 0.0 - 1.0 range.\n",
"normalized_data = normalize_data(raw_data)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "2d0b8499",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0.097561</td>\n",
" <td>0.50</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0.634146</td>\n",
" <td>0.15</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0.097561</td>\n",
" <td>0.60</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0.731707</td>\n",
" <td>0.50</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>0.512195</td>\n",
" <td>0.40</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4996</th>\n",
" <td>1</td>\n",
" <td>0.536585</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4997</th>\n",
" <td>1</td>\n",
" <td>0.121951</td>\n",
" <td>0.15</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4998</th>\n",
" <td>1</td>\n",
" <td>0.365854</td>\n",
" <td>0.30</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4999</th>\n",
" <td>1</td>\n",
" <td>0.439024</td>\n",
" <td>0.55</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5000</th>\n",
" <td>1</td>\n",
" <td>0.975610</td>\n",
" <td>0.15</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5001 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 0.097561 0.50 1 0 \n",
"1 0 0.634146 0.15 0 0 \n",
"2 0 0.097561 0.60 1 1 \n",
"3 0 0.731707 0.50 0 1 \n",
"4 1 0.512195 0.40 0 0 \n",
"... ... ... ... ... ... \n",
"4996 1 0.536585 0.00 0 0 \n",
"4997 1 0.121951 0.15 0 0 \n",
"4998 1 0.365854 0.30 0 0 \n",
"4999 1 0.439024 0.55 0 0 \n",
"5000 1 0.975610 0.15 1 1 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"0 1 1 Male \n",
"1 1 0 Female \n",
"2 1 1 Male \n",
"3 1 1 Male \n",
"4 0 0 Female \n",
"... ... ... ... \n",
"4996 0 0 Female \n",
"4997 0 0 Female \n",
"4998 0 0 Female \n",
"4999 0 0 Female \n",
"5000 1 1 Male \n",
"\n",
"[5001 rows x 8 columns]"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect the result of the normalization step.\n",
"normalized_data"
]
},
{
"cell_type": "markdown",
"id": "61fbcddc",
"metadata": {},
"source": [
"2. Jeśli brak w zbiorze gotowego podziału na podzbiory train/dev/test, to dokona takiego podziału"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "dc386189",
"metadata": {},
"outputs": [],
"source": [
"# Shuffle once with a fixed seed so the split is reproducible, then cut the\n",
"# shuffled rows 70% / 15% / 15% into train / dev / test.\n",
"# NOTE(review): the min-max scaling above was computed on the full set before\n",
"# this split, so dev/test statistics influence the training features.\n",
"shuffled = normalized_data.sample(frac=1, random_state=42)\n",
"train_end = int(.7 * len(shuffled))\n",
"dev_end = int(.85 * len(shuffled))\n",
"\n",
"# Positional slices instead of np.split: np.split on a DataFrame goes through\n",
"# the deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.\n",
"train = shuffled.iloc[:train_end]\n",
"dev = shuffled.iloc[train_end:dev_end]\n",
"test = shuffled.iloc[dev_end:]\n",
"\n",
"# save each split to its own CSV file\n",
"train.to_csv('train.csv', index=False)\n",
"dev.to_csv('dev.csv', index=False)\n",
"test.to_csv('test.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "9f888962",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4432</th>\n",
" <td>1</td>\n",
" <td>0.512195</td>\n",
" <td>0.10</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2162</th>\n",
" <td>1</td>\n",
" <td>0.243902</td>\n",
" <td>0.70</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2396</th>\n",
" <td>1</td>\n",
" <td>0.512195</td>\n",
" <td>0.15</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4769</th>\n",
" <td>1</td>\n",
" <td>0.853659</td>\n",
" <td>0.10</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2271</th>\n",
" <td>1</td>\n",
" <td>0.292683</td>\n",
" <td>0.70</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>846</th>\n",
" <td>1</td>\n",
" <td>0.097561</td>\n",
" <td>0.45</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2551</th>\n",
" <td>0</td>\n",
" <td>0.243902</td>\n",
" <td>0.35</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2928</th>\n",
" <td>1</td>\n",
" <td>0.634146</td>\n",
" <td>0.20</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>1</td>\n",
" <td>0.707317</td>\n",
" <td>0.50</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>645</th>\n",
" <td>1</td>\n",
" <td>0.195122</td>\n",
" <td>0.05</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>750 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"4432 1 0.512195 0.10 1 1 \n",
"2162 1 0.243902 0.70 1 1 \n",
"2396 1 0.512195 0.15 1 0 \n",
"4769 1 0.853659 0.10 1 1 \n",
"2271 1 0.292683 0.70 0 1 \n",
"... ... ... ... ... ... \n",
"846 1 0.097561 0.45 1 1 \n",
"2551 0 0.243902 0.35 1 1 \n",
"2928 1 0.634146 0.20 0 0 \n",
"117 1 0.707317 0.50 0 0 \n",
"645 1 0.195122 0.05 1 0 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"4432 1 1 Male \n",
"2162 1 1 Male \n",
"2396 0 0 Female \n",
"4769 0 1 Male \n",
"2271 0 0 Female \n",
"... ... ... ... \n",
"846 1 1 Male \n",
"2551 1 1 Male \n",
"2928 0 0 Female \n",
"117 0 0 Female \n",
"645 0 0 Female \n",
"\n",
"[750 rows x 8 columns]"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect the dev split.\n",
"dev"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "4598cea1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 5001.000000 5001.000000 5001.000000 5001.000000 \n",
"mean 0.869626 0.434508 0.423155 0.493901 \n",
"std 0.336748 0.270031 0.270634 0.500013 \n",
"min 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 0.195122 0.200000 0.000000 \n",
"50% 1.000000 0.414634 0.400000 0.000000 \n",
"75% 1.000000 0.634146 0.650000 1.000000 \n",
"max 1.000000 1.000000 1.000000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 5001.000000 5001.000000 5001.000000 \n",
"mean 0.507898 0.493101 0.498900 \n",
"std 0.499988 0.500002 0.500049 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 1.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 3500.000000 3500.000000 3500.000000 3500.000000 \n",
"mean 0.870000 0.436021 0.425900 0.505714 \n",
"std 0.336351 0.270492 0.271348 0.500039 \n",
"min 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 0.195122 0.200000 0.000000 \n",
"50% 1.000000 0.414634 0.400000 1.000000 \n",
"75% 1.000000 0.634146 0.650000 1.000000 \n",
"max 1.000000 1.000000 1.000000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 3500.000000 3500.000000 3500.000000 \n",
"mean 0.522000 0.499429 0.507714 \n",
"std 0.499587 0.500071 0.500012 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 1.000000 0.000000 1.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 750.000000 750.000000 750.000000 750.000000 \n",
"mean 0.870667 0.419285 0.416933 0.472000 \n",
"std 0.335792 0.264474 0.269500 0.499549 \n",
"min 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 0.195122 0.200000 0.000000 \n",
"50% 1.000000 0.414634 0.400000 0.000000 \n",
"75% 1.000000 0.634146 0.637500 1.000000 \n",
"max 1.000000 1.000000 1.000000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 750.000000 750.000000 750.000000 \n",
"mean 0.466667 0.481333 0.465333 \n",
"std 0.499221 0.499985 0.499130 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 751.000000 751.000000 751.000000 751.000000 \n",
"mean 0.866844 0.442662 0.416578 0.460719 \n",
"std 0.339969 0.273141 0.268567 0.498787 \n",
"min 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 0.195122 0.200000 0.000000 \n",
"50% 1.000000 0.439024 0.400000 0.000000 \n",
"75% 1.000000 0.658537 0.600000 1.000000 \n",
"max 1.000000 1.000000 1.000000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 751.000000 751.000000 751.000000 \n",
"mean 0.483356 0.475366 0.491345 \n",
"std 0.500056 0.499726 0.500258 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n"
]
}
],
"source": [
"# Summary statistics of the full normalized set followed by the train, dev\n",
"# and test splits. The full set is referenced as normalized_data so the\n",
"# comparison does not depend on raw_data having been scaled in place.\n",
"for subset in [normalized_data, train, dev, test]:\n",
"    print(subset.describe())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8fa84a56",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}