ium_487184/zad_02.ipynb

1050 lines
36 KiB
Plaintext
Raw Normal View History

2023-04-21 14:50:47 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
2023-04-21 15:03:48 +02:00
"id": "12dba44a",
2023-04-21 14:50:47 +02:00
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
2023-04-21 15:03:48 +02:00
"id": "1d480e94",
2023-04-21 14:50:47 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"zsh:1: command not found: kaggle\r\n"
]
}
],
"source": [
"!kaggle datasets download -d gender_classification_v7.csv"
]
},
{
"cell_type": "code",
2023-04-21 15:03:48 +02:00
"execution_count": 41,
"id": "13a40d88",
2023-04-21 14:50:47 +02:00
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>11.8</td>\n",
" <td>6.1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>14.0</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>11.8</td>\n",
" <td>6.3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>14.4</td>\n",
" <td>6.1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>13.5</td>\n",
" <td>5.9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4996</th>\n",
" <td>1</td>\n",
" <td>13.6</td>\n",
" <td>5.1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4997</th>\n",
" <td>1</td>\n",
" <td>11.9</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4998</th>\n",
" <td>1</td>\n",
" <td>12.9</td>\n",
" <td>5.7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4999</th>\n",
" <td>1</td>\n",
" <td>13.2</td>\n",
" <td>6.2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5000</th>\n",
" <td>1</td>\n",
" <td>15.4</td>\n",
" <td>5.4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5001 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 11.8 6.1 1 0 \n",
"1 0 14.0 5.4 0 0 \n",
"2 0 11.8 6.3 1 1 \n",
"3 0 14.4 6.1 0 1 \n",
"4 1 13.5 5.9 0 0 \n",
"... ... ... ... ... ... \n",
"4996 1 13.6 5.1 0 0 \n",
"4997 1 11.9 5.4 0 0 \n",
"4998 1 12.9 5.7 0 0 \n",
"4999 1 13.2 6.2 0 0 \n",
"5000 1 15.4 5.4 1 1 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"0 1 1 Male \n",
"1 1 0 Female \n",
"2 1 1 Male \n",
"3 1 1 Male \n",
"4 0 0 Female \n",
"... ... ... ... \n",
"4996 0 0 Female \n",
"4997 0 0 Female \n",
"4998 0 0 Female \n",
"4999 0 0 Female \n",
"5000 1 1 Male \n",
"\n",
"[5001 rows x 8 columns]"
]
},
2023-04-21 15:03:48 +02:00
"execution_count": 41,
2023-04-21 14:50:47 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_data = pd.read_csv(\"gender_class.csv\")\n",
"raw_data"
]
},
{
"cell_type": "markdown",
2023-04-21 15:03:48 +02:00
"id": "51c05e9a",
2023-04-21 14:50:47 +02:00
"metadata": {},
"source": [
"Wyczyści zbiór z artefaktów (np. puste linie, przykłady z niepoprawnymi wartościami)"
]
},
{
"cell_type": "code",
2023-04-21 15:03:48 +02:00
"execution_count": 42,
"id": "c70571df",
2023-04-21 14:50:47 +02:00
"metadata": {},
"outputs": [],
"source": [
"def clean_data(data):\n",
" data.dropna(inplace=True)\n",
" return data"
]
},
{
"cell_type": "code",
2023-04-21 15:03:48 +02:00
"execution_count": 43,
"id": "0481b0dd",
2023-04-21 14:50:47 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
2023-04-21 15:03:48 +02:00
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>11.8</td>\n",
" <td>6.1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>14.0</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>11.8</td>\n",
" <td>6.3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>14.4</td>\n",
" <td>6.1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>13.5</td>\n",
" <td>5.9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4996</th>\n",
" <td>1</td>\n",
" <td>13.6</td>\n",
" <td>5.1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4997</th>\n",
" <td>1</td>\n",
" <td>11.9</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4998</th>\n",
" <td>1</td>\n",
" <td>12.9</td>\n",
" <td>5.7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4999</th>\n",
" <td>1</td>\n",
" <td>13.2</td>\n",
" <td>6.2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5000</th>\n",
" <td>1</td>\n",
" <td>15.4</td>\n",
" <td>5.4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
2023-04-21 14:50:47 +02:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2023-04-21 15:03:48 +02:00
"<p>5001 rows × 8 columns</p>\n",
2023-04-21 14:50:47 +02:00
"</div>"
],
"text/plain": [
2023-04-21 15:03:48 +02:00
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 11.8 6.1 1 0 \n",
"1 0 14.0 5.4 0 0 \n",
"2 0 11.8 6.3 1 1 \n",
"3 0 14.4 6.1 0 1 \n",
"4 1 13.5 5.9 0 0 \n",
"... ... ... ... ... ... \n",
"4996 1 13.6 5.1 0 0 \n",
"4997 1 11.9 5.4 0 0 \n",
"4998 1 12.9 5.7 0 0 \n",
"4999 1 13.2 6.2 0 0 \n",
"5000 1 15.4 5.4 1 1 \n",
2023-04-21 14:50:47 +02:00
"\n",
2023-04-21 15:03:48 +02:00
" lips_thin distance_nose_to_lip_long gender \n",
"0 1 1 Male \n",
"1 1 0 Female \n",
"2 1 1 Male \n",
"3 1 1 Male \n",
"4 0 0 Female \n",
"... ... ... ... \n",
"4996 0 0 Female \n",
"4997 0 0 Female \n",
"4998 0 0 Female \n",
"4999 0 0 Female \n",
"5000 1 1 Male \n",
2023-04-21 14:50:47 +02:00
"\n",
2023-04-21 15:03:48 +02:00
"[5001 rows x 8 columns]"
2023-04-21 14:50:47 +02:00
]
},
2023-04-21 15:03:48 +02:00
"execution_count": 43,
2023-04-21 14:50:47 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2023-04-21 15:03:48 +02:00
"raw_data = clean_data(raw_data)\n",
"raw_data"
2023-04-21 14:50:47 +02:00
]
},
{
2023-04-21 15:03:48 +02:00
"cell_type": "markdown",
"id": "717fab23",
"metadata": {},
2023-04-21 14:50:47 +02:00
"source": [
2023-04-21 15:03:48 +02:00
"Dokona normalizacji danych w zbiorze (np. normalizacja wartości float do zakresu 0.0 - 1.0)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "7fcacf03",
"metadata": {},
"outputs": [],
"source": [
"def normalize_data(data):\n",
" # znormalizuj wartości float do zakresu 0.0 - 1.0\n",
" for col in data.columns:\n",
" if data[col].dtype == float:\n",
" data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())\n",
"\n",
" return data"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "bfd844ad",
"metadata": {},
"outputs": [],
"source": [
"normalized_data = normalize_data(raw_data)"
2023-04-21 14:50:47 +02:00
]
},
{
"cell_type": "code",
2023-04-21 15:03:48 +02:00
"execution_count": 46,
"id": "2d0b8499",
2023-04-21 14:50:47 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>0</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.097561</td>\n",
" <td>0.50</td>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0</td>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>1</th>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.634146</td>\n",
" <td>0.15</td>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
2023-04-21 15:03:48 +02:00
" <td>1</td>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>2</th>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.097561</td>\n",
" <td>0.60</td>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0.731707</td>\n",
" <td>0.50</td>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>4</th>\n",
" <td>1</td>\n",
" <td>0.512195</td>\n",
" <td>0.40</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 14:50:47 +02:00
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>4996</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.536585</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>4997</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.121951</td>\n",
2023-04-21 14:50:47 +02:00
" <td>0.15</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>4998</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.365854</td>\n",
" <td>0.30</td>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>4999</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.439024</td>\n",
" <td>0.55</td>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
2023-04-21 15:03:48 +02:00
" <tr>\n",
" <th>5000</th>\n",
" <td>1</td>\n",
" <td>0.975610</td>\n",
" <td>0.15</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
2023-04-21 14:50:47 +02:00
" </tbody>\n",
"</table>\n",
2023-04-21 15:03:48 +02:00
"<p>5001 rows × 8 columns</p>\n",
2023-04-21 14:50:47 +02:00
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
2023-04-21 15:03:48 +02:00
"0 1 0.097561 0.50 1 0 \n",
"1 0 0.634146 0.15 0 0 \n",
"2 0 0.097561 0.60 1 1 \n",
"3 0 0.731707 0.50 0 1 \n",
"4 1 0.512195 0.40 0 0 \n",
2023-04-21 14:50:47 +02:00
"... ... ... ... ... ... \n",
2023-04-21 15:03:48 +02:00
"4996 1 0.536585 0.00 0 0 \n",
"4997 1 0.121951 0.15 0 0 \n",
"4998 1 0.365854 0.30 0 0 \n",
"4999 1 0.439024 0.55 0 0 \n",
"5000 1 0.975610 0.15 1 1 \n",
2023-04-21 14:50:47 +02:00
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
2023-04-21 15:03:48 +02:00
"0 1 1 Male \n",
"1 1 0 Female \n",
"2 1 1 Male \n",
"3 1 1 Male \n",
"4 0 0 Female \n",
2023-04-21 14:50:47 +02:00
"... ... ... ... \n",
2023-04-21 15:03:48 +02:00
"4996 0 0 Female \n",
"4997 0 0 Female \n",
"4998 0 0 Female \n",
"4999 0 0 Female \n",
"5000 1 1 Male \n",
2023-04-21 14:50:47 +02:00
"\n",
2023-04-21 15:03:48 +02:00
"[5001 rows x 8 columns]"
2023-04-21 14:50:47 +02:00
]
},
2023-04-21 15:03:48 +02:00
"execution_count": 46,
2023-04-21 14:50:47 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2023-04-21 15:03:48 +02:00
"normalized_data"
]
},
{
"cell_type": "markdown",
"id": "61fbcddc",
"metadata": {},
"source": [
"2. Jeśli brak w zbiorze gotowego podziału na podzbiory train/dev/test, to dokona takiego podziału"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "dc386189",
"metadata": {},
"outputs": [],
"source": [
"train, dev, test = np.split(normalized_data.sample(frac=1, random_state=42), [int(.7*len(normalized_data)), int(.85*len(normalized_data))])\n",
"\n",
"# zapisz dane w osobnych plikach csv\n",
"train.to_csv('train.csv', index=False)\n",
"dev.to_csv('dev.csv', index=False)\n",
"test.to_csv('test.csv', index=False)"
2023-04-21 14:50:47 +02:00
]
},
{
"cell_type": "code",
2023-04-21 15:03:48 +02:00
"execution_count": 48,
"id": "9f888962",
2023-04-21 14:50:47 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>4432</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.512195</td>\n",
" <td>0.10</td>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>2162</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.243902</td>\n",
" <td>0.70</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
2023-04-21 14:50:47 +02:00
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>2396</th>\n",
" <td>1</td>\n",
" <td>0.512195</td>\n",
" <td>0.15</td>\n",
" <td>1</td>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>4769</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.853659</td>\n",
" <td>0.10</td>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0</td>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>2271</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.292683</td>\n",
" <td>0.70</td>\n",
" <td>0</td>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
2023-04-21 14:50:47 +02:00
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>846</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.097561</td>\n",
2023-04-21 14:50:47 +02:00
" <td>0.45</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2551</th>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.243902</td>\n",
" <td>0.35</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
2023-04-21 14:50:47 +02:00
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>2928</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.634146</td>\n",
" <td>0.20</td>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>117</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.707317</td>\n",
" <td>0.50</td>\n",
2023-04-21 14:50:47 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
2023-04-21 15:03:48 +02:00
" <th>645</th>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
2023-04-21 15:03:48 +02:00
" <td>0.195122</td>\n",
" <td>0.05</td>\n",
2023-04-21 14:50:47 +02:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
2023-04-21 15:03:48 +02:00
"<p>750 rows × 8 columns</p>\n",
2023-04-21 14:50:47 +02:00
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
2023-04-21 15:03:48 +02:00
"4432 1 0.512195 0.10 1 1 \n",
"2162 1 0.243902 0.70 1 1 \n",
"2396 1 0.512195 0.15 1 0 \n",
"4769 1 0.853659 0.10 1 1 \n",
"2271 1 0.292683 0.70 0 1 \n",
2023-04-21 14:50:47 +02:00
"... ... ... ... ... ... \n",
2023-04-21 15:03:48 +02:00
"846 1 0.097561 0.45 1 1 \n",
"2551 0 0.243902 0.35 1 1 \n",
"2928 1 0.634146 0.20 0 0 \n",
"117 1 0.707317 0.50 0 0 \n",
"645 1 0.195122 0.05 1 0 \n",
2023-04-21 14:50:47 +02:00
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
2023-04-21 15:03:48 +02:00
"4432 1 1 Male \n",
"2162 1 1 Male \n",
"2396 0 0 Female \n",
"4769 0 1 Male \n",
"2271 0 0 Female \n",
2023-04-21 14:50:47 +02:00
"... ... ... ... \n",
2023-04-21 15:03:48 +02:00
"846 1 1 Male \n",
"2551 1 1 Male \n",
"2928 0 0 Female \n",
"117 0 0 Female \n",
"645 0 0 Female \n",
2023-04-21 14:50:47 +02:00
"\n",
2023-04-21 15:03:48 +02:00
"[750 rows x 8 columns]"
2023-04-21 14:50:47 +02:00
]
},
2023-04-21 15:03:48 +02:00
"execution_count": 48,
2023-04-21 14:50:47 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2023-04-21 15:03:48 +02:00
"dev"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "4598cea1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 5001.000000 5001.000000 5001.000000 5001.000000 \n",
"mean 0.869626 0.434508 0.423155 0.493901 \n",
"std 0.336748 0.270031 0.270634 0.500013 \n",
"min 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 0.195122 0.200000 0.000000 \n",
"50% 1.000000 0.414634 0.400000 0.000000 \n",
"75% 1.000000 0.634146 0.650000 1.000000 \n",
"max 1.000000 1.000000 1.000000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 5001.000000 5001.000000 5001.000000 \n",
"mean 0.507898 0.493101 0.498900 \n",
"std 0.499988 0.500002 0.500049 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 1.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 3500.000000 3500.000000 3500.000000 3500.000000 \n",
"mean 0.870000 0.436021 0.425900 0.505714 \n",
"std 0.336351 0.270492 0.271348 0.500039 \n",
"min 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 0.195122 0.200000 0.000000 \n",
"50% 1.000000 0.414634 0.400000 1.000000 \n",
"75% 1.000000 0.634146 0.650000 1.000000 \n",
"max 1.000000 1.000000 1.000000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 3500.000000 3500.000000 3500.000000 \n",
"mean 0.522000 0.499429 0.507714 \n",
"std 0.499587 0.500071 0.500012 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 1.000000 0.000000 1.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 750.000000 750.000000 750.000000 750.000000 \n",
"mean 0.870667 0.419285 0.416933 0.472000 \n",
"std 0.335792 0.264474 0.269500 0.499549 \n",
"min 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 0.195122 0.200000 0.000000 \n",
"50% 1.000000 0.414634 0.400000 0.000000 \n",
"75% 1.000000 0.634146 0.637500 1.000000 \n",
"max 1.000000 1.000000 1.000000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 750.000000 750.000000 750.000000 \n",
"mean 0.466667 0.481333 0.465333 \n",
"std 0.499221 0.499985 0.499130 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 751.000000 751.000000 751.000000 751.000000 \n",
"mean 0.866844 0.442662 0.416578 0.460719 \n",
"std 0.339969 0.273141 0.268567 0.498787 \n",
"min 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 0.195122 0.200000 0.000000 \n",
"50% 1.000000 0.439024 0.400000 0.000000 \n",
"75% 1.000000 0.658537 0.600000 1.000000 \n",
"max 1.000000 1.000000 1.000000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 751.000000 751.000000 751.000000 \n",
"mean 0.483356 0.475366 0.491345 \n",
"std 0.500056 0.499726 0.500258 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n"
]
}
],
"source": [
"for d in [raw_data,train, dev, test]:\n",
" print( d.describe())"
2023-04-21 14:50:47 +02:00
]
},
{
"cell_type": "code",
"execution_count": null,
2023-04-21 15:03:48 +02:00
"id": "8fa84a56",
2023-04-21 14:50:47 +02:00
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}