ium_487184/zad_02.ipynb

1026 lines
34 KiB
Plaintext
Raw Normal View History

2023-04-21 14:50:47 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "f5229180",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "2d3b5bee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"zsh:1: command not found: kaggle\r\n"
]
}
],
"source": [
"# Download the data set with the Kaggle CLI; the `kaggle` executable must be on PATH.\n",
"# NOTE(review): `-d` normally takes an `<owner>/<dataset>` slug, but a CSV file name is\n",
"# passed here, and the loading cell reads `gender_class.csv` - confirm slug and file name.\n",
"!kaggle datasets download -d gender_classification_v7.csv"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "fbbeb52d",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>11.8</td>\n",
" <td>6.1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>14.0</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>11.8</td>\n",
" <td>6.3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>14.4</td>\n",
" <td>6.1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>13.5</td>\n",
" <td>5.9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4996</th>\n",
" <td>1</td>\n",
" <td>13.6</td>\n",
" <td>5.1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4997</th>\n",
" <td>1</td>\n",
" <td>11.9</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4998</th>\n",
" <td>1</td>\n",
" <td>12.9</td>\n",
" <td>5.7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4999</th>\n",
" <td>1</td>\n",
" <td>13.2</td>\n",
" <td>6.2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5000</th>\n",
" <td>1</td>\n",
" <td>15.4</td>\n",
" <td>5.4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5001 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 11.8 6.1 1 0 \n",
"1 0 14.0 5.4 0 0 \n",
"2 0 11.8 6.3 1 1 \n",
"3 0 14.4 6.1 0 1 \n",
"4 1 13.5 5.9 0 0 \n",
"... ... ... ... ... ... \n",
"4996 1 13.6 5.1 0 0 \n",
"4997 1 11.9 5.4 0 0 \n",
"4998 1 12.9 5.7 0 0 \n",
"4999 1 13.2 6.2 0 0 \n",
"5000 1 15.4 5.4 1 1 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"0 1 1 Male \n",
"1 1 0 Female \n",
"2 1 1 Male \n",
"3 1 1 Male \n",
"4 0 0 Female \n",
"... ... ... ... \n",
"4996 0 0 Female \n",
"4997 0 0 Female \n",
"4998 0 0 Female \n",
"4999 0 0 Female \n",
"5000 1 1 Male \n",
"\n",
"[5001 rows x 8 columns]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the data set from the working directory; the bare name on the last line displays it.\n",
"raw_data = pd.read_csv(\"gender_class.csv\")\n",
"raw_data"
]
},
{
"cell_type": "markdown",
"id": "1f9629f1",
"metadata": {},
"source": [
"Wyczyści zbiór z artefaktów (np. puste linie, przykłady z niepoprawnymi wartościami)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "ad18b250",
"metadata": {},
"outputs": [],
"source": [
"def clean_data(data):\n",
"    \"\"\"Return a cleaned copy of ``data`` without missing or negative values.\n",
"\n",
"    Rows containing any missing value are dropped first, then every row that\n",
"    holds a negative value in a numeric column. The input frame is left\n",
"    untouched.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : pandas.DataFrame\n",
"        Frame to clean; non-numeric columns are only checked for missing values.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        New frame holding the surviving rows under their original index labels.\n",
"    \"\"\"\n",
"    # dropna() without inplace=True returns a new frame, so the caller's object\n",
"    # is not mutated as a side effect.\n",
"    cleaned = data.dropna()\n",
"\n",
"    # Remove rows with invalid values. Only a lower bound is enforced: requiring\n",
"    # floats to lie in 0.0-1.0 would reject every measurement that has not been\n",
"    # normalised yet and leave the data set empty.\n",
"    numeric = cleaned.select_dtypes(include='number')\n",
"    non_negative = (numeric >= 0).all(axis=1)\n",
"    return cleaned[non_negative]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "8154dfd8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [long_hair, forehead_width_cm, forehead_height_cm, nose_wide, nose_long, lips_thin, distance_nose_to_lip_long, gender]\n",
"Index: []"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rebind raw_data to the cleaned frame (the normalisation cell reads this name) and display it.\n",
"raw_data = clean_data(raw_data)\n",
"raw_data"
]
},
{
"cell_type": "markdown",
"id": "bd27b530",
"metadata": {},
"source": [
"Dokona normalizacji danych w zbiorze (np. normalizacja wartości float do zakresu 0.0 - 1.0)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b81c3005",
"metadata": {},
"outputs": [],
"source": [
"def normalize_data(data):\n",
"    \"\"\"Return a copy of ``data`` with float columns min-max scaled to 0.0-1.0.\n",
"\n",
"    Each float column is rescaled as ``(x - min) / (max - min)``; all other\n",
"    columns are copied unchanged. A float column whose values are all equal\n",
"    has no range to divide by and is mapped to 0.0. The input frame is left\n",
"    untouched.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : pandas.DataFrame\n",
"        Frame to normalise.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        New frame with the same columns and index as ``data``.\n",
"    \"\"\"\n",
"    # Work on a copy so the caller's frame (possibly a slice of another frame)\n",
"    # is never modified in place.\n",
"    normalized = data.copy()\n",
"\n",
"    # Normalise float values to the 0.0-1.0 range.\n",
"    for col in normalized.columns:\n",
"        if not pd.api.types.is_float_dtype(normalized[col]):\n",
"            continue\n",
"        low = normalized[col].min()\n",
"        span = normalized[col].max() - low\n",
"        if span == 0:\n",
"            # Constant column: dividing by the zero range would yield NaN,\n",
"            # so only shift the values down to 0.0.\n",
"            normalized[col] = normalized[col] - low\n",
"        else:\n",
"            normalized[col] = (normalized[col] - low) / span\n",
"\n",
"    return normalized"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "611929ca",
"metadata": {},
"outputs": [],
"source": [
"# Scale the float columns of the whole data set in one pass.\n",
"# NOTE(review): this runs before the train/dev/test split, so the min/max used for\n",
"# scaling also come from rows that end up in dev and test - confirm that is intended.\n",
"normalized_data = normalize_data(raw_data)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "64724998",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [long_hair, forehead_width_cm, forehead_height_cm, nose_wide, nose_long, lips_thin, distance_nose_to_lip_long, gender]\n",
"Index: []"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the normalised frame for inspection.\n",
"normalized_data"
]
},
{
"cell_type": "markdown",
"id": "7013b00e",
"metadata": {},
"source": [
"2. Jeśli brak w zbiorze gotowego podziału na podzbiory train/dev/test, to dokona takiego podziału"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "9eb24b71",
"metadata": {},
"outputs": [],
"source": [
"# Shuffle once with a fixed seed so the split is reproducible, then cut the\n",
"# shuffled frame at 70% and 85% of its length to get train / dev / test.\n",
"# Plain iloc slicing is used instead of np.split, which routes a DataFrame\n",
"# through the deprecated DataFrame.swapaxes.\n",
"shuffled = normalized_data.sample(frac=1, random_state=42)\n",
"train_end = int(.7*len(shuffled))\n",
"dev_end = int(.85*len(shuffled))\n",
"train = shuffled.iloc[:train_end]\n",
"dev = shuffled.iloc[train_end:dev_end]\n",
"test = shuffled.iloc[dev_end:]\n",
"\n",
"# Save each split to its own CSV file.\n",
"train.to_csv('train.csv', index=False)\n",
"dev.to_csv('dev.csv', index=False)\n",
"test.to_csv('test.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "81d1cd62",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [long_hair, forehead_width_cm, forehead_height_cm, nose_wide, nose_long, lips_thin, distance_nose_to_lip_long, gender]\n",
"Index: []"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the dev split for inspection.\n",
"dev"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "851d9aa0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 5001.000000 5001.000000 5001.000000 5001.000000 \n",
"mean 0.869626 13.181484 5.946311 0.493901 \n",
"std 0.336748 1.107128 0.541268 0.500013 \n",
"min 0.000000 11.400000 5.100000 0.000000 \n",
"25% 1.000000 12.200000 5.500000 0.000000 \n",
"50% 1.000000 13.100000 5.900000 0.000000 \n",
"75% 1.000000 14.000000 6.400000 1.000000 \n",
"max 1.000000 15.500000 7.100000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 5001.000000 5001.000000 5001.000000 \n",
"mean 0.507898 0.493101 0.498900 \n",
"std 0.499988 0.500002 0.500049 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 1.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 3500.000000 3500.000000 3500.000000 3500.000000 \n",
"mean 0.870000 13.187686 5.951800 0.505714 \n",
"std 0.336351 1.109019 0.542695 0.500039 \n",
"min 0.000000 11.400000 5.100000 0.000000 \n",
"25% 1.000000 12.200000 5.500000 0.000000 \n",
"50% 1.000000 13.100000 5.900000 1.000000 \n",
"75% 1.000000 14.000000 6.400000 1.000000 \n",
"max 1.000000 15.500000 7.100000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 3500.000000 3500.000000 3500.000000 \n",
"mean 0.522000 0.499429 0.507714 \n",
"std 0.499587 0.500071 0.500012 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 1.000000 0.000000 1.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 750.000000 750.000000 750.000000 750.000000 \n",
"mean 0.870667 13.119067 5.933867 0.472000 \n",
"std 0.335792 1.084345 0.538999 0.499549 \n",
"min 0.000000 11.400000 5.100000 0.000000 \n",
"25% 1.000000 12.200000 5.500000 0.000000 \n",
"50% 1.000000 13.100000 5.900000 0.000000 \n",
"75% 1.000000 14.000000 6.375000 1.000000 \n",
"max 1.000000 15.500000 7.100000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 750.000000 750.000000 750.000000 \n",
"mean 0.466667 0.481333 0.465333 \n",
"std 0.499221 0.499985 0.499130 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 751.000000 751.000000 751.000000 751.000000 \n",
"mean 0.866844 13.214913 5.933156 0.460719 \n",
"std 0.339969 1.119877 0.537134 0.498787 \n",
"min 0.000000 11.400000 5.100000 0.000000 \n",
"25% 1.000000 12.200000 5.500000 0.000000 \n",
"50% 1.000000 13.200000 5.900000 0.000000 \n",
"75% 1.000000 14.100000 6.300000 1.000000 \n",
"max 1.000000 15.500000 7.100000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 751.000000 751.000000 751.000000 \n",
"mean 0.483356 0.475366 0.491345 \n",
"std 0.500056 0.499726 0.500258 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n"
]
}
],
"source": [
"# Print summary statistics for the full data set, then for each of the three splits.\n",
"for subset in (raw_data, train, dev, test):\n",
"    summary = subset.describe()\n",
"    print(summary)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "f52a79aa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1501</th>\n",
" <td>1</td>\n",
" <td>0.439024</td>\n",
" <td>0.30</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2586</th>\n",
" <td>1</td>\n",
" <td>0.560976</td>\n",
" <td>0.45</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2653</th>\n",
" <td>0</td>\n",
" <td>0.365854</td>\n",
" <td>0.10</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1055</th>\n",
" <td>1</td>\n",
" <td>0.439024</td>\n",
" <td>0.40</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>705</th>\n",
" <td>0</td>\n",
" <td>0.926829</td>\n",
" <td>0.25</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2087</th>\n",
" <td>1</td>\n",
" <td>0.048780</td>\n",
" <td>0.45</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1889</th>\n",
" <td>1</td>\n",
" <td>0.048780</td>\n",
" <td>0.15</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4623</th>\n",
" <td>1</td>\n",
" <td>0.536585</td>\n",
" <td>0.20</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1591</th>\n",
" <td>1</td>\n",
" <td>1.000000</td>\n",
" <td>0.95</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1346</th>\n",
" <td>1</td>\n",
" <td>0.536585</td>\n",
" <td>0.35</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3500 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"1501 1 0.439024 0.30 1 1 \n",
"2586 1 0.560976 0.45 0 0 \n",
"2653 0 0.365854 0.10 0 0 \n",
"1055 1 0.439024 0.40 1 1 \n",
"705 0 0.926829 0.25 1 1 \n",
"... ... ... ... ... ... \n",
"2087 1 0.048780 0.45 0 1 \n",
"1889 1 0.048780 0.15 0 0 \n",
"4623 1 0.536585 0.20 0 0 \n",
"1591 1 1.000000 0.95 1 0 \n",
"1346 1 0.536585 0.35 0 0 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"1501 1 1 Male \n",
"2586 0 0 Female \n",
"2653 0 1 Female \n",
"1055 1 1 Male \n",
"705 1 1 Male \n",
"... ... ... ... \n",
"2087 1 0 Female \n",
"1889 0 0 Female \n",
"4623 0 0 Female \n",
"1591 1 0 Male \n",
"1346 0 0 Female \n",
"\n",
"[3500 rows x 8 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Min-max scale the training split and display the result (the return value is not assigned).\n",
"normalize_data(train)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "2653e41d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1501</th>\n",
" <td>1</td>\n",
" <td>0.439024</td>\n",
" <td>0.30</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2586</th>\n",
" <td>1</td>\n",
" <td>0.560976</td>\n",
" <td>0.45</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2653</th>\n",
" <td>0</td>\n",
" <td>0.365854</td>\n",
" <td>0.10</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1055</th>\n",
" <td>1</td>\n",
" <td>0.439024</td>\n",
" <td>0.40</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>705</th>\n",
" <td>0</td>\n",
" <td>0.926829</td>\n",
" <td>0.25</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2087</th>\n",
" <td>1</td>\n",
" <td>0.048780</td>\n",
" <td>0.45</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1889</th>\n",
" <td>1</td>\n",
" <td>0.048780</td>\n",
" <td>0.15</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4623</th>\n",
" <td>1</td>\n",
" <td>0.536585</td>\n",
" <td>0.20</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1591</th>\n",
" <td>1</td>\n",
" <td>1.000000</td>\n",
" <td>0.95</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1346</th>\n",
" <td>1</td>\n",
" <td>0.536585</td>\n",
" <td>0.35</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3500 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"1501 1 0.439024 0.30 1 1 \n",
"2586 1 0.560976 0.45 0 0 \n",
"2653 0 0.365854 0.10 0 0 \n",
"1055 1 0.439024 0.40 1 1 \n",
"705 0 0.926829 0.25 1 1 \n",
"... ... ... ... ... ... \n",
"2087 1 0.048780 0.45 0 1 \n",
"1889 1 0.048780 0.15 0 0 \n",
"4623 1 0.536585 0.20 0 0 \n",
"1591 1 1.000000 0.95 1 0 \n",
"1346 1 0.536585 0.35 0 0 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"1501 1 1 Male \n",
"2586 0 0 Female \n",
"2653 0 1 Female \n",
"1055 1 1 Male \n",
"705 1 1 Male \n",
"... ... ... ... \n",
"2087 1 0 Female \n",
"1889 0 0 Female \n",
"4623 0 0 Female \n",
"1591 1 0 Male \n",
"1346 0 0 Female \n",
"\n",
"[3500 rows x 8 columns]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Run the cleaning step on the training split and display the result (not assigned).\n",
"clean_data(train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bb1439e3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}