{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "f5229180",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "2d3b5bee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"zsh:1: command not found: kaggle\r\n"
]
}
],
"source": [
"!kaggle datasets download -d gender_classification_v7.csv"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "fbbeb52d",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" long_hair | \n",
" forehead_width_cm | \n",
" forehead_height_cm | \n",
" nose_wide | \n",
" nose_long | \n",
" lips_thin | \n",
" distance_nose_to_lip_long | \n",
" gender | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 11.8 | \n",
" 6.1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" Male | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 14.0 | \n",
" 5.4 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 11.8 | \n",
" 6.3 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Male | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 14.4 | \n",
" 6.1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Male | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 13.5 | \n",
" 5.9 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 4996 | \n",
" 1 | \n",
" 13.6 | \n",
" 5.1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 4997 | \n",
" 1 | \n",
" 11.9 | \n",
" 5.4 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 4998 | \n",
" 1 | \n",
" 12.9 | \n",
" 5.7 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 4999 | \n",
" 1 | \n",
" 13.2 | \n",
" 6.2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 5000 | \n",
" 1 | \n",
" 15.4 | \n",
" 5.4 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Male | \n",
"
\n",
" \n",
"
\n",
"
5001 rows × 8 columns
\n",
"
"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 11.8 6.1 1 0 \n",
"1 0 14.0 5.4 0 0 \n",
"2 0 11.8 6.3 1 1 \n",
"3 0 14.4 6.1 0 1 \n",
"4 1 13.5 5.9 0 0 \n",
"... ... ... ... ... ... \n",
"4996 1 13.6 5.1 0 0 \n",
"4997 1 11.9 5.4 0 0 \n",
"4998 1 12.9 5.7 0 0 \n",
"4999 1 13.2 6.2 0 0 \n",
"5000 1 15.4 5.4 1 1 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"0 1 1 Male \n",
"1 1 0 Female \n",
"2 1 1 Male \n",
"3 1 1 Male \n",
"4 0 0 Female \n",
"... ... ... ... \n",
"4996 0 0 Female \n",
"4997 0 0 Female \n",
"4998 0 0 Female \n",
"4999 0 0 Female \n",
"5000 1 1 Male \n",
"\n",
"[5001 rows x 8 columns]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_data = pd.read_csv(\"gender_class.csv\")\n",
"raw_data"
]
},
{
"cell_type": "markdown",
"id": "1f9629f1",
"metadata": {},
"source": [
"Wyczyści zbiór z artefaktów (np. puste linie, przykłady z niepoprawnymi wartościami)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "ad18b250",
"metadata": {},
"outputs": [],
"source": [
"def clean_data(data):\n",
" data.dropna(inplace=True)\n",
"\n",
" # usuń wiersze z niepoprawnymi wartościami\n",
" for col in data.columns:\n",
" if data[col].dtype == float:\n",
" data = data[(data[col] >= 0.0) & (data[col] <= 1.0)]\n",
" elif data[col].dtype == int:\n",
" data = data[(data[col] >= 0)]\n",
"\n",
" return data"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "8154dfd8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" long_hair | \n",
" forehead_width_cm | \n",
" forehead_height_cm | \n",
" nose_wide | \n",
" nose_long | \n",
" lips_thin | \n",
" distance_nose_to_lip_long | \n",
" gender | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [long_hair, forehead_width_cm, forehead_height_cm, nose_wide, nose_long, lips_thin, distance_nose_to_lip_long, gender]\n",
"Index: []"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_data = clean_data(raw_data)\n",
"raw_data"
]
},
{
"cell_type": "markdown",
"id": "bd27b530",
"metadata": {},
"source": [
"Dokona normalizacji danych w zbiorze (np. normalizacja wartości float do zakresu 0.0 - 1.0)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b81c3005",
"metadata": {},
"outputs": [],
"source": [
"def normalize_data(data):\n",
" # znormalizuj wartości float do zakresu 0.0 - 1.0\n",
" for col in data.columns:\n",
" if data[col].dtype == float:\n",
" data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())\n",
"\n",
" return data"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "611929ca",
"metadata": {},
"outputs": [],
"source": [
"normalized_data = normalize_data(raw_data)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "64724998",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" long_hair | \n",
" forehead_width_cm | \n",
" forehead_height_cm | \n",
" nose_wide | \n",
" nose_long | \n",
" lips_thin | \n",
" distance_nose_to_lip_long | \n",
" gender | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [long_hair, forehead_width_cm, forehead_height_cm, nose_wide, nose_long, lips_thin, distance_nose_to_lip_long, gender]\n",
"Index: []"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"normalized_data"
]
},
{
"cell_type": "markdown",
"id": "7013b00e",
"metadata": {},
"source": [
"2. Jeśli brak w zbiorze gotowego podziału na podzbiory train/dev/test, to dokona takiego podziału"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "9eb24b71",
"metadata": {},
"outputs": [],
"source": [
"train, dev, test = np.split(normalized_data.sample(frac=1, random_state=42), [int(.7*len(normalized_data)), int(.85*len(normalized_data))])\n",
"\n",
"# zapisz dane w osobnych plikach csv\n",
"train.to_csv('train.csv', index=False)\n",
"dev.to_csv('dev.csv', index=False)\n",
"test.to_csv('test.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "81d1cd62",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" long_hair | \n",
" forehead_width_cm | \n",
" forehead_height_cm | \n",
" nose_wide | \n",
" nose_long | \n",
" lips_thin | \n",
" distance_nose_to_lip_long | \n",
" gender | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [long_hair, forehead_width_cm, forehead_height_cm, nose_wide, nose_long, lips_thin, distance_nose_to_lip_long, gender]\n",
"Index: []"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dev"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "851d9aa0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 5001.000000 5001.000000 5001.000000 5001.000000 \n",
"mean 0.869626 13.181484 5.946311 0.493901 \n",
"std 0.336748 1.107128 0.541268 0.500013 \n",
"min 0.000000 11.400000 5.100000 0.000000 \n",
"25% 1.000000 12.200000 5.500000 0.000000 \n",
"50% 1.000000 13.100000 5.900000 0.000000 \n",
"75% 1.000000 14.000000 6.400000 1.000000 \n",
"max 1.000000 15.500000 7.100000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 5001.000000 5001.000000 5001.000000 \n",
"mean 0.507898 0.493101 0.498900 \n",
"std 0.499988 0.500002 0.500049 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 1.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 3500.000000 3500.000000 3500.000000 3500.000000 \n",
"mean 0.870000 13.187686 5.951800 0.505714 \n",
"std 0.336351 1.109019 0.542695 0.500039 \n",
"min 0.000000 11.400000 5.100000 0.000000 \n",
"25% 1.000000 12.200000 5.500000 0.000000 \n",
"50% 1.000000 13.100000 5.900000 1.000000 \n",
"75% 1.000000 14.000000 6.400000 1.000000 \n",
"max 1.000000 15.500000 7.100000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 3500.000000 3500.000000 3500.000000 \n",
"mean 0.522000 0.499429 0.507714 \n",
"std 0.499587 0.500071 0.500012 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 1.000000 0.000000 1.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 750.000000 750.000000 750.000000 750.000000 \n",
"mean 0.870667 13.119067 5.933867 0.472000 \n",
"std 0.335792 1.084345 0.538999 0.499549 \n",
"min 0.000000 11.400000 5.100000 0.000000 \n",
"25% 1.000000 12.200000 5.500000 0.000000 \n",
"50% 1.000000 13.100000 5.900000 0.000000 \n",
"75% 1.000000 14.000000 6.375000 1.000000 \n",
"max 1.000000 15.500000 7.100000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 750.000000 750.000000 750.000000 \n",
"mean 0.466667 0.481333 0.465333 \n",
"std 0.499221 0.499985 0.499130 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n",
" long_hair forehead_width_cm forehead_height_cm nose_wide \\\n",
"count 751.000000 751.000000 751.000000 751.000000 \n",
"mean 0.866844 13.214913 5.933156 0.460719 \n",
"std 0.339969 1.119877 0.537134 0.498787 \n",
"min 0.000000 11.400000 5.100000 0.000000 \n",
"25% 1.000000 12.200000 5.500000 0.000000 \n",
"50% 1.000000 13.200000 5.900000 0.000000 \n",
"75% 1.000000 14.100000 6.300000 1.000000 \n",
"max 1.000000 15.500000 7.100000 1.000000 \n",
"\n",
" nose_long lips_thin distance_nose_to_lip_long \n",
"count 751.000000 751.000000 751.000000 \n",
"mean 0.483356 0.475366 0.491345 \n",
"std 0.500056 0.499726 0.500258 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 \n"
]
}
],
"source": [
"for d in [raw_data,train, dev, test]:\n",
" print( d.describe())"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "f52a79aa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" long_hair | \n",
" forehead_width_cm | \n",
" forehead_height_cm | \n",
" nose_wide | \n",
" nose_long | \n",
" lips_thin | \n",
" distance_nose_to_lip_long | \n",
" gender | \n",
"
\n",
" \n",
" \n",
" \n",
" 1501 | \n",
" 1 | \n",
" 0.439024 | \n",
" 0.30 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Male | \n",
"
\n",
" \n",
" 2586 | \n",
" 1 | \n",
" 0.560976 | \n",
" 0.45 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 2653 | \n",
" 0 | \n",
" 0.365854 | \n",
" 0.10 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" Female | \n",
"
\n",
" \n",
" 1055 | \n",
" 1 | \n",
" 0.439024 | \n",
" 0.40 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Male | \n",
"
\n",
" \n",
" 705 | \n",
" 0 | \n",
" 0.926829 | \n",
" 0.25 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Male | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2087 | \n",
" 1 | \n",
" 0.048780 | \n",
" 0.45 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 1889 | \n",
" 1 | \n",
" 0.048780 | \n",
" 0.15 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 4623 | \n",
" 1 | \n",
" 0.536585 | \n",
" 0.20 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 1591 | \n",
" 1 | \n",
" 1.000000 | \n",
" 0.95 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" Male | \n",
"
\n",
" \n",
" 1346 | \n",
" 1 | \n",
" 0.536585 | \n",
" 0.35 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
"
\n",
"
3500 rows × 8 columns
\n",
"
"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"1501 1 0.439024 0.30 1 1 \n",
"2586 1 0.560976 0.45 0 0 \n",
"2653 0 0.365854 0.10 0 0 \n",
"1055 1 0.439024 0.40 1 1 \n",
"705 0 0.926829 0.25 1 1 \n",
"... ... ... ... ... ... \n",
"2087 1 0.048780 0.45 0 1 \n",
"1889 1 0.048780 0.15 0 0 \n",
"4623 1 0.536585 0.20 0 0 \n",
"1591 1 1.000000 0.95 1 0 \n",
"1346 1 0.536585 0.35 0 0 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"1501 1 1 Male \n",
"2586 0 0 Female \n",
"2653 0 1 Female \n",
"1055 1 1 Male \n",
"705 1 1 Male \n",
"... ... ... ... \n",
"2087 1 0 Female \n",
"1889 0 0 Female \n",
"4623 0 0 Female \n",
"1591 1 0 Male \n",
"1346 0 0 Female \n",
"\n",
"[3500 rows x 8 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"normalize_data(train)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "2653e41d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" long_hair | \n",
" forehead_width_cm | \n",
" forehead_height_cm | \n",
" nose_wide | \n",
" nose_long | \n",
" lips_thin | \n",
" distance_nose_to_lip_long | \n",
" gender | \n",
"
\n",
" \n",
" \n",
" \n",
" 1501 | \n",
" 1 | \n",
" 0.439024 | \n",
" 0.30 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Male | \n",
"
\n",
" \n",
" 2586 | \n",
" 1 | \n",
" 0.560976 | \n",
" 0.45 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 2653 | \n",
" 0 | \n",
" 0.365854 | \n",
" 0.10 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" Female | \n",
"
\n",
" \n",
" 1055 | \n",
" 1 | \n",
" 0.439024 | \n",
" 0.40 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Male | \n",
"
\n",
" \n",
" 705 | \n",
" 0 | \n",
" 0.926829 | \n",
" 0.25 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Male | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2087 | \n",
" 1 | \n",
" 0.048780 | \n",
" 0.45 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 1889 | \n",
" 1 | \n",
" 0.048780 | \n",
" 0.15 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 4623 | \n",
" 1 | \n",
" 0.536585 | \n",
" 0.20 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
" 1591 | \n",
" 1 | \n",
" 1.000000 | \n",
" 0.95 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" Male | \n",
"
\n",
" \n",
" 1346 | \n",
" 1 | \n",
" 0.536585 | \n",
" 0.35 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" Female | \n",
"
\n",
" \n",
"
\n",
"
3500 rows × 8 columns
\n",
"
"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"1501 1 0.439024 0.30 1 1 \n",
"2586 1 0.560976 0.45 0 0 \n",
"2653 0 0.365854 0.10 0 0 \n",
"1055 1 0.439024 0.40 1 1 \n",
"705 0 0.926829 0.25 1 1 \n",
"... ... ... ... ... ... \n",
"2087 1 0.048780 0.45 0 1 \n",
"1889 1 0.048780 0.15 0 0 \n",
"4623 1 0.536585 0.20 0 0 \n",
"1591 1 1.000000 0.95 1 0 \n",
"1346 1 0.536585 0.35 0 0 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"1501 1 1 Male \n",
"2586 0 0 Female \n",
"2653 0 1 Female \n",
"1055 1 1 Male \n",
"705 1 1 Male \n",
"... ... ... ... \n",
"2087 1 0 Female \n",
"1889 0 0 Female \n",
"4623 0 0 Female \n",
"1591 1 0 Male \n",
"1346 0 0 Female \n",
"\n",
"[3500 rows x 8 columns]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_data(train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bb1439e3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}