{ "cells": [ { "cell_type": "code", "execution_count": 6, "id": "f5229180", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 7, "id": "2d3b5bee", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "zsh:1: command not found: kaggle\r\n" ] } ], "source": [ "!kaggle datasets download -d gender_classification_v7.csv" ] }, { "cell_type": "code", "execution_count": 34, "id": "fbbeb52d", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
0111.86.11011Male
1014.05.40010Female
2011.86.31111Male
3014.46.10111Male
4113.55.90000Female
...........................
4996113.65.10000Female
4997111.95.40000Female
4998112.95.70000Female
4999113.26.20000Female
5000115.45.41111Male
\n", "

5001 rows × 8 columns

\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "0 1 11.8 6.1 1 0 \n", "1 0 14.0 5.4 0 0 \n", "2 0 11.8 6.3 1 1 \n", "3 0 14.4 6.1 0 1 \n", "4 1 13.5 5.9 0 0 \n", "... ... ... ... ... ... \n", "4996 1 13.6 5.1 0 0 \n", "4997 1 11.9 5.4 0 0 \n", "4998 1 12.9 5.7 0 0 \n", "4999 1 13.2 6.2 0 0 \n", "5000 1 15.4 5.4 1 1 \n", "\n", " lips_thin distance_nose_to_lip_long gender \n", "0 1 1 Male \n", "1 1 0 Female \n", "2 1 1 Male \n", "3 1 1 Male \n", "4 0 0 Female \n", "... ... ... ... \n", "4996 0 0 Female \n", "4997 0 0 Female \n", "4998 0 0 Female \n", "4999 0 0 Female \n", "5000 1 1 Male \n", "\n", "[5001 rows x 8 columns]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_data = pd.read_csv(\"gender_class.csv\")\n", "raw_data" ] }, { "cell_type": "markdown", "id": "1f9629f1", "metadata": {}, "source": [ "Wyczyści zbiór z artefaktów (np. puste linie, przykłady z niepoprawnymi wartościami)" ] }, { "cell_type": "code", "execution_count": 37, "id": "ad18b250", "metadata": {}, "outputs": [], "source": [ "def clean_data(data):\n", " data.dropna(inplace=True)\n", "\n", " # usuń wiersze z niepoprawnymi wartościami\n", " for col in data.columns:\n", " if data[col].dtype == float:\n", " data = data[(data[col] >= 0.0) & (data[col] <= 1.0)]\n", " elif data[col].dtype == int:\n", " data = data[(data[col] >= 0)]\n", "\n", " return data" ] }, { "cell_type": "code", "execution_count": 38, "id": "8154dfd8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [long_hair, forehead_width_cm, forehead_height_cm, nose_wide, nose_long, lips_thin, distance_nose_to_lip_long, gender]\n", "Index: []" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_data = clean_data(raw_data)\n", "raw_data" ] }, { "cell_type": "markdown", "id": "bd27b530", "metadata": {}, "source": [ "Dokona normalizacji danych w zbiorze (np. normalizacja wartości float do zakresu 0.0 - 1.0)" ] }, { "cell_type": "code", "execution_count": 19, "id": "b81c3005", "metadata": {}, "outputs": [], "source": [ "def normalize_data(data):\n", " # znormalizuj wartości float do zakresu 0.0 - 1.0\n", " for col in data.columns:\n", " if data[col].dtype == float:\n", " data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())\n", "\n", " return data" ] }, { "cell_type": "code", "execution_count": 27, "id": "611929ca", "metadata": {}, "outputs": [], "source": [ "normalized_data = normalize_data(raw_data)" ] }, { "cell_type": "code", "execution_count": 31, "id": "64724998", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [long_hair, forehead_width_cm, forehead_height_cm, nose_wide, nose_long, lips_thin, distance_nose_to_lip_long, gender]\n", "Index: []" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "normalized_data" ] }, { "cell_type": "markdown", "id": "7013b00e", "metadata": {}, "source": [ "2. Jeśli brak w zbiorze gotowego podziału na podzbiory train/dev/test, to dokona takiego podziału" ] }, { "cell_type": "code", "execution_count": 28, "id": "9eb24b71", "metadata": {}, "outputs": [], "source": [ "train, dev, test = np.split(normalized_data.sample(frac=1, random_state=42), [int(.7*len(normalized_data)), int(.85*len(normalized_data))])\n", "\n", "# zapisz dane w osobnych plikach csv\n", "train.to_csv('train.csv', index=False)\n", "dev.to_csv('dev.csv', index=False)\n", "test.to_csv('test.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 30, "id": "81d1cd62", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [long_hair, forehead_width_cm, forehead_height_cm, nose_wide, nose_long, lips_thin, distance_nose_to_lip_long, gender]\n", "Index: []" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dev" ] }, { "cell_type": "code", "execution_count": 18, "id": "851d9aa0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " long_hair forehead_width_cm forehead_height_cm nose_wide \\\n", "count 5001.000000 5001.000000 5001.000000 5001.000000 \n", "mean 0.869626 13.181484 5.946311 0.493901 \n", "std 0.336748 1.107128 0.541268 0.500013 \n", "min 0.000000 11.400000 5.100000 0.000000 \n", "25% 1.000000 12.200000 5.500000 0.000000 \n", "50% 1.000000 13.100000 5.900000 0.000000 \n", "75% 1.000000 14.000000 6.400000 1.000000 \n", "max 1.000000 15.500000 7.100000 1.000000 \n", "\n", " nose_long lips_thin distance_nose_to_lip_long \n", "count 5001.000000 5001.000000 5001.000000 \n", "mean 0.507898 0.493101 0.498900 \n", "std 0.499988 0.500002 0.500049 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 \n", "50% 1.000000 0.000000 0.000000 \n", "75% 1.000000 1.000000 1.000000 \n", "max 1.000000 1.000000 1.000000 \n", " long_hair forehead_width_cm forehead_height_cm nose_wide \\\n", "count 3500.000000 3500.000000 3500.000000 3500.000000 \n", "mean 0.870000 13.187686 5.951800 0.505714 \n", "std 0.336351 1.109019 0.542695 0.500039 \n", "min 0.000000 11.400000 5.100000 0.000000 \n", "25% 1.000000 12.200000 5.500000 0.000000 \n", "50% 1.000000 13.100000 5.900000 1.000000 \n", "75% 1.000000 14.000000 6.400000 1.000000 \n", "max 1.000000 15.500000 7.100000 1.000000 \n", "\n", " nose_long lips_thin distance_nose_to_lip_long \n", "count 3500.000000 3500.000000 3500.000000 \n", "mean 0.522000 0.499429 0.507714 \n", "std 0.499587 0.500071 0.500012 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 \n", "50% 1.000000 0.000000 
# Print summary statistics for the full data set, then for each split,
# in this order: raw_data, train, dev, test.
for d in (raw_data, train, dev, test):
    print(d.describe())
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
150110.4390240.301111Male
258610.5609760.450000Female
265300.3658540.100001Female
105510.4390240.401111Male
70500.9268290.251111Male
...........................
208710.0487800.450110Female
188910.0487800.150000Female
462310.5365850.200000Female
159111.0000000.951010Male
134610.5365850.350000Female
\n", "

3500 rows × 8 columns

\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "1501 1 0.439024 0.30 1 1 \n", "2586 1 0.560976 0.45 0 0 \n", "2653 0 0.365854 0.10 0 0 \n", "1055 1 0.439024 0.40 1 1 \n", "705 0 0.926829 0.25 1 1 \n", "... ... ... ... ... ... \n", "2087 1 0.048780 0.45 0 1 \n", "1889 1 0.048780 0.15 0 0 \n", "4623 1 0.536585 0.20 0 0 \n", "1591 1 1.000000 0.95 1 0 \n", "1346 1 0.536585 0.35 0 0 \n", "\n", " lips_thin distance_nose_to_lip_long gender \n", "1501 1 1 Male \n", "2586 0 0 Female \n", "2653 0 1 Female \n", "1055 1 1 Male \n", "705 1 1 Male \n", "... ... ... ... \n", "2087 1 0 Female \n", "1889 0 0 Female \n", "4623 0 0 Female \n", "1591 1 0 Male \n", "1346 0 0 Female \n", "\n", "[3500 rows x 8 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "normalize_data(train)" ] }, { "cell_type": "code", "execution_count": 23, "id": "2653e41d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
150110.4390240.301111Male
258610.5609760.450000Female
265300.3658540.100001Female
105510.4390240.401111Male
70500.9268290.251111Male
...........................
208710.0487800.450110Female
188910.0487800.150000Female
462310.5365850.200000Female
159111.0000000.951010Male
134610.5365850.350000Female
\n", "

3500 rows × 8 columns

\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "1501 1 0.439024 0.30 1 1 \n", "2586 1 0.560976 0.45 0 0 \n", "2653 0 0.365854 0.10 0 0 \n", "1055 1 0.439024 0.40 1 1 \n", "705 0 0.926829 0.25 1 1 \n", "... ... ... ... ... ... \n", "2087 1 0.048780 0.45 0 1 \n", "1889 1 0.048780 0.15 0 0 \n", "4623 1 0.536585 0.20 0 0 \n", "1591 1 1.000000 0.95 1 0 \n", "1346 1 0.536585 0.35 0 0 \n", "\n", " lips_thin distance_nose_to_lip_long gender \n", "1501 1 1 Male \n", "2586 0 0 Female \n", "2653 0 1 Female \n", "1055 1 1 Male \n", "705 1 1 Male \n", "... ... ... ... \n", "2087 1 0 Female \n", "1889 0 0 Female \n", "4623 0 0 Female \n", "1591 1 0 Male \n", "1346 0 0 Female \n", "\n", "[3500 rows x 8 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_data(train)" ] }, { "cell_type": "code", "execution_count": null, "id": "bb1439e3", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 5 }