{ "cells": [ { "cell_type": "code", "execution_count": 6, "id": "12dba44a", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 7, "id": "1d480e94", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "zsh:1: command not found: kaggle\r\n" ] } ], "source": [ "!kaggle datasets download -d gender_classification_v7.csv" ] }, { "cell_type": "code", "execution_count": 41, "id": "13a40d88", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
0111.86.11011Male
1014.05.40010Female
2011.86.31111Male
3014.46.10111Male
4113.55.90000Female
...........................
4996113.65.10000Female
4997111.95.40000Female
4998112.95.70000Female
4999113.26.20000Female
5000115.45.41111Male
\n", "

5001 rows × 8 columns

\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "0 1 11.8 6.1 1 0 \n", "1 0 14.0 5.4 0 0 \n", "2 0 11.8 6.3 1 1 \n", "3 0 14.4 6.1 0 1 \n", "4 1 13.5 5.9 0 0 \n", "... ... ... ... ... ... \n", "4996 1 13.6 5.1 0 0 \n", "4997 1 11.9 5.4 0 0 \n", "4998 1 12.9 5.7 0 0 \n", "4999 1 13.2 6.2 0 0 \n", "5000 1 15.4 5.4 1 1 \n", "\n", " lips_thin distance_nose_to_lip_long gender \n", "0 1 1 Male \n", "1 1 0 Female \n", "2 1 1 Male \n", "3 1 1 Male \n", "4 0 0 Female \n", "... ... ... ... \n", "4996 0 0 Female \n", "4997 0 0 Female \n", "4998 0 0 Female \n", "4999 0 0 Female \n", "5000 1 1 Male \n", "\n", "[5001 rows x 8 columns]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_data = pd.read_csv(\"gender_class.csv\")\n", "raw_data" ] }, { "cell_type": "markdown", "id": "51c05e9a", "metadata": {}, "source": [ "Wyczyści zbiór z artefaktów (np. puste linie, przykłady z niepoprawnymi wartościami)" ] }, { "cell_type": "code", "execution_count": 42, "id": "c70571df", "metadata": {}, "outputs": [], "source": [ "def clean_data(data):\n", " data.dropna(inplace=True)\n", " return data" ] }, { "cell_type": "code", "execution_count": 43, "id": "0481b0dd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
0111.86.11011Male
1014.05.40010Female
2011.86.31111Male
3014.46.10111Male
4113.55.90000Female
...........................
4996113.65.10000Female
4997111.95.40000Female
4998112.95.70000Female
4999113.26.20000Female
5000115.45.41111Male
\n", "

5001 rows × 8 columns

\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "0 1 11.8 6.1 1 0 \n", "1 0 14.0 5.4 0 0 \n", "2 0 11.8 6.3 1 1 \n", "3 0 14.4 6.1 0 1 \n", "4 1 13.5 5.9 0 0 \n", "... ... ... ... ... ... \n", "4996 1 13.6 5.1 0 0 \n", "4997 1 11.9 5.4 0 0 \n", "4998 1 12.9 5.7 0 0 \n", "4999 1 13.2 6.2 0 0 \n", "5000 1 15.4 5.4 1 1 \n", "\n", " lips_thin distance_nose_to_lip_long gender \n", "0 1 1 Male \n", "1 1 0 Female \n", "2 1 1 Male \n", "3 1 1 Male \n", "4 0 0 Female \n", "... ... ... ... \n", "4996 0 0 Female \n", "4997 0 0 Female \n", "4998 0 0 Female \n", "4999 0 0 Female \n", "5000 1 1 Male \n", "\n", "[5001 rows x 8 columns]" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_data = clean_data(raw_data)\n", "raw_data" ] }, { "cell_type": "markdown", "id": "717fab23", "metadata": {}, "source": [ "Dokona normalizacji danych w zbiorze (np. normalizacja wartości float do zakresu 0.0 - 1.0)" ] }, { "cell_type": "code", "execution_count": 44, "id": "7fcacf03", "metadata": {}, "outputs": [], "source": [ "def normalize_data(data):\n", " # znormalizuj wartości float do zakresu 0.0 - 1.0\n", " for col in data.columns:\n", " if data[col].dtype == float:\n", " data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())\n", "\n", " return data" ] }, { "cell_type": "code", "execution_count": 45, "id": "bfd844ad", "metadata": {}, "outputs": [], "source": [ "normalized_data = normalize_data(raw_data)" ] }, { "cell_type": "code", "execution_count": 46, "id": "2d0b8499", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
010.0975610.501011Male
100.6341460.150010Female
200.0975610.601111Male
300.7317070.500111Male
410.5121950.400000Female
...........................
499610.5365850.000000Female
499710.1219510.150000Female
499810.3658540.300000Female
499910.4390240.550000Female
500010.9756100.151111Male
\n", "

5001 rows × 8 columns

\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "0 1 0.097561 0.50 1 0 \n", "1 0 0.634146 0.15 0 0 \n", "2 0 0.097561 0.60 1 1 \n", "3 0 0.731707 0.50 0 1 \n", "4 1 0.512195 0.40 0 0 \n", "... ... ... ... ... ... \n", "4996 1 0.536585 0.00 0 0 \n", "4997 1 0.121951 0.15 0 0 \n", "4998 1 0.365854 0.30 0 0 \n", "4999 1 0.439024 0.55 0 0 \n", "5000 1 0.975610 0.15 1 1 \n", "\n", " lips_thin distance_nose_to_lip_long gender \n", "0 1 1 Male \n", "1 1 0 Female \n", "2 1 1 Male \n", "3 1 1 Male \n", "4 0 0 Female \n", "... ... ... ... \n", "4996 0 0 Female \n", "4997 0 0 Female \n", "4998 0 0 Female \n", "4999 0 0 Female \n", "5000 1 1 Male \n", "\n", "[5001 rows x 8 columns]" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "normalized_data" ] }, { "cell_type": "markdown", "id": "61fbcddc", "metadata": {}, "source": [ "2. Jeśli brak w zbiorze gotowego podziału na podzbiory train/dev/test, to dokona takiego podziału" ] }, { "cell_type": "code", "execution_count": 47, "id": "dc386189", "metadata": {}, "outputs": [], "source": [ "train, dev, test = np.split(normalized_data.sample(frac=1, random_state=42), [int(.7*len(normalized_data)), int(.85*len(normalized_data))])\n", "\n", "# zapisz dane w osobnych plikach csv\n", "train.to_csv('train.csv', index=False)\n", "dev.to_csv('dev.csv', index=False)\n", "test.to_csv('test.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 48, "id": "9f888962", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
443210.5121950.101111Male
216210.2439020.701111Male
239610.5121950.151000Female
476910.8536590.101101Male
227110.2926830.700100Female
...........................
84610.0975610.451111Male
255100.2439020.351111Male
292810.6341460.200000Female
11710.7073170.500000Female
64510.1951220.051000Female
\n", "

750 rows × 8 columns

\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "4432 1 0.512195 0.10 1 1 \n", "2162 1 0.243902 0.70 1 1 \n", "2396 1 0.512195 0.15 1 0 \n", "4769 1 0.853659 0.10 1 1 \n", "2271 1 0.292683 0.70 0 1 \n", "... ... ... ... ... ... \n", "846 1 0.097561 0.45 1 1 \n", "2551 0 0.243902 0.35 1 1 \n", "2928 1 0.634146 0.20 0 0 \n", "117 1 0.707317 0.50 0 0 \n", "645 1 0.195122 0.05 1 0 \n", "\n", " lips_thin distance_nose_to_lip_long gender \n", "4432 1 1 Male \n", "2162 1 1 Male \n", "2396 0 0 Female \n", "4769 0 1 Male \n", "2271 0 0 Female \n", "... ... ... ... \n", "846 1 1 Male \n", "2551 1 1 Male \n", "2928 0 0 Female \n", "117 0 0 Female \n", "645 0 0 Female \n", "\n", "[750 rows x 8 columns]" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dev" ] }, { "cell_type": "code", "execution_count": 49, "id": "4598cea1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " long_hair forehead_width_cm forehead_height_cm nose_wide \\\n", "count 5001.000000 5001.000000 5001.000000 5001.000000 \n", "mean 0.869626 0.434508 0.423155 0.493901 \n", "std 0.336748 0.270031 0.270634 0.500013 \n", "min 0.000000 0.000000 0.000000 0.000000 \n", "25% 1.000000 0.195122 0.200000 0.000000 \n", "50% 1.000000 0.414634 0.400000 0.000000 \n", "75% 1.000000 0.634146 0.650000 1.000000 \n", "max 1.000000 1.000000 1.000000 1.000000 \n", "\n", " nose_long lips_thin distance_nose_to_lip_long \n", "count 5001.000000 5001.000000 5001.000000 \n", "mean 0.507898 0.493101 0.498900 \n", "std 0.499988 0.500002 0.500049 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 \n", "50% 1.000000 0.000000 0.000000 \n", "75% 1.000000 1.000000 1.000000 \n", "max 1.000000 1.000000 1.000000 \n", " long_hair forehead_width_cm forehead_height_cm nose_wide \\\n", "count 3500.000000 3500.000000 3500.000000 3500.000000 \n", "mean 0.870000 0.436021 0.425900 0.505714 \n", "std 0.336351 0.270492 0.271348 0.500039 \n", "min 0.000000 0.000000 0.000000 0.000000 \n", "25% 1.000000 0.195122 0.200000 0.000000 \n", "50% 1.000000 0.414634 0.400000 1.000000 \n", "75% 1.000000 0.634146 0.650000 1.000000 \n", "max 1.000000 1.000000 1.000000 1.000000 \n", "\n", " nose_long lips_thin distance_nose_to_lip_long \n", "count 3500.000000 3500.000000 3500.000000 \n", "mean 0.522000 0.499429 0.507714 \n", "std 0.499587 0.500071 0.500012 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 \n", "50% 1.000000 0.000000 1.000000 \n", "75% 1.000000 1.000000 1.000000 \n", "max 1.000000 1.000000 1.000000 \n", " long_hair forehead_width_cm forehead_height_cm nose_wide \\\n", "count 750.000000 750.000000 750.000000 750.000000 \n", "mean 0.870667 0.419285 0.416933 0.472000 \n", "std 0.335792 0.264474 0.269500 0.499549 \n", "min 0.000000 0.000000 0.000000 0.000000 \n", "25% 1.000000 0.195122 0.200000 0.000000 \n", "50% 1.000000 0.414634 0.400000 0.000000 \n", "75% 1.000000 0.634146 0.637500 1.000000 \n", "max 1.000000 1.000000 1.000000 1.000000 \n", "\n", " nose_long lips_thin distance_nose_to_lip_long \n", "count 750.000000 750.000000 750.000000 \n", "mean 0.466667 0.481333 0.465333 \n", "std 0.499221 0.499985 0.499130 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 \n", "50% 0.000000 0.000000 0.000000 \n", "75% 1.000000 1.000000 1.000000 \n", "max 1.000000 1.000000 1.000000 \n", " long_hair forehead_width_cm forehead_height_cm nose_wide \\\n", "count 751.000000 751.000000 751.000000 751.000000 \n", "mean 0.866844 0.442662 0.416578 0.460719 \n", "std 0.339969 0.273141 0.268567 0.498787 \n", "min 0.000000 0.000000 0.000000 0.000000 \n", "25% 1.000000 0.195122 0.200000 0.000000 \n", "50% 1.000000 0.439024 0.400000 0.000000 \n", "75% 1.000000 0.658537 0.600000 1.000000 \n", "max 1.000000 1.000000 1.000000 1.000000 \n", "\n", " nose_long lips_thin distance_nose_to_lip_long \n", "count 751.000000 751.000000 751.000000 \n", "mean 0.483356 0.475366 0.491345 \n", "std 0.500056 0.499726 0.500258 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 \n", "50% 0.000000 0.000000 0.000000 \n", "75% 1.000000 1.000000 1.000000 \n", "max 1.000000 1.000000 1.000000 \n" ] } ], "source": [ "for d in [raw_data,train, dev, test]:\n", " print( d.describe())" ] }, { "cell_type": "code", "execution_count": null, "id": "8fa84a56", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 5 }