ium_487184/zad_02.ipynb at 166cc7407a88e55891984f85a6c27d027bc956cd

import numpy as np
import pandas as pd

# Download the dataset with the Kaggle command-line tool (IPython shell escape);
# this requires the `kaggle` executable to be available on PATH.
# NOTE(review): `-d` is given a CSV file name here, while the Kaggle CLI appears to
# expect an `<owner>/<dataset-name>` identifier - confirm the value.
!kaggle datasets download -d gender_classification_v7.csv

zsh:1: command not found: kaggle

# Load the dataset from a local CSV file into a DataFrame and display it.
# NOTE(review): the file name differs from the one used in the download
# command - confirm which file is expected on disk.
raw_data = pd.read_csv("gender_class.csv")
raw_data

	long_hair	forehead_width_cm	forehead_height_cm	nose_wide	nose_long	lips_thin	distance_nose_to_lip_long	gender
0	1	11.8	6.1	1	0	1	1	Male
1	0	14.0	5.4	0	0	1	0	Female
2	0	11.8	6.3	1	1	1	1	Male
3	0	14.4	6.1	0	1	1	1	Male
4	1	13.5	5.9	0	0	0	0	Female
...	...	...	...	...	...	...	...	...
4996	1	13.6	5.1	0	0	0	0	Female
4997	1	11.9	5.4	0	0	0	0	Female
4998	1	12.9	5.7	0	0	0	0	Female
4999	1	13.2	6.2	0	0	0	0	Female
5000	1	15.4	5.4	1	1	1	1	Male

5001 rows × 8 columns

Wyczyści zbiór z artefaktów (np. puste linie, przykłady z niepoprawnymi wartościami)

def clean_data(data):
    """Return a cleaned copy of *data* without artifact rows.

    A row is removed when it contains a missing value, or when any of its
    numeric values is negative or infinite.  The upper bound of numeric
    columns is deliberately not checked: measurement columns hold raw values
    (e.g. centimetres) that are far above 1.0 before normalization, so a
    0.0 - 1.0 range filter would discard the entire dataset.

    The input frame is left unmodified.

    Args:
        data: DataFrame to clean.

    Returns:
        A new DataFrame containing only the valid rows of *data*, with the
        original index labels preserved.
    """
    # Drop rows with missing values without touching the caller's object.
    cleaned = data.dropna()

    # Remove rows with invalid values; every numeric width is covered, not
    # only float64 / the platform default integer.
    numeric = cleaned.select_dtypes(include="number")
    if numeric.shape[1] == 0:
        return cleaned

    values = numeric.to_numpy(dtype=float)
    valid = (np.isfinite(values) & (values >= 0)).all(axis=1)
    return cleaned.loc[valid]

# Replace raw_data with the result of clean_data and display it.
raw_data = clean_data(raw_data)
raw_data

	long_hair	forehead_width_cm	forehead_height_cm	nose_wide	nose_long	lips_thin	distance_nose_to_lip_long	gender

Dokona normalizacji danych w zbiorze (np. normalizacja wartości float do zakresu 0.0 - 1.0)

def normalize_data(data):
    """Return a copy of *data* with float columns min-max scaled to 0.0 - 1.0.

    Only floating-point columns are rescaled; integer and non-numeric
    columns are copied through unchanged.  A float column whose values are
    all equal has no range to scale by, so it is mapped to 0.0 instead of
    being turned into NaN by a 0/0 division.

    The input frame is left unmodified.

    Args:
        data: DataFrame to normalize.

    Returns:
        A new DataFrame with the same shape, index and columns as *data*.
    """
    normalized = data.copy()

    # Normalize float values to the 0.0 - 1.0 range.
    for col in normalized.columns:
        if not pd.api.types.is_float_dtype(normalized[col]):
            continue

        low = normalized[col].min()
        span = normalized[col].max() - low
        if pd.notna(span) and span == 0:
            # Constant column: subtracting the minimum yields 0.0 while
            # keeping any missing entries missing.
            normalized[col] = normalized[col] - low
        else:
            normalized[col] = (normalized[col] - low) / span

    return normalized

# Run normalize_data on raw_data and keep the result as normalized_data.
normalized_data = normalize_data(raw_data)

# Display the normalized frame.
normalized_data

	long_hair	forehead_width_cm	forehead_height_cm	nose_wide	nose_long	lips_thin	distance_nose_to_lip_long	gender

Jeśli brak w zbiorze gotowego podziału na podzbiory train/dev/test, to dokona takiego podziału

# Shuffle the rows reproducibly, then cut the shuffled frame into
# 70% train / 15% dev / 15% test.  Positional slicing is used instead of
# np.split, which handles a DataFrame through the deprecated
# DataFrame.swapaxes path.
_shuffled = normalized_data.sample(frac=1, random_state=42)
_train_end = int(.7*len(normalized_data))
_dev_end = int(.85*len(normalized_data))
train = _shuffled.iloc[:_train_end]
dev = _shuffled.iloc[_train_end:_dev_end]
test = _shuffled.iloc[_dev_end:]

# Save each subset to its own CSV file (index column omitted).
train.to_csv('train.csv', index=False)
dev.to_csv('dev.csv', index=False)
test.to_csv('test.csv', index=False)

dev

	long_hair	forehead_width_cm	forehead_height_cm	nose_wide	nose_long	lips_thin	distance_nose_to_lip_long	gender

# Print summary statistics for the full dataset and for each subset, in order.
for frame in (raw_data, train, dev, test):
    summary = frame.describe()
    print(summary)

         long_hair  forehead_width_cm  forehead_height_cm    nose_wide  \
count  5001.000000        5001.000000         5001.000000  5001.000000   
mean      0.869626          13.181484            5.946311     0.493901   
std       0.336748           1.107128            0.541268     0.500013   
min       0.000000          11.400000            5.100000     0.000000   
25%       1.000000          12.200000            5.500000     0.000000   
50%       1.000000          13.100000            5.900000     0.000000   
75%       1.000000          14.000000            6.400000     1.000000   
max       1.000000          15.500000            7.100000     1.000000   

         nose_long    lips_thin  distance_nose_to_lip_long  
count  5001.000000  5001.000000                5001.000000  
mean      0.507898     0.493101                   0.498900  
std       0.499988     0.500002                   0.500049  
min       0.000000     0.000000                   0.000000  
25%       0.000000     0.000000                   0.000000  
50%       1.000000     0.000000                   0.000000  
75%       1.000000     1.000000                   1.000000  
max       1.000000     1.000000                   1.000000  
         long_hair  forehead_width_cm  forehead_height_cm    nose_wide  \
count  3500.000000        3500.000000         3500.000000  3500.000000   
mean      0.870000          13.187686            5.951800     0.505714   
std       0.336351           1.109019            0.542695     0.500039   
min       0.000000          11.400000            5.100000     0.000000   
25%       1.000000          12.200000            5.500000     0.000000   
50%       1.000000          13.100000            5.900000     1.000000   
75%       1.000000          14.000000            6.400000     1.000000   
max       1.000000          15.500000            7.100000     1.000000   

         nose_long    lips_thin  distance_nose_to_lip_long  
count  3500.000000  3500.000000                3500.000000  
mean      0.522000     0.499429                   0.507714  
std       0.499587     0.500071                   0.500012  
min       0.000000     0.000000                   0.000000  
25%       0.000000     0.000000                   0.000000  
50%       1.000000     0.000000                   1.000000  
75%       1.000000     1.000000                   1.000000  
max       1.000000     1.000000                   1.000000  
        long_hair  forehead_width_cm  forehead_height_cm   nose_wide  \
count  750.000000         750.000000          750.000000  750.000000   
mean     0.870667          13.119067            5.933867    0.472000   
std      0.335792           1.084345            0.538999    0.499549   
min      0.000000          11.400000            5.100000    0.000000   
25%      1.000000          12.200000            5.500000    0.000000   
50%      1.000000          13.100000            5.900000    0.000000   
75%      1.000000          14.000000            6.375000    1.000000   
max      1.000000          15.500000            7.100000    1.000000   

        nose_long   lips_thin  distance_nose_to_lip_long  
count  750.000000  750.000000                 750.000000  
mean     0.466667    0.481333                   0.465333  
std      0.499221    0.499985                   0.499130  
min      0.000000    0.000000                   0.000000  
25%      0.000000    0.000000                   0.000000  
50%      0.000000    0.000000                   0.000000  
75%      1.000000    1.000000                   1.000000  
max      1.000000    1.000000                   1.000000  
        long_hair  forehead_width_cm  forehead_height_cm   nose_wide  \
count  751.000000         751.000000          751.000000  751.000000   
mean     0.866844          13.214913            5.933156    0.460719   
std      0.339969           1.119877            0.537134    0.498787   
min      0.000000          11.400000            5.100000    0.000000   
25%      1.000000          12.200000            5.500000    0.000000   
50%      1.000000          13.200000            5.900000    0.000000   
75%      1.000000          14.100000            6.300000    1.000000   
max      1.000000          15.500000            7.100000    1.000000   

        nose_long   lips_thin  distance_nose_to_lip_long  
count  751.000000  751.000000                 751.000000  
mean     0.483356    0.475366                   0.491345  
std      0.500056    0.499726                   0.500258  
min      0.000000    0.000000                   0.000000  
25%      0.000000    0.000000                   0.000000  
50%      0.000000    0.000000                   0.000000  
75%      1.000000    1.000000                   1.000000  
max      1.000000    1.000000                   1.000000

# Display the result of running normalize_data on the training subset.
normalize_data(train)

	long_hair	forehead_width_cm	forehead_height_cm	nose_wide	nose_long	lips_thin	distance_nose_to_lip_long	gender
1501	1	0.439024	0.30	1	1	1	1	Male
2586	1	0.560976	0.45	0	0	0	0	Female
2653	0	0.365854	0.10	0	0	0	1	Female
1055	1	0.439024	0.40	1	1	1	1	Male
705	0	0.926829	0.25	1	1	1	1	Male
...	...	...	...	...	...	...	...	...
2087	1	0.048780	0.45	0	1	1	0	Female
1889	1	0.048780	0.15	0	0	0	0	Female
4623	1	0.536585	0.20	0	0	0	0	Female
1591	1	1.000000	0.95	1	0	1	0	Male
1346	1	0.536585	0.35	0	0	0	0	Female

3500 rows × 8 columns

# Display the result of running clean_data on the training subset.
clean_data(train)

	long_hair	forehead_width_cm	forehead_height_cm	nose_wide	nose_long	lips_thin	distance_nose_to_lip_long	gender
1501	1	0.439024	0.30	1	1	1	1	Male
2586	1	0.560976	0.45	0	0	0	0	Female
2653	0	0.365854	0.10	0	0	0	1	Female
1055	1	0.439024	0.40	1	1	1	1	Male
705	0	0.926829	0.25	1	1	1	1	Male
...	...	...	...	...	...	...	...	...
2087	1	0.048780	0.45	0	1	1	0	Female
1889	1	0.048780	0.15	0	0	0	0	Female
4623	1	0.536585	0.20	0	0	0	0	Female
1591	1	1.000000	0.95	1	0	1	0	Male
1346	1	0.536585	0.35	0	0	0	0	Female

3500 rows × 8 columns

34 KiB Raw Blame History Unescape Escape

34 KiB

Raw Blame History