ium_487184/zad_02.ipynb

34 KiB
Raw Blame History

import numpy as np
import pandas as pd
# Download the dataset with the Kaggle command-line tool (notebook shell escape).
# The `kaggle` executable must be installed and on PATH for this cell to work.
# NOTE(review): `-d` normally takes an "<owner>/<dataset>" slug rather than a
# CSV file name — confirm the intended dataset identifier.
!kaggle datasets download -d gender_classification_v7.csv
zsh:1: command not found: kaggle
# Load the data set from a local CSV file into a DataFrame and display it.
# NOTE(review): the file name read here differs from the one in the download
# command earlier in the notebook — confirm which file is expected on disk.
raw_data = pd.read_csv("gender_class.csv")
raw_data
long_hair forehead_width_cm forehead_height_cm nose_wide nose_long lips_thin distance_nose_to_lip_long gender
0 1 11.8 6.1 1 0 1 1 Male
1 0 14.0 5.4 0 0 1 0 Female
2 0 11.8 6.3 1 1 1 1 Male
3 0 14.4 6.1 0 1 1 1 Male
4 1 13.5 5.9 0 0 0 0 Female
... ... ... ... ... ... ... ... ...
4996 1 13.6 5.1 0 0 0 0 Female
4997 1 11.9 5.4 0 0 0 0 Female
4998 1 12.9 5.7 0 0 0 0 Female
4999 1 13.2 6.2 0 0 0 0 Female
5000 1 15.4 5.4 1 1 1 1 Male

5001 rows × 8 columns

Wyczyści zbiór z artefaktów (np. puste linie, przykłady z niepoprawnymi wartościami)

def clean_data(data):
    """Return ``data`` without rows that have missing or negative numbers.

    A row is removed when any of its cells is missing (NaN/None) or when any
    of its numeric columns holds a negative value. Non-numeric columns are
    only checked for missing values.

    No upper bound is enforced: raw measurements (for example widths in
    centimetres) are legitimately greater than 1.0 before normalization, so
    restricting floats to the 0.0 - 1.0 range here would discard valid rows.

    Args:
        data: pandas DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame containing only the valid rows, with the original
        index labels of those rows preserved.
    """
    # dropna() without inplace=True leaves the caller's frame untouched.
    cleaned = data.dropna()

    # remove rows with invalid values: any negative entry in a numeric column
    numeric = cleaned.select_dtypes(include="number")
    non_negative = (numeric >= 0).all(axis=1)

    return cleaned[non_negative]
# Replace raw_data with the frame returned by clean_data and display it.
raw_data = clean_data(raw_data)
raw_data
long_hair forehead_width_cm forehead_height_cm nose_wide nose_long lips_thin distance_nose_to_lip_long gender

Dokona normalizacji danych w zbiorze (np. normalizacja wartości float do zakresu 0.0 - 1.0)

def normalize_data(data):
    """Min-max scale every float column of ``data`` to the range 0.0 - 1.0.

    Each float column is rescaled as ``(x - min) / (max - min)`` using that
    column's own minimum and maximum. Columns of any other dtype are left
    unchanged. A float column whose values are all equal has a zero range;
    it is set to 0.0 instead of being divided by zero (which would turn the
    whole column into NaN).

    Args:
        data: pandas DataFrame to normalize. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # normalize float values to the range 0.0 - 1.0
    for col in data.columns:
        if data[col].dtype != float:
            continue

        lowest = data[col].min()
        spread = data[col].max() - lowest

        if spread == 0:
            # Constant column: there is nothing to scale, avoid 0 / 0.
            data[col] = 0.0
        else:
            data[col] = (data[col] - lowest) / spread

    return data
# Normalize the float columns and display the result. normalize_data assigns
# the scaled columns on the frame it receives and returns that same frame, so
# normalized_data and raw_data refer to the same DataFrame after this call.
normalized_data = normalize_data(raw_data)
normalized_data
long_hair forehead_width_cm forehead_height_cm nose_wide nose_long lips_thin distance_nose_to_lip_long gender
  1. Jeśli brak w zbiorze gotowego podziału na podzbiory train/dev/test, to dokona takiego podziału
# Shuffle all rows once with a fixed seed so the split is reproducible.
shuffled = normalized_data.sample(frac=1, random_state=42)

# Cut points: first 70% -> train, next 15% -> dev, remainder -> test.
train_end = int(.7 * len(shuffled))
dev_end = int(.85 * len(shuffled))

# Positional slicing is used instead of np.split, which on a DataFrame goes
# through DataFrame.swapaxes (deprecated in pandas 2.1). The copies make each
# subset an independent frame, so later in-place edits of one subset do not
# touch the others.
train = shuffled.iloc[:train_end].copy()
dev = shuffled.iloc[train_end:dev_end].copy()
test = shuffled.iloc[dev_end:].copy()

# save each subset to its own CSV file (without the index column)
train.to_csv('train.csv', index=False)
dev.to_csv('dev.csv', index=False)
test.to_csv('test.csv', index=False)
# Display the dev subset.
dev
long_hair forehead_width_cm forehead_height_cm nose_wide nose_long lips_thin distance_nose_to_lip_long gender
# Print summary statistics of the numeric columns for the full data set and
# for each of the three subsets, in that order.
for subset in (raw_data, train, dev, test):
    summary = subset.describe()
    print(summary)
         long_hair  forehead_width_cm  forehead_height_cm    nose_wide  \
count  5001.000000        5001.000000         5001.000000  5001.000000   
mean      0.869626          13.181484            5.946311     0.493901   
std       0.336748           1.107128            0.541268     0.500013   
min       0.000000          11.400000            5.100000     0.000000   
25%       1.000000          12.200000            5.500000     0.000000   
50%       1.000000          13.100000            5.900000     0.000000   
75%       1.000000          14.000000            6.400000     1.000000   
max       1.000000          15.500000            7.100000     1.000000   

         nose_long    lips_thin  distance_nose_to_lip_long  
count  5001.000000  5001.000000                5001.000000  
mean      0.507898     0.493101                   0.498900  
std       0.499988     0.500002                   0.500049  
min       0.000000     0.000000                   0.000000  
25%       0.000000     0.000000                   0.000000  
50%       1.000000     0.000000                   0.000000  
75%       1.000000     1.000000                   1.000000  
max       1.000000     1.000000                   1.000000  
         long_hair  forehead_width_cm  forehead_height_cm    nose_wide  \
count  3500.000000        3500.000000         3500.000000  3500.000000   
mean      0.870000          13.187686            5.951800     0.505714   
std       0.336351           1.109019            0.542695     0.500039   
min       0.000000          11.400000            5.100000     0.000000   
25%       1.000000          12.200000            5.500000     0.000000   
50%       1.000000          13.100000            5.900000     1.000000   
75%       1.000000          14.000000            6.400000     1.000000   
max       1.000000          15.500000            7.100000     1.000000   

         nose_long    lips_thin  distance_nose_to_lip_long  
count  3500.000000  3500.000000                3500.000000  
mean      0.522000     0.499429                   0.507714  
std       0.499587     0.500071                   0.500012  
min       0.000000     0.000000                   0.000000  
25%       0.000000     0.000000                   0.000000  
50%       1.000000     0.000000                   1.000000  
75%       1.000000     1.000000                   1.000000  
max       1.000000     1.000000                   1.000000  
        long_hair  forehead_width_cm  forehead_height_cm   nose_wide  \
count  750.000000         750.000000          750.000000  750.000000   
mean     0.870667          13.119067            5.933867    0.472000   
std      0.335792           1.084345            0.538999    0.499549   
min      0.000000          11.400000            5.100000    0.000000   
25%      1.000000          12.200000            5.500000    0.000000   
50%      1.000000          13.100000            5.900000    0.000000   
75%      1.000000          14.000000            6.375000    1.000000   
max      1.000000          15.500000            7.100000    1.000000   

        nose_long   lips_thin  distance_nose_to_lip_long  
count  750.000000  750.000000                 750.000000  
mean     0.466667    0.481333                   0.465333  
std      0.499221    0.499985                   0.499130  
min      0.000000    0.000000                   0.000000  
25%      0.000000    0.000000                   0.000000  
50%      0.000000    0.000000                   0.000000  
75%      1.000000    1.000000                   1.000000  
max      1.000000    1.000000                   1.000000  
        long_hair  forehead_width_cm  forehead_height_cm   nose_wide  \
count  751.000000         751.000000          751.000000  751.000000   
mean     0.866844          13.214913            5.933156    0.460719   
std      0.339969           1.119877            0.537134    0.498787   
min      0.000000          11.400000            5.100000    0.000000   
25%      1.000000          12.200000            5.500000    0.000000   
50%      1.000000          13.200000            5.900000    0.000000   
75%      1.000000          14.100000            6.300000    1.000000   
max      1.000000          15.500000            7.100000    1.000000   

        nose_long   lips_thin  distance_nose_to_lip_long  
count  751.000000  751.000000                 751.000000  
mean     0.483356    0.475366                   0.491345  
std      0.500056    0.499726                   0.500258  
min      0.000000    0.000000                   0.000000  
25%      0.000000    0.000000                   0.000000  
50%      0.000000    0.000000                   0.000000  
75%      1.000000    1.000000                   1.000000  
max      1.000000    1.000000                   1.000000  
# Normalize the float columns of the training subset and display the result.
# The return value is not stored, but normalize_data also writes the scaled
# columns onto `train` itself.
normalize_data(train)
long_hair forehead_width_cm forehead_height_cm nose_wide nose_long lips_thin distance_nose_to_lip_long gender
1501 1 0.439024 0.30 1 1 1 1 Male
2586 1 0.560976 0.45 0 0 0 0 Female
2653 0 0.365854 0.10 0 0 0 1 Female
1055 1 0.439024 0.40 1 1 1 1 Male
705 0 0.926829 0.25 1 1 1 1 Male
... ... ... ... ... ... ... ... ...
2087 1 0.048780 0.45 0 1 1 0 Female
1889 1 0.048780 0.15 0 0 0 0 Female
4623 1 0.536585 0.20 0 0 0 0 Female
1591 1 1.000000 0.95 1 0 1 0 Male
1346 1 0.536585 0.35 0 0 0 0 Female

3500 rows × 8 columns

# Run the cleaning step on the training subset and display the returned frame
# (the result is not assigned to a variable).
clean_data(train)
long_hair forehead_width_cm forehead_height_cm nose_wide nose_long lips_thin distance_nose_to_lip_long gender
1501 1 0.439024 0.30 1 1 1 1 Male
2586 1 0.560976 0.45 0 0 0 0 Female
2653 0 0.365854 0.10 0 0 0 1 Female
1055 1 0.439024 0.40 1 1 1 1 Male
705 0 0.926829 0.25 1 1 1 1 Male
... ... ... ... ... ... ... ... ...
2087 1 0.048780 0.45 0 1 1 0 Female
1889 1 0.048780 0.15 0 0 0 0 Female
4623 1 0.536585 0.20 0 0 0 0 Female
1591 1 1.000000 0.95 1 0 1 0 Male
1346 1 0.536585 0.35 0 0 0 0 Female

3500 rows × 8 columns