ium_487184/zad_02.ipynb
bartosz.maslanka.consultant 3f7a91f1db add jenkinsfile
2023-04-21 15:03:48 +02:00

36 KiB
Raw Blame History

import numpy as np
import pandas as pd
!kaggle datasets download -d gender_classification_v7.csv
zsh:1: command not found: kaggle
# Load the dataset from a local CSV file into a DataFrame.
# NOTE(review): the download cell names gender_classification_v7.csv while
# this line reads gender_class.csv — confirm the file name is the intended one.
raw_data = pd.read_csv("gender_class.csv")
# Bare expression: shows the frame as the notebook cell output.
raw_data
long_hair forehead_width_cm forehead_height_cm nose_wide nose_long lips_thin distance_nose_to_lip_long gender
0 1 11.8 6.1 1 0 1 1 Male
1 0 14.0 5.4 0 0 1 0 Female
2 0 11.8 6.3 1 1 1 1 Male
3 0 14.4 6.1 0 1 1 1 Male
4 1 13.5 5.9 0 0 0 0 Female
... ... ... ... ... ... ... ... ...
4996 1 13.6 5.1 0 0 0 0 Female
4997 1 11.9 5.4 0 0 0 0 Female
4998 1 12.9 5.7 0 0 0 0 Female
4999 1 13.2 6.2 0 0 0 0 Female
5000 1 15.4 5.4 1 1 1 1 Male

5001 rows × 8 columns

Wyczyści zbiór z artefaktów (np. puste linie, przykłady z niepoprawnymi wartościami)

def clean_data(data):
    """Return a copy of ``data`` without rows that contain missing values.

    Args:
        data: pandas DataFrame to clean.

    Returns:
        A new DataFrame holding only the rows of ``data`` that have no
        missing (NaN/None) entries. The surviving rows keep their original
        index labels.
    """
    # dropna() without inplace=True builds a new frame, so the DataFrame
    # passed in by the caller is left untouched.
    return data.dropna()
# Rebind raw_data to the frame returned by clean_data.
raw_data = clean_data(raw_data)
# Bare expression: shows the frame as the notebook cell output.
raw_data
long_hair forehead_width_cm forehead_height_cm nose_wide nose_long lips_thin distance_nose_to_lip_long gender
0 1 11.8 6.1 1 0 1 1 Male
1 0 14.0 5.4 0 0 1 0 Female
2 0 11.8 6.3 1 1 1 1 Male
3 0 14.4 6.1 0 1 1 1 Male
4 1 13.5 5.9 0 0 0 0 Female
... ... ... ... ... ... ... ... ...
4996 1 13.6 5.1 0 0 0 0 Female
4997 1 11.9 5.4 0 0 0 0 Female
4998 1 12.9 5.7 0 0 0 0 Female
4999 1 13.2 6.2 0 0 0 0 Female
5000 1 15.4 5.4 1 1 1 1 Male

5001 rows × 8 columns

Dokona normalizacji danych w zbiorze (np. normalizacja wartości float do zakresu 0.0 - 1.0)

def normalize_data(data):
    """Min-max scale every float column of ``data`` to the range 0.0 - 1.0.

    Args:
        data: pandas DataFrame. Only columns with a floating-point dtype are
            rescaled; every other column is passed through unchanged.

    Returns:
        A new DataFrame with the same columns and index as ``data``. The
        input frame is not modified. A float column whose values are all
        equal is mapped to 0.0 instead of NaN.
    """
    # Work on a copy so the caller's frame keeps its original values.
    result = data.copy()
    for col in result.columns:
        # is_float_dtype matches float32/float16 as well as float64.
        if not pd.api.types.is_float_dtype(result[col]):
            continue
        low = result[col].min()
        span = result[col].max() - low
        if span == 0:
            # Constant column: dividing would give 0/0 = NaN, so only shift.
            result[col] = result[col] - low
        else:
            result[col] = (result[col] - low) / span

    return result
# Keep the frame returned by normalize_data under its own name.
normalized_data = normalize_data(raw_data)
# Bare expression: shows the frame as the notebook cell output.
normalized_data
long_hair forehead_width_cm forehead_height_cm nose_wide nose_long lips_thin distance_nose_to_lip_long gender
0 1 0.097561 0.50 1 0 1 1 Male
1 0 0.634146 0.15 0 0 1 0 Female
2 0 0.097561 0.60 1 1 1 1 Male
3 0 0.731707 0.50 0 1 1 1 Male
4 1 0.512195 0.40 0 0 0 0 Female
... ... ... ... ... ... ... ... ...
4996 1 0.536585 0.00 0 0 0 0 Female
4997 1 0.121951 0.15 0 0 0 0 Female
4998 1 0.365854 0.30 0 0 0 0 Female
4999 1 0.439024 0.55 0 0 0 0 Female
5000 1 0.975610 0.15 1 1 1 1 Male

5001 rows × 8 columns

  1. Jeśli brak w zbiorze gotowego podziału na podzbiory train/dev/test, to dokona takiego podziału
# Shuffle all rows once with a fixed seed so the split is reproducible.
shuffled = normalized_data.sample(frac=1, random_state=42)
train_end = int(.7 * len(shuffled))
dev_end = int(.85 * len(shuffled))

# Slice positionally into 70% / 15% / 15%. np.split on a DataFrame goes
# through DataFrame.swapaxes, which recent pandas releases deprecate, so
# plain iloc slicing is used to get the same three parts.
train = shuffled.iloc[:train_end]
dev = shuffled.iloc[train_end:dev_end]
test = shuffled.iloc[dev_end:]

# Save each subset to its own CSV file (without the index column).
train.to_csv('train.csv', index=False)
dev.to_csv('dev.csv', index=False)
test.to_csv('test.csv', index=False)
dev
long_hair forehead_width_cm forehead_height_cm nose_wide nose_long lips_thin distance_nose_to_lip_long gender
4432 1 0.512195 0.10 1 1 1 1 Male
2162 1 0.243902 0.70 1 1 1 1 Male
2396 1 0.512195 0.15 1 0 0 0 Female
4769 1 0.853659 0.10 1 1 0 1 Male
2271 1 0.292683 0.70 0 1 0 0 Female
... ... ... ... ... ... ... ... ...
846 1 0.097561 0.45 1 1 1 1 Male
2551 0 0.243902 0.35 1 1 1 1 Male
2928 1 0.634146 0.20 0 0 0 0 Female
117 1 0.707317 0.50 0 0 0 0 Female
645 1 0.195122 0.05 1 0 0 0 Female

750 rows × 8 columns

# Print summary statistics for the full frame and then for each subset.
for frame in (raw_data, train, dev, test):
    summary = frame.describe()
    print(summary)
         long_hair  forehead_width_cm  forehead_height_cm    nose_wide  \
count  5001.000000        5001.000000         5001.000000  5001.000000   
mean      0.869626           0.434508            0.423155     0.493901   
std       0.336748           0.270031            0.270634     0.500013   
min       0.000000           0.000000            0.000000     0.000000   
25%       1.000000           0.195122            0.200000     0.000000   
50%       1.000000           0.414634            0.400000     0.000000   
75%       1.000000           0.634146            0.650000     1.000000   
max       1.000000           1.000000            1.000000     1.000000   

         nose_long    lips_thin  distance_nose_to_lip_long  
count  5001.000000  5001.000000                5001.000000  
mean      0.507898     0.493101                   0.498900  
std       0.499988     0.500002                   0.500049  
min       0.000000     0.000000                   0.000000  
25%       0.000000     0.000000                   0.000000  
50%       1.000000     0.000000                   0.000000  
75%       1.000000     1.000000                   1.000000  
max       1.000000     1.000000                   1.000000  
         long_hair  forehead_width_cm  forehead_height_cm    nose_wide  \
count  3500.000000        3500.000000         3500.000000  3500.000000   
mean      0.870000           0.436021            0.425900     0.505714   
std       0.336351           0.270492            0.271348     0.500039   
min       0.000000           0.000000            0.000000     0.000000   
25%       1.000000           0.195122            0.200000     0.000000   
50%       1.000000           0.414634            0.400000     1.000000   
75%       1.000000           0.634146            0.650000     1.000000   
max       1.000000           1.000000            1.000000     1.000000   

         nose_long    lips_thin  distance_nose_to_lip_long  
count  3500.000000  3500.000000                3500.000000  
mean      0.522000     0.499429                   0.507714  
std       0.499587     0.500071                   0.500012  
min       0.000000     0.000000                   0.000000  
25%       0.000000     0.000000                   0.000000  
50%       1.000000     0.000000                   1.000000  
75%       1.000000     1.000000                   1.000000  
max       1.000000     1.000000                   1.000000  
        long_hair  forehead_width_cm  forehead_height_cm   nose_wide  \
count  750.000000         750.000000          750.000000  750.000000   
mean     0.870667           0.419285            0.416933    0.472000   
std      0.335792           0.264474            0.269500    0.499549   
min      0.000000           0.000000            0.000000    0.000000   
25%      1.000000           0.195122            0.200000    0.000000   
50%      1.000000           0.414634            0.400000    0.000000   
75%      1.000000           0.634146            0.637500    1.000000   
max      1.000000           1.000000            1.000000    1.000000   

        nose_long   lips_thin  distance_nose_to_lip_long  
count  750.000000  750.000000                 750.000000  
mean     0.466667    0.481333                   0.465333  
std      0.499221    0.499985                   0.499130  
min      0.000000    0.000000                   0.000000  
25%      0.000000    0.000000                   0.000000  
50%      0.000000    0.000000                   0.000000  
75%      1.000000    1.000000                   1.000000  
max      1.000000    1.000000                   1.000000  
        long_hair  forehead_width_cm  forehead_height_cm   nose_wide  \
count  751.000000         751.000000          751.000000  751.000000   
mean     0.866844           0.442662            0.416578    0.460719   
std      0.339969           0.273141            0.268567    0.498787   
min      0.000000           0.000000            0.000000    0.000000   
25%      1.000000           0.195122            0.200000    0.000000   
50%      1.000000           0.439024            0.400000    0.000000   
75%      1.000000           0.658537            0.600000    1.000000   
max      1.000000           1.000000            1.000000    1.000000   

        nose_long   lips_thin  distance_nose_to_lip_long  
count  751.000000  751.000000                 751.000000  
mean     0.483356    0.475366                   0.491345  
std      0.500056    0.499726                   0.500258  
min      0.000000    0.000000                   0.000000  
25%      0.000000    0.000000                   0.000000  
50%      0.000000    0.000000                   0.000000  
75%      1.000000    1.000000                   1.000000  
max      1.000000    1.000000                   1.000000