34 KiB
34 KiB
import numpy as np
import pandas as pd
!kaggle datasets download -d gender_classification_v7.csv
zsh:1: command not found: kaggle
raw_data = pd.read_csv("gender_class.csv")
raw_data
long_hair | forehead_width_cm | forehead_height_cm | nose_wide | nose_long | lips_thin | distance_nose_to_lip_long | gender | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 11.8 | 6.1 | 1 | 0 | 1 | 1 | Male |
1 | 0 | 14.0 | 5.4 | 0 | 0 | 1 | 0 | Female |
2 | 0 | 11.8 | 6.3 | 1 | 1 | 1 | 1 | Male |
3 | 0 | 14.4 | 6.1 | 0 | 1 | 1 | 1 | Male |
4 | 1 | 13.5 | 5.9 | 0 | 0 | 0 | 0 | Female |
... | ... | ... | ... | ... | ... | ... | ... | ... |
4996 | 1 | 13.6 | 5.1 | 0 | 0 | 0 | 0 | Female |
4997 | 1 | 11.9 | 5.4 | 0 | 0 | 0 | 0 | Female |
4998 | 1 | 12.9 | 5.7 | 0 | 0 | 0 | 0 | Female |
4999 | 1 | 13.2 | 6.2 | 0 | 0 | 0 | 0 | Female |
5000 | 1 | 15.4 | 5.4 | 1 | 1 | 1 | 1 | Male |
5001 rows × 8 columns
Wyczyści zbiór z artefaktów (np. puste linie, przykłady z niepoprawnymi wartościami)
def clean_data(data):
    """Return a cleaned copy of *data* without artifacts.

    Two kinds of rows are removed:

    * rows containing at least one missing value (NaN / None),
    * rows where any numeric column holds a negative value.

    The input frame is left untouched; callers must use the return value.

    Args:
        data: DataFrame to clean.

    Returns:
        A new DataFrame containing only the valid rows, original index kept.
    """
    # dropna() without inplace returns a new frame, so the caller's object
    # is not half-modified as a side effect.
    cleaned = data.dropna()

    # Remove rows with invalid values. Only negativity is treated as invalid:
    # restricting floats to [0.0, 1.0] would discard every raw (not yet
    # normalized) measurement and leave an empty frame.
    # select_dtypes("number") covers every int/float width, not only the
    # platform-default int/float64.
    numeric_cols = cleaned.select_dtypes(include="number").columns
    non_negative = (cleaned[numeric_cols] >= 0).all(axis=1)
    return cleaned[non_negative]
raw_data = clean_data(raw_data)
raw_data
long_hair | forehead_width_cm | forehead_height_cm | nose_wide | nose_long | lips_thin | distance_nose_to_lip_long | gender |
---|
Dokona normalizacji danych w zbiorze (np. normalizacja wartości float do zakresu 0.0 - 1.0)
def normalize_data(data):
    """Min-max scale every float column of *data* to the range 0.0 - 1.0.

    Columns are rescaled in place on the passed frame, and the same frame is
    returned. Non-float columns (integer flags, strings) are left unchanged.

    A float column whose values are all equal has no range to scale by; it is
    mapped to 0.0 instead of being turned into NaN by a 0/0 division.

    Args:
        data: DataFrame whose float columns should be normalized.

    Returns:
        The same DataFrame object, with float columns rescaled.
    """
    for col in data.columns:
        column = data[col]
        # is_float_dtype matches every float width (float32, float64, ...),
        # not only the default float64 that `dtype == float` catches.
        if not pd.api.types.is_float_dtype(column):
            continue

        low = column.min()
        span = column.max() - low
        if span == 0:
            # Constant column: subtracting the minimum yields 0.0 everywhere
            # and avoids dividing by zero.
            data[col] = column - low
        else:
            data[col] = (column - low) / span
    return data
normalized_data = normalize_data(raw_data)
normalized_data
long_hair | forehead_width_cm | forehead_height_cm | nose_wide | nose_long | lips_thin | distance_nose_to_lip_long | gender |
---|
- Jeśli brak w zbiorze gotowego podziału na podzbiory train/dev/test, to dokona takiego podziału
# Shuffle once with a fixed seed so the split is reproducible.
shuffled = normalized_data.sample(frac=1, random_state=42)
train_end = int(.7 * len(shuffled))
dev_end = int(.85 * len(shuffled))

# 70% / 15% / 15% positional split. Plain iloc slicing is used instead of
# np.split, which on a DataFrame goes through the deprecated
# DataFrame.swapaxes and emits a FutureWarning on recent pandas.
# Each subset is copied so later column assignments on it do not raise
# SettingWithCopyWarning.
train = shuffled.iloc[:train_end].copy()
dev = shuffled.iloc[train_end:dev_end].copy()
test = shuffled.iloc[dev_end:].copy()

# Save each subset to its own CSV file.
train.to_csv('train.csv', index=False)
dev.to_csv('dev.csv', index=False)
test.to_csv('test.csv', index=False)
dev
long_hair | forehead_width_cm | forehead_height_cm | nose_wide | nose_long | lips_thin | distance_nose_to_lip_long | gender |
---|
# Print summary statistics for the full set and for each of the subsets.
for d in (raw_data, train, dev, test):
    summary = d.describe()
    print(summary)
long_hair forehead_width_cm forehead_height_cm nose_wide \ count 5001.000000 5001.000000 5001.000000 5001.000000 mean 0.869626 13.181484 5.946311 0.493901 std 0.336748 1.107128 0.541268 0.500013 min 0.000000 11.400000 5.100000 0.000000 25% 1.000000 12.200000 5.500000 0.000000 50% 1.000000 13.100000 5.900000 0.000000 75% 1.000000 14.000000 6.400000 1.000000 max 1.000000 15.500000 7.100000 1.000000 nose_long lips_thin distance_nose_to_lip_long count 5001.000000 5001.000000 5001.000000 mean 0.507898 0.493101 0.498900 std 0.499988 0.500002 0.500049 min 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 50% 1.000000 0.000000 0.000000 75% 1.000000 1.000000 1.000000 max 1.000000 1.000000 1.000000 long_hair forehead_width_cm forehead_height_cm nose_wide \ count 3500.000000 3500.000000 3500.000000 3500.000000 mean 0.870000 13.187686 5.951800 0.505714 std 0.336351 1.109019 0.542695 0.500039 min 0.000000 11.400000 5.100000 0.000000 25% 1.000000 12.200000 5.500000 0.000000 50% 1.000000 13.100000 5.900000 1.000000 75% 1.000000 14.000000 6.400000 1.000000 max 1.000000 15.500000 7.100000 1.000000 nose_long lips_thin distance_nose_to_lip_long count 3500.000000 3500.000000 3500.000000 mean 0.522000 0.499429 0.507714 std 0.499587 0.500071 0.500012 min 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 50% 1.000000 0.000000 1.000000 75% 1.000000 1.000000 1.000000 max 1.000000 1.000000 1.000000 long_hair forehead_width_cm forehead_height_cm nose_wide \ count 750.000000 750.000000 750.000000 750.000000 mean 0.870667 13.119067 5.933867 0.472000 std 0.335792 1.084345 0.538999 0.499549 min 0.000000 11.400000 5.100000 0.000000 25% 1.000000 12.200000 5.500000 0.000000 50% 1.000000 13.100000 5.900000 0.000000 75% 1.000000 14.000000 6.375000 1.000000 max 1.000000 15.500000 7.100000 1.000000 nose_long lips_thin distance_nose_to_lip_long count 750.000000 750.000000 750.000000 mean 0.466667 0.481333 0.465333 std 0.499221 0.499985 0.499130 min 0.000000 0.000000 0.000000 25% 0.000000 
0.000000 0.000000 50% 0.000000 0.000000 0.000000 75% 1.000000 1.000000 1.000000 max 1.000000 1.000000 1.000000 long_hair forehead_width_cm forehead_height_cm nose_wide \ count 751.000000 751.000000 751.000000 751.000000 mean 0.866844 13.214913 5.933156 0.460719 std 0.339969 1.119877 0.537134 0.498787 min 0.000000 11.400000 5.100000 0.000000 25% 1.000000 12.200000 5.500000 0.000000 50% 1.000000 13.200000 5.900000 0.000000 75% 1.000000 14.100000 6.300000 1.000000 max 1.000000 15.500000 7.100000 1.000000 nose_long lips_thin distance_nose_to_lip_long count 751.000000 751.000000 751.000000 mean 0.483356 0.475366 0.491345 std 0.500056 0.499726 0.500258 min 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 50% 0.000000 0.000000 0.000000 75% 1.000000 1.000000 1.000000 max 1.000000 1.000000 1.000000
normalize_data(train)
long_hair | forehead_width_cm | forehead_height_cm | nose_wide | nose_long | lips_thin | distance_nose_to_lip_long | gender | |
---|---|---|---|---|---|---|---|---|
1501 | 1 | 0.439024 | 0.30 | 1 | 1 | 1 | 1 | Male |
2586 | 1 | 0.560976 | 0.45 | 0 | 0 | 0 | 0 | Female |
2653 | 0 | 0.365854 | 0.10 | 0 | 0 | 0 | 1 | Female |
1055 | 1 | 0.439024 | 0.40 | 1 | 1 | 1 | 1 | Male |
705 | 0 | 0.926829 | 0.25 | 1 | 1 | 1 | 1 | Male |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2087 | 1 | 0.048780 | 0.45 | 0 | 1 | 1 | 0 | Female |
1889 | 1 | 0.048780 | 0.15 | 0 | 0 | 0 | 0 | Female |
4623 | 1 | 0.536585 | 0.20 | 0 | 0 | 0 | 0 | Female |
1591 | 1 | 1.000000 | 0.95 | 1 | 0 | 1 | 0 | Male |
1346 | 1 | 0.536585 | 0.35 | 0 | 0 | 0 | 0 | Female |
3500 rows × 8 columns
clean_data(train)
long_hair | forehead_width_cm | forehead_height_cm | nose_wide | nose_long | lips_thin | distance_nose_to_lip_long | gender | |
---|---|---|---|---|---|---|---|---|
1501 | 1 | 0.439024 | 0.30 | 1 | 1 | 1 | 1 | Male |
2586 | 1 | 0.560976 | 0.45 | 0 | 0 | 0 | 0 | Female |
2653 | 0 | 0.365854 | 0.10 | 0 | 0 | 0 | 1 | Female |
1055 | 1 | 0.439024 | 0.40 | 1 | 1 | 1 | 1 | Male |
705 | 0 | 0.926829 | 0.25 | 1 | 1 | 1 | 1 | Male |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2087 | 1 | 0.048780 | 0.45 | 0 | 1 | 1 | 0 | Female |
1889 | 1 | 0.048780 | 0.15 | 0 | 0 | 0 | 0 | Female |
4623 | 1 | 0.536585 | 0.20 | 0 | 0 | 0 | 0 | Female |
1591 | 1 | 1.000000 | 0.95 | 1 | 0 | 1 | 0 | Male |
1346 | 1 | 0.536585 | 0.35 | 0 | 0 | 0 | 0 | Female |
3500 rows × 8 columns