36 KiB
36 KiB
import numpy as np
import pandas as pd
# Shell escape: fetch the dataset archive with the Kaggle CLI.
# NOTE(review): `-d` is given a CSV file name here; the Kaggle CLI presumably
# expects an `<owner>/<dataset-slug>` identifier — confirm the intended dataset.
!kaggle datasets download -d gender_classification_v7.csv
zsh:1: command not found: kaggle
# Load the dataset from a CSV file in the working directory.
# NOTE(review): this file name differs from the one used in the download
# command above ("gender_classification_v7.csv") — confirm which is correct.
raw_data = pd.read_csv("gender_class.csv")
# Bare expression: displays the frame when run as a notebook cell.
raw_data
long_hair | forehead_width_cm | forehead_height_cm | nose_wide | nose_long | lips_thin | distance_nose_to_lip_long | gender | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 11.8 | 6.1 | 1 | 0 | 1 | 1 | Male |
1 | 0 | 14.0 | 5.4 | 0 | 0 | 1 | 0 | Female |
2 | 0 | 11.8 | 6.3 | 1 | 1 | 1 | 1 | Male |
3 | 0 | 14.4 | 6.1 | 0 | 1 | 1 | 1 | Male |
4 | 1 | 13.5 | 5.9 | 0 | 0 | 0 | 0 | Female |
... | ... | ... | ... | ... | ... | ... | ... | ... |
4996 | 1 | 13.6 | 5.1 | 0 | 0 | 0 | 0 | Female |
4997 | 1 | 11.9 | 5.4 | 0 | 0 | 0 | 0 | Female |
4998 | 1 | 12.9 | 5.7 | 0 | 0 | 0 | 0 | Female |
4999 | 1 | 13.2 | 6.2 | 0 | 0 | 0 | 0 | Female |
5000 | 1 | 15.4 | 5.4 | 1 | 1 | 1 | 1 | Male |
5001 rows × 8 columns
Wyczyści zbiór z artefaktów (np. puste linie, przykłady z niepoprawnymi wartościami)
def clean_data(data):
    """Return a copy of *data* without the rows that contain missing values.

    The input frame is left untouched: the previous in-place ``dropna``
    silently modified the caller's object (and triggers pandas' chained
    assignment warnings when *data* is a slice of another frame).

    Args:
        data: A pandas DataFrame.

    Returns:
        A new DataFrame holding only the rows of *data* in which no column
        is NaN/None. The original row labels are preserved.
    """
    return data.dropna()
# Rebind raw_data to the frame returned by clean_data.
raw_data = clean_data(raw_data)
# Bare expression: displays the frame when run as a notebook cell.
raw_data
long_hair | forehead_width_cm | forehead_height_cm | nose_wide | nose_long | lips_thin | distance_nose_to_lip_long | gender | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 11.8 | 6.1 | 1 | 0 | 1 | 1 | Male |
1 | 0 | 14.0 | 5.4 | 0 | 0 | 1 | 0 | Female |
2 | 0 | 11.8 | 6.3 | 1 | 1 | 1 | 1 | Male |
3 | 0 | 14.4 | 6.1 | 0 | 1 | 1 | 1 | Male |
4 | 1 | 13.5 | 5.9 | 0 | 0 | 0 | 0 | Female |
... | ... | ... | ... | ... | ... | ... | ... | ... |
4996 | 1 | 13.6 | 5.1 | 0 | 0 | 0 | 0 | Female |
4997 | 1 | 11.9 | 5.4 | 0 | 0 | 0 | 0 | Female |
4998 | 1 | 12.9 | 5.7 | 0 | 0 | 0 | 0 | Female |
4999 | 1 | 13.2 | 6.2 | 0 | 0 | 0 | 0 | Female |
5000 | 1 | 15.4 | 5.4 | 1 | 1 | 1 | 1 | Male |
5001 rows × 8 columns
Dokona normalizacji danych w zbiorze (np. normalizacja wartości float do zakresu 0.0 - 1.0)
def normalize_data(data):
    """Return a copy of *data* with every float column min-max scaled to 0.0 - 1.0.

    Non-float columns (integers, strings, ...) are copied through unchanged.

    Args:
        data: A pandas DataFrame.

    Returns:
        A new DataFrame with the same columns and index. The input frame is
        not modified, so the caller keeps access to the unscaled values.
    """
    # Work on a copy: assigning into the caller's frame would overwrite the
    # raw values for everyone holding a reference to it.
    normalized = data.copy()
    for col in normalized.columns:
        # is_float_dtype covers float32/float16 too, not just float64.
        if not pd.api.types.is_float_dtype(normalized[col]):
            continue
        col_min = normalized[col].min()
        col_range = normalized[col].max() - col_min
        shifted = normalized[col] - col_min
        # A constant column has zero range; dividing would turn it into NaN,
        # so map it to 0.0 instead (existing NaN entries stay NaN).
        normalized[col] = shifted if col_range == 0 else shifted / col_range
    return normalized
# Min-max scale the float columns of the cleaned frame.
normalized_data = normalize_data(raw_data)
# Bare expression: displays the frame when run as a notebook cell.
normalized_data
long_hair | forehead_width_cm | forehead_height_cm | nose_wide | nose_long | lips_thin | distance_nose_to_lip_long | gender | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 0.097561 | 0.50 | 1 | 0 | 1 | 1 | Male |
1 | 0 | 0.634146 | 0.15 | 0 | 0 | 1 | 0 | Female |
2 | 0 | 0.097561 | 0.60 | 1 | 1 | 1 | 1 | Male |
3 | 0 | 0.731707 | 0.50 | 0 | 1 | 1 | 1 | Male |
4 | 1 | 0.512195 | 0.40 | 0 | 0 | 0 | 0 | Female |
... | ... | ... | ... | ... | ... | ... | ... | ... |
4996 | 1 | 0.536585 | 0.00 | 0 | 0 | 0 | 0 | Female |
4997 | 1 | 0.121951 | 0.15 | 0 | 0 | 0 | 0 | Female |
4998 | 1 | 0.365854 | 0.30 | 0 | 0 | 0 | 0 | Female |
4999 | 1 | 0.439024 | 0.55 | 0 | 0 | 0 | 0 | Female |
5000 | 1 | 0.975610 | 0.15 | 1 | 1 | 1 | 1 | Male |
5001 rows × 8 columns
- Jeśli brak w zbiorze gotowego podziału na podzbiory train/dev/test, to dokona takiego podziału
# Shuffle once with a fixed seed so the split is reproducible, then cut the
# shuffled frame at 70% and 85% to obtain the train / dev / test partitions.
# Plain positional slicing is used instead of np.split: np.split on a
# DataFrame goes through the deprecated DataFrame.swapaxes.
shuffled = normalized_data.sample(frac=1, random_state=42)
train_end = int(.7 * len(shuffled))
dev_end = int(.85 * len(shuffled))
train = shuffled.iloc[:train_end]
dev = shuffled.iloc[train_end:dev_end]
test = shuffled.iloc[dev_end:]
# Save each partition to its own CSV file (row labels are not written).
train.to_csv('train.csv', index=False)
dev.to_csv('dev.csv', index=False)
test.to_csv('test.csv', index=False)
# Bare expression: displays the dev partition when run as a notebook cell.
dev
long_hair | forehead_width_cm | forehead_height_cm | nose_wide | nose_long | lips_thin | distance_nose_to_lip_long | gender | |
---|---|---|---|---|---|---|---|---|
4432 | 1 | 0.512195 | 0.10 | 1 | 1 | 1 | 1 | Male |
2162 | 1 | 0.243902 | 0.70 | 1 | 1 | 1 | 1 | Male |
2396 | 1 | 0.512195 | 0.15 | 1 | 0 | 0 | 0 | Female |
4769 | 1 | 0.853659 | 0.10 | 1 | 1 | 0 | 1 | Male |
2271 | 1 | 0.292683 | 0.70 | 0 | 1 | 0 | 0 | Female |
... | ... | ... | ... | ... | ... | ... | ... | ... |
846 | 1 | 0.097561 | 0.45 | 1 | 1 | 1 | 1 | Male |
2551 | 0 | 0.243902 | 0.35 | 1 | 1 | 1 | 1 | Male |
2928 | 1 | 0.634146 | 0.20 | 0 | 0 | 0 | 0 | Female |
117 | 1 | 0.707317 | 0.50 | 0 | 0 | 0 | 0 | Female |
645 | 1 | 0.195122 | 0.05 | 1 | 0 | 0 | 0 | Female |
750 rows × 8 columns
# Print the summary statistics of the full frame and of each partition in turn.
for frame in (raw_data, train, dev, test):
    summary = frame.describe()
    print(summary)
long_hair forehead_width_cm forehead_height_cm nose_wide \ count 5001.000000 5001.000000 5001.000000 5001.000000 mean 0.869626 0.434508 0.423155 0.493901 std 0.336748 0.270031 0.270634 0.500013 min 0.000000 0.000000 0.000000 0.000000 25% 1.000000 0.195122 0.200000 0.000000 50% 1.000000 0.414634 0.400000 0.000000 75% 1.000000 0.634146 0.650000 1.000000 max 1.000000 1.000000 1.000000 1.000000 nose_long lips_thin distance_nose_to_lip_long count 5001.000000 5001.000000 5001.000000 mean 0.507898 0.493101 0.498900 std 0.499988 0.500002 0.500049 min 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 50% 1.000000 0.000000 0.000000 75% 1.000000 1.000000 1.000000 max 1.000000 1.000000 1.000000 long_hair forehead_width_cm forehead_height_cm nose_wide \ count 3500.000000 3500.000000 3500.000000 3500.000000 mean 0.870000 0.436021 0.425900 0.505714 std 0.336351 0.270492 0.271348 0.500039 min 0.000000 0.000000 0.000000 0.000000 25% 1.000000 0.195122 0.200000 0.000000 50% 1.000000 0.414634 0.400000 1.000000 75% 1.000000 0.634146 0.650000 1.000000 max 1.000000 1.000000 1.000000 1.000000 nose_long lips_thin distance_nose_to_lip_long count 3500.000000 3500.000000 3500.000000 mean 0.522000 0.499429 0.507714 std 0.499587 0.500071 0.500012 min 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 50% 1.000000 0.000000 1.000000 75% 1.000000 1.000000 1.000000 max 1.000000 1.000000 1.000000 long_hair forehead_width_cm forehead_height_cm nose_wide \ count 750.000000 750.000000 750.000000 750.000000 mean 0.870667 0.419285 0.416933 0.472000 std 0.335792 0.264474 0.269500 0.499549 min 0.000000 0.000000 0.000000 0.000000 25% 1.000000 0.195122 0.200000 0.000000 50% 1.000000 0.414634 0.400000 0.000000 75% 1.000000 0.634146 0.637500 1.000000 max 1.000000 1.000000 1.000000 1.000000 nose_long lips_thin distance_nose_to_lip_long count 750.000000 750.000000 750.000000 mean 0.466667 0.481333 0.465333 std 0.499221 0.499985 0.499130 min 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 
50% 0.000000 0.000000 0.000000 75% 1.000000 1.000000 1.000000 max 1.000000 1.000000 1.000000 long_hair forehead_width_cm forehead_height_cm nose_wide \ count 751.000000 751.000000 751.000000 751.000000 mean 0.866844 0.442662 0.416578 0.460719 std 0.339969 0.273141 0.268567 0.498787 min 0.000000 0.000000 0.000000 0.000000 25% 1.000000 0.195122 0.200000 0.000000 50% 1.000000 0.439024 0.400000 0.000000 75% 1.000000 0.658537 0.600000 1.000000 max 1.000000 1.000000 1.000000 1.000000 nose_long lips_thin distance_nose_to_lip_long count 751.000000 751.000000 751.000000 mean 0.483356 0.475366 0.491345 std 0.500056 0.499726 0.500258 min 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 50% 0.000000 0.000000 0.000000 75% 1.000000 1.000000 1.000000 max 1.000000 1.000000 1.000000