7.2 KiB
7.2 KiB
import pandas as pd
import numpy as np
# Load the customer records from the CSV file in the working directory
# and print the first ten rows as a quick sanity check.
data = pd.read_csv("Customers.csv")
print(data.head(10))
CustomerID Gender Age Annual Income ($) Spending Score (1-100) \ 0 1 Male 19 15000 39 1 2 Male 21 35000 81 2 3 Female 20 86000 6 3 4 Female 23 59000 77 4 5 Female 31 38000 40 5 6 Female 22 58000 76 6 7 Female 35 31000 6 7 8 Female 23 84000 94 8 9 Male 64 97000 3 9 10 Female 30 98000 72 Profession Work Experience Family Size 0 Healthcare 1 4 1 Engineer 3 3 2 Engineer 1 1 3 Lawyer 0 2 4 Entertainment 2 6 5 Artist 0 2 6 Healthcare 1 3 7 Healthcare 1 3 8 Engineer 0 3 9 Artist 1 4
# Encode the categorical columns as integers.
# Work on an explicit copy so the raw `data` frame is never modified.
dataF = data.copy()

# Integer code for every known profession.
# NOTE: the 'NaN' key is a *string* and never matches real missing values
# (pandas reads empty cells as float NaN); missing professions stay NaN here
# and are filled with 0 by the later fillna step.
mapping = {'NaN' : 0, 'Healthcare' : 1, 'Engineer' : 2, 'Lawyer' : 3, 'Entertainment' : 4, 'Artist' : 5, 'Executive' : 6,
'Doctor' : 7, 'Homemaker' : 8, 'Marketing' : 9}
mapping2 = {'Male' : 0, 'Female' : 1}

# Series.map is used instead of DataFrame.replace: replacing strings with
# numbers in an object column relies on implicit downcasting, which pandas 2.2
# deprecates; without it the columns would stay `object` dtype and break the
# numeric processing that follows.
for _column, _codes in (('Profession', mapping), ('Gender', mapping2)):
    _encoded = dataF[_column].map(_codes)
    # map() turns categories that are absent from the table into NaN; fail
    # loudly rather than letting them be silently treated as missing values.
    _unknown = dataF.loc[_encoded.isna() & dataF[_column].notna(), _column].unique()
    if len(_unknown) > 0:
        raise ValueError(f"Unmapped values in column {_column!r}: {list(_unknown)}")
    dataF[_column] = _encoded

# Drop the row identifier column.
dataF = dataF.drop(columns=['CustomerID'])
# Imputation: rows with a missing profession get code 0.
dataF['Profession'] = dataF['Profession'].fillna(0)

# Min-max normalization: rescale every column to the [0, 1] range.
# NOTE(review): the minima/maxima are computed over the whole dataset, i.e.
# before the train/dev/test split further down, so dev/test rows influence the
# scaling of the training data - consider fitting on the training rows only.
col_min = dataF.min()
col_range = dataF.max() - col_min
# A constant column has zero range; dividing by it would turn the whole column
# into NaN (0/0). Dividing by 1 instead maps such a column to all zeros.
col_range = col_range.where(col_range != 0, 1)
normalized_dataF = (dataF - col_min) / col_range
print(normalized_dataF[:10])
Gender Age Annual Income ($) Spending Score (1-100) Profession \ 0 0.0 0.191919 0.078958 0.39 0.111111 1 0.0 0.212121 0.184236 0.81 0.222222 2 1.0 0.202020 0.452694 0.06 0.222222 3 1.0 0.232323 0.310569 0.77 0.333333 4 1.0 0.313131 0.200027 0.40 0.444444 5 1.0 0.222222 0.305305 0.76 0.555556 6 1.0 0.353535 0.163180 0.06 0.111111 7 1.0 0.232323 0.442166 0.94 0.111111 8 0.0 0.646465 0.510596 0.03 0.222222 9 1.0 0.303030 0.515860 0.72 0.555556 Work Experience Family Size 0 0.058824 0.375 1 0.176471 0.250 2 0.058824 0.000 3 0.000000 0.125 4 0.117647 0.625 5 0.000000 0.125 6 0.058824 0.250 7 0.058824 0.250 8 0.000000 0.250 9 0.058824 0.375
# Split the normalized data into training / validation / test parts in an
# 80% / 10% / 10% ratio, keeping the original row order.
# The boundaries are derived from the actual row count (integer arithmetic,
# so no float rounding) instead of being hard-coded; for 2000 rows this gives
# the same 1600 / 200 / 200 split as fixed indices 1600 and 1800.
# NOTE(review): rows are not shuffled - if the CSV is ordered in any
# meaningful way the three parts may not be representative; confirm.
n_rows = len(normalized_dataF)
train_end = n_rows * 8 // 10
dev_end = n_rows * 9 // 10

train_data = normalized_dataF[:train_end]
dev_data = normalized_dataF[train_end:dev_end]
test_data = normalized_dataF[dev_end:]

# Report the size of the full dataset and of each part, followed by summary
# statistics of the normalized data.
print(f"Wielkość zbioru Customers: {len(data)} elementów")
print(f"Wielkość zbioru trenującego: {len(train_data)} elementów")
print(f"Wielkość zbioru walidującego: {len(dev_data)} elementów")
print(f"Wielkość zbioru testującego: {len(test_data)} elementów")
print(f" \nDane i wartości na temat zbioru: \n \n {normalized_dataF.describe()}")
Wielkość zbioru Customers: 2000 elementów Wielkość zbioru trenującego: 1600 elementów Wielkość zbioru walidującego: 200 elementów Wielkość zbioru testującego: 200 elementów Dane i wartości na temat zbioru: Gender Age Annual Income ($) Spending Score (1-100) \ count 2000.000000 2000.000000 2000.000000 2000.000000 mean 0.593000 0.494545 0.582879 0.509625 std 0.491398 0.287169 0.240767 0.279347 min 0.000000 0.000000 0.000000 0.000000 25% 0.000000 0.252525 0.392538 0.280000 50% 1.000000 0.484848 0.579263 0.500000 75% 1.000000 0.737374 0.784806 0.750000 max 1.000000 1.000000 1.000000 1.000000 Profession Work Experience Family Size count 2000.000000 2000.000000 2000.000000 mean 0.467167 0.241324 0.346062 std 0.250289 0.230718 0.246344 min 0.000000 0.000000 0.000000 25% 0.222222 0.058824 0.125000 50% 0.555556 0.176471 0.375000 75% 0.555556 0.411765 0.500000 max 1.000000 1.000000 1.000000