#!/usr/bin/python

import pandas as pd
import numpy as np
def prepareData(csv_path="Customers.csv", train_end=1600, dev_end=1800):
    """Load and preprocess the customer dataset, returning a 3-way split.

    Reads the CSV at ``csv_path``, integer-encodes the categorical
    'Profession' and 'Gender' columns, drops 'CustomerID', min-max
    normalizes every column to [0, 1], and slices the rows into
    consecutive train/dev/test partitions.

    Parameters
    ----------
    csv_path : str
        Path to the customer CSV file. Default preserves the original
        hard-coded "Customers.csv".
    train_end : int
        End (exclusive) of the training slice. Default 1600.
    dev_end : int
        End (exclusive) of the dev slice; the test slice is everything
        from ``dev_end`` onward. Default 1800.

    Returns
    -------
    tuple of pandas.DataFrame
        (train_data, dev_data, test_data)
    """
    data = pd.read_csv(csv_path)

    # Integer codes for the categorical columns. Genuinely missing
    # professions arrive from read_csv as float NaN — not the string
    # 'NaN' — so a 'NaN' key here would be dead; the fillna(0) below is
    # what maps missing professions to code 0.
    profession_map = {'Healthcare' : 1, 'Engineer' : 2, 'Lawyer' : 3,
                      'Entertainment' : 4, 'Artist' : 5, 'Executive' : 6,
                      'Doctor' : 7, 'Homemaker' : 8, 'Marketing' : 9}
    gender_map = {'Male' : 0, 'Female' : 1}

    dataF = data.replace({'Profession': profession_map})
    dataF = dataF.replace({'Gender': gender_map})

    # CustomerID is an arbitrary row key, not a feature.
    dataF = dataF.drop(columns=['CustomerID'])

    # Missing professions get code 0 (the "unknown" bucket).
    dataF['Profession'] = dataF['Profession'].fillna(0)

    # Min-max normalize each column to [0, 1].
    # NOTE(review): min/max are computed over the FULL dataset before the
    # split, which leaks dev/test statistics into training — fine for an
    # exercise, worth fixing for a real evaluation.
    normalized_dataF = (dataF - dataF.min()) / (dataF.max() - dataF.min())

    # Consecutive (unshuffled) split; the default indices assume the
    # original ~2000-row Customers.csv.
    train_data = normalized_dataF[0:train_end]
    dev_data = normalized_dataF[train_end:dev_end]
    test_data = normalized_dataF[dev_end:]

    return train_data, dev_data, test_data
|