187 lines
7.2 KiB
Plaintext
187 lines
7.2 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 60,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" CustomerID Gender Age Annual Income ($) Spending Score (1-100) \\\n",
|
|
"0 1 Male 19 15000 39 \n",
|
|
"1 2 Male 21 35000 81 \n",
|
|
"2 3 Female 20 86000 6 \n",
|
|
"3 4 Female 23 59000 77 \n",
|
|
"4 5 Female 31 38000 40 \n",
|
|
"5 6 Female 22 58000 76 \n",
|
|
"6 7 Female 35 31000 6 \n",
|
|
"7 8 Female 23 84000 94 \n",
|
|
"8 9 Male 64 97000 3 \n",
|
|
"9 10 Female 30 98000 72 \n",
|
|
"\n",
|
|
" Profession Work Experience Family Size \n",
|
|
"0 Healthcare 1 4 \n",
|
|
"1 Engineer 3 3 \n",
|
|
"2 Engineer 1 1 \n",
|
|
"3 Lawyer 0 2 \n",
|
|
"4 Entertainment 2 6 \n",
|
|
"5 Artist 0 2 \n",
|
|
"6 Healthcare 1 3 \n",
|
|
"7 Healthcare 1 3 \n",
|
|
"8 Engineer 0 3 \n",
|
|
"9 Artist 1 4 \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"\n",
|
"# Read the raw customer table; the path is relative, so Customers.csv must\n",
"# sit in the working directory the notebook runs from.\n",
"data = pd.read_csv(\"Customers.csv\")\n",
"\n",
"# Plain-text preview of the first ten rows.\n",
"print(data.head(10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 61,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" Gender Age Annual Income ($) Spending Score (1-100) Profession \\\n",
|
|
"0 0.0 0.191919 0.078958 0.39 0.111111 \n",
|
|
"1 0.0 0.212121 0.184236 0.81 0.222222 \n",
|
|
"2 1.0 0.202020 0.452694 0.06 0.222222 \n",
|
|
"3 1.0 0.232323 0.310569 0.77 0.333333 \n",
|
|
"4 1.0 0.313131 0.200027 0.40 0.444444 \n",
|
|
"5 1.0 0.222222 0.305305 0.76 0.555556 \n",
|
|
"6 1.0 0.353535 0.163180 0.06 0.111111 \n",
|
|
"7 1.0 0.232323 0.442166 0.94 0.111111 \n",
|
|
"8 0.0 0.646465 0.510596 0.03 0.222222 \n",
|
|
"9 1.0 0.303030 0.515860 0.72 0.555556 \n",
|
|
"\n",
|
|
" Work Experience Family Size \n",
|
|
"0 0.058824 0.375 \n",
|
|
"1 0.176471 0.250 \n",
|
|
"2 0.058824 0.000 \n",
|
|
"3 0.000000 0.125 \n",
|
|
"4 0.117647 0.625 \n",
|
|
"5 0.000000 0.125 \n",
|
|
"6 0.058824 0.250 \n",
|
|
"7 0.058824 0.250 \n",
|
|
"8 0.000000 0.250 \n",
|
|
"9 0.058824 0.375 \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Encode the two text columns as integers, drop the ID column, then min-max\n",
"# scale every remaining column to the [0, 1] range.\n",
"\n",
"# Changing words to numbers\n",
"\n",
"# Integer code per profession; 0 stands for a missing profession. The 'NaN'\n",
"# key is a literal string, so it never matches a real missing value (a float\n",
"# NaN) - those rows receive their 0 from fillna() below.\n",
"mapping = {'NaN' : 0, 'Healthcare' : 1, 'Engineer' : 2, 'Lawyer' : 3, 'Entertainment' : 4, 'Artist' : 5, 'Executive' : 6,\n",
"           'Doctor' : 7, 'Homemaker' : 8, 'Marketing' : 9}\n",
"\n",
"mapping2 = {'Male' : 0, 'Female' : 1}\n",
"\n",
"# Series.map() turns any label missing from its table into NaN without a\n",
"# warning, so reject unexpected labels up front.\n",
"for column, codes in (('Profession', mapping), ('Gender', mapping2)):\n",
"    unknown = set(data[column].dropna()) - set(codes)\n",
"    assert not unknown, f\"Unmapped {column} values: {sorted(unknown)}\"\n",
"\n",
"# map() produces numeric columns directly. DataFrame.replace() only did so via\n",
"# the silent object-dtype downcasting that pandas 2.2 deprecated.\n",
"dataF = data.drop(columns=['CustomerID']).assign(\n",
"    Gender=data['Gender'].map(mapping2),\n",
"    Profession=data['Profession'].map(mapping).fillna(0),\n",
")\n",
"\n",
"# Normalization\n",
"\n",
"# NOTE(review): min and max are taken over all rows of dataF, i.e. before the\n",
"# table is sliced into train/dev/test - confirm that this is intended.\n",
"normalized_dataF = (dataF - dataF.min())/(dataF.max() - dataF.min())\n",
"\n",
"print(normalized_dataF[:10])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 62,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Split by row position, in file order and without shuffling:\n",
"# rows 0-1599 -> train, rows 1600-1799 -> dev, every remaining row -> test.\n",
"train_data, dev_data, test_data = (\n",
"    normalized_dataF.iloc[first:last]\n",
"    for first, last in ((0, 1600), (1600, 1800), (1800, None))\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 64,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Wielkość zbioru Customers: 2000 elementów\n",
|
|
"Wielkość zbioru trenującego: 1600 elementów\n",
|
|
"Wielkość zbioru walidującego: 200 elementów\n",
|
|
"Wielkość zbioru testującego: 200 elementów\n",
|
|
" \n",
|
|
"Dane i wartości na temat zbioru: \n",
|
|
" \n",
|
|
" Gender Age Annual Income ($) Spending Score (1-100) \\\n",
|
|
"count 2000.000000 2000.000000 2000.000000 2000.000000 \n",
|
|
"mean 0.593000 0.494545 0.582879 0.509625 \n",
|
|
"std 0.491398 0.287169 0.240767 0.279347 \n",
|
|
"min 0.000000 0.000000 0.000000 0.000000 \n",
|
|
"25% 0.000000 0.252525 0.392538 0.280000 \n",
|
|
"50% 1.000000 0.484848 0.579263 0.500000 \n",
|
|
"75% 1.000000 0.737374 0.784806 0.750000 \n",
|
|
"max 1.000000 1.000000 1.000000 1.000000 \n",
|
|
"\n",
|
|
" Profession Work Experience Family Size \n",
|
|
"count 2000.000000 2000.000000 2000.000000 \n",
|
|
"mean 0.467167 0.241324 0.346062 \n",
|
|
"std 0.250289 0.230718 0.246344 \n",
|
|
"min 0.000000 0.000000 0.000000 \n",
|
|
"25% 0.222222 0.058824 0.125000 \n",
|
|
"50% 0.555556 0.176471 0.375000 \n",
|
|
"75% 0.555556 0.411765 0.500000 \n",
|
|
"max 1.000000 1.000000 1.000000 \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Report how many rows the full table and each of the three splits contain.\n",
"size_report = (\n",
"    f\"Wielkość zbioru Customers: {len(data)} elementów\",\n",
"    f\"Wielkość zbioru trenującego: {len(train_data)} elementów\",\n",
"    f\"Wielkość zbioru walidującego: {len(dev_data)} elementów\",\n",
"    f\"Wielkość zbioru testującego: {len(test_data)} elementów\",\n",
")\n",
"print(*size_report, sep=\"\\n\")\n",
"\n",
"# Per-column summary statistics of the scaled table.\n",
"scaled_stats = normalized_dataF.describe()\n",
"print(f\" \\nDane i wartości na temat zbioru: \\n \\n {scaled_stats}\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.6"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|