{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Customers dataset: encoding, normalization and train/dev/test split\n",
    "\n",
    "Loads `Customers.csv`, encodes the categorical columns as integers, min-max scales every column to [0, 1] and splits the rows sequentially into training (80%), validation (10%) and test (remaining rows) sets."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " CustomerID Gender Age Annual Income ($) Spending Score (1-100) \\\n",
      "0 1 Male 19 15000 39 \n",
      "1 2 Male 21 35000 81 \n",
      "2 3 Female 20 86000 6 \n",
      "3 4 Female 23 59000 77 \n",
      "4 5 Female 31 38000 40 \n",
      "5 6 Female 22 58000 76 \n",
      "6 7 Female 35 31000 6 \n",
      "7 8 Female 23 84000 94 \n",
      "8 9 Male 64 97000 3 \n",
      "9 10 Female 30 98000 72 \n",
      "\n",
      " Profession Work Experience Family Size \n",
      "0 Healthcare 1 4 \n",
      "1 Engineer 3 3 \n",
      "2 Engineer 1 1 \n",
      "3 Lawyer 0 2 \n",
      "4 Entertainment 2 6 \n",
      "5 Artist 0 2 \n",
      "6 Healthcare 1 3 \n",
      "7 Healthcare 1 3 \n",
      "8 Engineer 0 3 \n",
      "9 Artist 1 4 \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Raw customer table. Later cells derive new frames from it and never modify it,\n",
    "# so every cell can be re-run on its own.\n",
    "data = pd.read_csv(\"Customers.csv\")\n",
    "print(data.head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Encode categorical columns and normalize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Gender Age Annual Income ($) Spending Score (1-100) Profession \\\n",
      "0 0.0 0.191919 0.078958 0.39 0.111111 \n",
      "1 0.0 0.212121 0.184236 0.81 0.222222 \n",
      "2 1.0 0.202020 0.452694 0.06 0.222222 \n",
      "3 1.0 0.232323 0.310569 0.77 0.333333 \n",
      "4 1.0 0.313131 0.200027 0.40 0.444444 \n",
      "5 1.0 0.222222 0.305305 0.76 0.555556 \n",
      "6 1.0 0.353535 0.163180 0.06 0.111111 \n",
      "7 1.0 0.232323 0.442166 0.94 0.111111 \n",
      "8 0.0 0.646465 0.510596 0.03 0.222222 \n",
      "9 1.0 0.303030 0.515860 0.72 0.555556 \n",
      "\n",
      " Work Experience Family Size \n",
      "0 0.058824 0.375 \n",
      "1 0.176471 0.250 \n",
      "2 0.058824 0.000 \n",
      "3 0.000000 0.125 \n",
      "4 0.117647 0.625 \n",
      "5 0.000000 0.125 \n",
      "6 0.058824 0.250 \n",
      "7 0.058824 0.250 \n",
      "8 0.000000 0.250 \n",
      "9 0.058824 0.375 \n"
     ]
    }
   ],
   "source": [
    "# Integer codes for the categorical columns.\n",
    "# Missing professions are deliberately absent from the mapping: they are NaN values,\n",
    "# not the string 'NaN', so a 'NaN' key could never match. They get code 0 via fillna.\n",
    "mapping = {'Healthcare': 1, 'Engineer': 2, 'Lawyer': 3, 'Entertainment': 4, 'Artist': 5,\n",
    "           'Executive': 6, 'Doctor': 7, 'Homemaker': 8, 'Marketing': 9}\n",
    "\n",
    "mapping2 = {'Male': 0, 'Female': 1}\n",
    "\n",
    "# Fail loudly on labels the mappings do not cover instead of encoding them silently.\n",
    "unknown_professions = set(data['Profession'].dropna()) - set(mapping)\n",
    "assert not unknown_professions, f'Unmapped Profession values: {unknown_professions}'\n",
    "unknown_genders = set(data['Gender'].dropna()) - set(mapping2)\n",
    "assert not unknown_genders, f'Unmapped Gender values: {unknown_genders}'\n",
    "\n",
    "# Drop the identifier and encode the categorical columns in one chain.\n",
    "dataF = (\n",
    "    data\n",
    "    .drop(columns=['CustomerID'])\n",
    "    .assign(\n",
    "        Profession=lambda df: df['Profession'].map(mapping).fillna(0),\n",
    "        Gender=lambda df: df['Gender'].map(mapping2),\n",
    "    )\n",
    ")\n",
    "\n",
    "# Min-max scale every column to the [0, 1] range.\n",
    "# NOTE(review): min/max are taken over the full dataset, so rows that later land in the\n",
    "# dev/test splits influence the scaling of the training rows - confirm this is intended.\n",
    "column_min = dataF.min()\n",
    "column_range = dataF.max() - column_min\n",
    "normalized_dataF = (dataF - column_min) / column_range\n",
    "\n",
    "print(normalized_dataF.head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train / dev / test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Share of rows (in percent) for the first two splits; the test set takes the remainder.\n",
    "# Integer arithmetic keeps the boundaries exact (1600 and 1800 for 2000 rows).\n",
    "TRAIN_PCT = 80\n",
    "DEV_PCT = 10\n",
    "\n",
    "n_rows = len(normalized_dataF)\n",
    "train_end = n_rows * TRAIN_PCT // 100\n",
    "dev_end = train_end + n_rows * DEV_PCT // 100\n",
    "\n",
    "# NOTE(review): rows are split in file order without shuffling - confirm that the file\n",
    "# order carries no structure that would bias the splits.\n",
    "train_data = normalized_dataF.iloc[:train_end]\n",
    "dev_data = normalized_dataF.iloc[train_end:dev_end]\n",
    "test_data = normalized_dataF.iloc[dev_end:]\n",
    "\n",
    "# The three splits must cover every row exactly once.\n",
    "assert len(train_data) + len(dev_data) + len(test_data) == n_rows"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Split sizes and summary statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wielkość zbioru Customers: 2000 elementów\n",
      "Wielkość zbioru trenującego: 1600 elementów\n",
      "Wielkość zbioru walidującego: 200 elementów\n",
      "Wielkość zbioru testującego: 200 elementów\n",
      " \n",
      "Dane i wartości na temat zbioru: \n",
      " \n",
      " Gender Age Annual Income ($) Spending Score (1-100) \\\n",
      "count 2000.000000 2000.000000 2000.000000 2000.000000 \n",
      "mean 0.593000 0.494545 0.582879 0.509625 \n",
      "std 0.491398 0.287169 0.240767 0.279347 \n",
      "min 0.000000 0.000000 0.000000 0.000000 \n",
      "25% 0.000000 0.252525 0.392538 0.280000 \n",
      "50% 1.000000 0.484848 0.579263 0.500000 \n",
      "75% 1.000000 0.737374 0.784806 0.750000 \n",
      "max 1.000000 1.000000 1.000000 1.000000 \n",
      "\n",
      " Profession Work Experience Family Size \n",
      "count 2000.000000 2000.000000 2000.000000 \n",
      "mean 0.467167 0.241324 0.346062 \n",
      "std 0.250289 0.230718 0.246344 \n",
      "min 0.000000 0.000000 0.000000 \n",
      "25% 0.222222 0.058824 0.125000 \n",
      "50% 0.555556 0.176471 0.375000 \n",
      "75% 0.555556 0.411765 0.500000 \n",
      "max 1.000000 1.000000 1.000000 \n"
     ]
    }
   ],
   "source": [
    "# Row counts of the full dataset and of each split (messages are user-facing, in Polish).\n",
    "print(f\"Wielkość zbioru Customers: {len(data)} elementów\")\n",
    "print(f\"Wielkość zbioru trenującego: {len(train_data)} elementów\")\n",
    "print(f\"Wielkość zbioru walidującego: {len(dev_data)} elementów\")\n",
    "print(f\"Wielkość zbioru testującego: {len(test_data)} elementów\")\n",
    "\n",
    "# Descriptive statistics of the normalized frame (all rows, before splitting).\n",
    "print(f\" \\nDane i wartości na temat zbioru: \\n \\n {normalized_dataF.describe()}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}