Added task 1 (zadanie1) to repository

This commit is contained in:
Jakub Henyk 2023-03-21 20:42:13 +01:00
parent 1d0b0ebd9c
commit bd84848bf6
4 changed files with 2191 additions and 4 deletions

2001
Customers.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
# ium_452627 # ium_452627
Rozwiązania zadań do przedmiotu Inżynieria Uczenia Maszynowego Rozwiązania zadań do przedmiotu: Inżynieria Uczenia Maszynowego

186
zadanie1.ipynb Normal file
View File

@ -0,0 +1,186 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" CustomerID Gender Age Annual Income ($) Spending Score (1-100) \\\n",
"0 1 Male 19 15000 39 \n",
"1 2 Male 21 35000 81 \n",
"2 3 Female 20 86000 6 \n",
"3 4 Female 23 59000 77 \n",
"4 5 Female 31 38000 40 \n",
"5 6 Female 22 58000 76 \n",
"6 7 Female 35 31000 6 \n",
"7 8 Female 23 84000 94 \n",
"8 9 Male 64 97000 3 \n",
"9 10 Female 30 98000 72 \n",
"\n",
" Profession Work Experience Family Size \n",
"0 Healthcare 1 4 \n",
"1 Engineer 3 3 \n",
"2 Engineer 1 1 \n",
"3 Lawyer 0 2 \n",
"4 Entertainment 2 6 \n",
"5 Artist 0 2 \n",
"6 Healthcare 1 3 \n",
"7 Healthcare 1 3 \n",
"8 Engineer 0 3 \n",
"9 Artist 1 4 \n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# Load the customer table from the CSV file and keep it untouched as `data`\n",
"data = pd.read_csv(\"Customers.csv\")\n",
"\n",
"# Preview the first ten rows\n",
"print(data.head(10))"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Gender Age Annual Income ($) Spending Score (1-100) Profession \\\n",
"0 0.0 0.191919 0.078958 0.39 0.111111 \n",
"1 0.0 0.212121 0.184236 0.81 0.222222 \n",
"2 1.0 0.202020 0.452694 0.06 0.222222 \n",
"3 1.0 0.232323 0.310569 0.77 0.333333 \n",
"4 1.0 0.313131 0.200027 0.40 0.444444 \n",
"5 1.0 0.222222 0.305305 0.76 0.555556 \n",
"6 1.0 0.353535 0.163180 0.06 0.111111 \n",
"7 1.0 0.232323 0.442166 0.94 0.111111 \n",
"8 0.0 0.646465 0.510596 0.03 0.222222 \n",
"9 1.0 0.303030 0.515860 0.72 0.555556 \n",
"\n",
" Work Experience Family Size \n",
"0 0.058824 0.375 \n",
"1 0.176471 0.250 \n",
"2 0.058824 0.000 \n",
"3 0.000000 0.125 \n",
"4 0.117647 0.625 \n",
"5 0.000000 0.125 \n",
"6 0.058824 0.250 \n",
"7 0.058824 0.250 \n",
"8 0.000000 0.250 \n",
"9 0.058824 0.375 \n"
]
}
],
"source": [
"# Encode the categorical columns as numbers and min-max scale every column.\n",
"\n",
"# Changing words to numbers\n",
"#\n",
"# Profession codes start at 1; code 0 is reserved for rows with no profession.\n",
"# Missing cells are NaN values in the frame (filled by fillna(0) below), so a\n",
"# 'NaN' string key in the mapping would never match anything and is not listed.\n",
"\n",
"mapping = {'Healthcare' : 1, 'Engineer' : 2, 'Lawyer' : 3, 'Entertainment' : 4, 'Artist' : 5, 'Executive' : 6,\n",
"           'Doctor' : 7, 'Homemaker' : 8, 'Marketing' : 9}\n",
"\n",
"mapping2 = {'Male' : 0, 'Female' : 1}\n",
"\n",
"# Series.map turns any label that is absent from the mapping into NaN, so check\n",
"# for unknown labels first instead of letting them be silently encoded.\n",
"unknown_professions = set(data['Profession'].dropna()) - set(mapping)\n",
"assert not unknown_professions, f'Unmapped Profession values: {unknown_professions}'\n",
"unknown_genders = set(data['Gender'].dropna()) - set(mapping2)\n",
"assert not unknown_genders, f'Unmapped Gender values: {unknown_genders}'\n",
"\n",
"# Build the encoded frame from `data` without modifying it. CustomerID is\n",
"# dropped before scaling; the remaining columns keep their original order.\n",
"dataF = data.drop(columns=['CustomerID']).assign(\n",
"    Gender=data['Gender'].map(mapping2),\n",
"    Profession=data['Profession'].map(mapping).fillna(0),\n",
")\n",
"\n",
"# Normalization\n",
"#\n",
"# Min-max scaling per column: (x - min) / (max - min). A constant column has a\n",
"# zero range; dividing by 1 instead maps it to 0.0 rather than NaN.\n",
"# NOTE(review): min and max are taken over the whole dataset, before the\n",
"# train/dev/test split, so dev/test rows influence the scaling - confirm this\n",
"# is acceptable for the downstream use.\n",
"\n",
"column_min = dataF.min()\n",
"column_range = (dataF.max() - column_min).replace(0, 1)\n",
"normalized_dataF = (dataF - column_min) / column_range\n",
"\n",
"print(normalized_dataF[:10])"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"# Split the normalized data into train / dev / test subsets (80% / 10% / 10%).\n",
"#\n",
"# The rows are shuffled once with a fixed seed so the split does not depend on\n",
"# the row order of the CSV file and is reproducible on every run. The cut points\n",
"# are derived from the row count instead of being hard-coded; for a 2000-row\n",
"# frame they give 1600 / 200 / 200 rows.\n",
"shuffled_dataF = normalized_dataF.sample(frac=1, random_state=42)\n",
"\n",
"n_rows = len(shuffled_dataF)\n",
"train_end = n_rows * 8 // 10\n",
"dev_end = n_rows * 9 // 10\n",
"\n",
"train_data = shuffled_dataF.iloc[:train_end]\n",
"dev_data = shuffled_dataF.iloc[train_end:dev_end]\n",
"test_data = shuffled_dataF.iloc[dev_end:]\n",
"\n",
"# Every row must land in exactly one subset\n",
"assert len(train_data) + len(dev_data) + len(test_data) == n_rows"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wielkość zbioru Customers: 2000 elementów\n",
"Wielkość zbioru trenującego: 1600 elementów\n",
"Wielkość zbioru walidującego: 200 elementów\n",
"Wielkość zbioru testującego: 200 elementów\n",
" \n",
"Dane i wartości na temat zbioru: \n",
" \n",
" Gender Age Annual Income ($) Spending Score (1-100) \\\n",
"count 2000.000000 2000.000000 2000.000000 2000.000000 \n",
"mean 0.593000 0.494545 0.582879 0.509625 \n",
"std 0.491398 0.287169 0.240767 0.279347 \n",
"min 0.000000 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.252525 0.392538 0.280000 \n",
"50% 1.000000 0.484848 0.579263 0.500000 \n",
"75% 1.000000 0.737374 0.784806 0.750000 \n",
"max 1.000000 1.000000 1.000000 1.000000 \n",
"\n",
" Profession Work Experience Family Size \n",
"count 2000.000000 2000.000000 2000.000000 \n",
"mean 0.467167 0.241324 0.346062 \n",
"std 0.250289 0.230718 0.246344 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 0.222222 0.058824 0.125000 \n",
"50% 0.555556 0.176471 0.375000 \n",
"75% 0.555556 0.411765 0.500000 \n",
"max 1.000000 1.000000 1.000000 \n"
]
}
],
"source": [
"# Row counts of the full dataset and of each split\n",
"total_rows = len(data)\n",
"train_rows = len(train_data)\n",
"dev_rows = len(dev_data)\n",
"test_rows = len(test_data)\n",
"\n",
"print(f\"Wielkość zbioru Customers: {total_rows} elementów\")\n",
"print(f\"Wielkość zbioru trenującego: {train_rows} elementów\")\n",
"print(f\"Wielkość zbioru walidującego: {dev_rows} elementów\")\n",
"print(f\"Wielkość zbioru testującego: {test_rows} elementów\")\n",
"\n",
"# Per-column summary statistics of the normalized frame\n",
"summary = normalized_dataF.describe()\n",
"print(f\" \\nDane i wartości na temat zbioru: \\n \\n {summary}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}