{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "74524ede",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " age gender height_cm weight_kg body fat_% diastolic systolic \\\n",
      "0 27.0 M 172.3 75.24 21.3 80.0 130.0 \n",
      "1 25.0 M 165.0 55.80 15.7 77.0 126.0 \n",
      "2 31.0 M 179.6 78.00 20.1 92.0 152.0 \n",
      "3 32.0 M 174.5 71.10 18.4 76.0 147.0 \n",
      "4 28.0 M 173.8 67.70 17.1 70.0 127.0 \n",
      "\n",
      " gripForce sit and bend forward_cm sit-ups counts broad jump_cm class \\\n",
      "0 54.9 18.4 60.0 217.0 C \n",
      "1 36.4 16.3 53.0 229.0 A \n",
      "2 44.8 12.0 49.0 181.0 C \n",
      "3 41.4 15.2 53.0 219.0 B \n",
      "4 43.5 27.1 45.0 217.0 B \n",
      "\n",
      " BMI \n",
      "0 25.344179 \n",
      "1 20.495868 \n",
      "2 24.181428 \n",
      "3 23.349562 \n",
      "4 22.412439 \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import plotly.express as px\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# Plain relative path (no OS-specific separator) so the CSV loads on any platform.\n",
    "df = pd.read_csv('body_performance.csv')\n",
    "\n",
    "# BMI = kg / m^2; the 0.0001 factor converts height_cm squared from cm^2 to m^2.\n",
    "df['BMI'] = df['weight_kg']/(0.0001*df['height_cm']*df['height_cm'])\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0177f243",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the number of fully duplicated rows (a bare mid-cell expression is never displayed).\n",
    "print(f'duplicate rows:{df.duplicated().sum()}')\n",
    "print(f'with duplicates:{df.shape}')\n",
    "# Rebind to a new frame rather than mutating the loaded one in place.\n",
    "df = df.drop_duplicates()\n",
    "print(f'without duplicates:{df.shape}')\n",
    "# Independent copy of the de-duplicated frame.\n",
    "df_copy = df.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "05f9442a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of elements in data frame: 13393\n",
      "train: 10715\n",
      "test: 1339\n",
      "valid: 1339\n"
     ]
    }
   ],
   "source": [
    "# Roughly 80/10/10 split: hold out 20% of the rows, then halve the hold-out\n",
    "# into test and validation sets. Sizes are integer row counts taken with len(),\n",
    "# so they do not depend on how many 'age' values happen to be non-null.\n",
    "n_holdout = int(len(df)*0.2)\n",
    "body_train, body_holdout = train_test_split(df, test_size=n_holdout, random_state=1)\n",
    "body_test, body_valid = train_test_split(body_holdout, test_size=int(len(body_holdout)*0.5), random_state=1)\n",
    "\n",
    "print(\"number of elements in data frame: {}\".format(len(df)))\n",
    "print(\"train: {}\".format(len(body_train)))\n",
    "print(\"test: {}\".format(len(body_test)))\n",
    "print(\"valid: {}\".format(len(body_valid)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "0f3ad57a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " age gender height_cm weight_kg body fat_% \\\n",
      "count 13393.000000 13393 13393.000000 13393.000000 13393.000000 \n",
      "unique NaN 2 NaN NaN NaN \n",
      "top NaN M NaN NaN NaN \n",
      "freq NaN 8467 NaN NaN NaN \n",
      "mean 36.775106 NaN 168.559807 67.447316 23.240165 \n",
      "std 13.625639 NaN 8.426583 11.949666 7.256844 \n",
      "min 21.000000 NaN 125.000000 26.300000 3.000000 \n",
      "25% 25.000000 NaN 162.400000 58.200000 18.000000 \n",
      "50% 32.000000 NaN 169.200000 67.400000 22.800000 \n",
      "75% 48.000000 NaN 174.800000 75.300000 28.000000 \n",
      "max 64.000000 NaN 193.800000 138.100000 78.400000 \n",
      "\n",
      " diastolic systolic gripForce sit and bend forward_cm \\\n",
      "count 13393.000000 13393.000000 13393.000000 13393.000000 \n",
      "unique NaN NaN NaN NaN \n",
      "top NaN NaN NaN NaN \n",
      "freq NaN NaN NaN NaN \n",
      "mean 78.796842 130.234817 36.963877 15.209268 \n",
      "std 10.742033 14.713954 10.624864 8.456677 \n",
      "min 0.000000 0.000000 0.000000 -25.000000 \n",
      "25% 71.000000 120.000000 27.500000 10.900000 \n",
      "50% 79.000000 130.000000 37.900000 16.200000 \n",
      "75% 86.000000 141.000000 45.200000 20.700000 \n",
      "max 156.200000 201.000000 70.500000 213.000000 \n",
      "\n",
      " sit-ups counts broad jump_cm class BMI \n",
      "count 13393.000000 13393.000000 13393 13393.000000 \n",
      "unique NaN NaN 4 NaN \n",
      "top NaN NaN C NaN \n",
      "freq NaN NaN 3349 NaN \n",
      "mean 39.771224 190.129627 NaN 23.606014 \n",
      "std 14.276698 39.868000 NaN 2.940936 \n",
      "min 0.000000 0.000000 NaN 11.103976 \n",
      "25% 30.000000 162.000000 NaN 21.612812 \n",
      "50% 41.000000 193.000000 NaN 23.463513 \n",
      "75% 50.000000 221.000000 NaN 25.341367 \n",
      "max 80.000000 303.000000 NaN 42.906509 \n"
     ]
    }
   ],
   "source": [
    "# NOTE: 'sit and bend forward_cm' has negative values (see its min row)!\n",
    "print(df.describe(include='all'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "dacdd816",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | age | \n", "gender | \n", "height_cm | \n", "weight_kg | \n", "body fat_% | \n", "diastolic | \n", "systolic | \n", "gripForce | \n", "sit and bend forward_cm | \n", "sit-ups counts | \n", "broad jump_cm | \n", "class | \n", "BMI | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | \n", "13393.000000 | \n", "13393 | \n", "13393.000000 | \n", "13393.000000 | \n", "13393.000000 | \n", "13393.000000 | \n", "13393.000000 | \n", "13393.000000 | \n", "13393.000000 | \n", "13393.000000 | \n", "13393.000000 | \n", "13393 | \n", "13393.000000 | \n", "
unique | \n", "NaN | \n", "2 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "4 | \n", "NaN | \n", "
top | \n", "NaN | \n", "M | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "C | \n", "NaN | \n", "
freq | \n", "NaN | \n", "8467 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "3349 | \n", "NaN | \n", "
mean | \n", "0.366863 | \n", "NaN | \n", "0.633137 | \n", "0.368044 | \n", "0.268437 | \n", "0.504461 | \n", "0.647934 | \n", "0.524310 | \n", "-0.662107 | \n", "0.497140 | \n", "0.627491 | \n", "NaN | \n", "0.393115 | \n", "
std | \n", "0.316875 | \n", "NaN | \n", "0.122479 | \n", "0.106884 | \n", "0.096245 | \n", "0.068771 | \n", "0.073204 | \n", "0.150707 | \n", "0.071065 | \n", "0.178459 | \n", "0.131578 | \n", "NaN | \n", "0.092475 | \n", "
min | \n", "0.000000 | \n", "NaN | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "-1.000000 | \n", "0.000000 | \n", "0.000000 | \n", "NaN | \n", "0.000000 | \n", "
25% | \n", "0.093023 | \n", "NaN | \n", "0.543605 | \n", "0.285331 | \n", "0.198939 | \n", "0.454545 | \n", "0.597015 | \n", "0.390071 | \n", "-0.698319 | \n", "0.375000 | \n", "0.534653 | \n", "NaN | \n", "0.330440 | \n", "
50% | \n", "0.255814 | \n", "NaN | \n", "0.642442 | \n", "0.367621 | \n", "0.262599 | \n", "0.505762 | \n", "0.646766 | \n", "0.537589 | \n", "-0.653782 | \n", "0.512500 | \n", "0.636964 | \n", "NaN | \n", "0.388634 | \n", "
75% | \n", "0.627907 | \n", "NaN | \n", "0.723837 | \n", "0.438283 | \n", "0.331565 | \n", "0.550576 | \n", "0.701493 | \n", "0.641135 | \n", "-0.615966 | \n", "0.625000 | \n", "0.729373 | \n", "NaN | \n", "0.447681 | \n", "
max | \n", "1.000000 | \n", "NaN | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "NaN | \n", "1.000000 | \n", "