ium_s487178/.ipynb_checkpoints/body_performance-checkpoint.ipynb
2023-04-05 14:40:41 +02:00

651 lines
22 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 14,
"id": "74524ede",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" age gender height_cm weight_kg body fat_% diastolic systolic \\\n",
"0 27.0 M 172.3 75.24 21.3 80.0 130.0 \n",
"1 25.0 M 165.0 55.80 15.7 77.0 126.0 \n",
"2 31.0 M 179.6 78.00 20.1 92.0 152.0 \n",
"3 32.0 M 174.5 71.10 18.4 76.0 147.0 \n",
"4 28.0 M 173.8 67.70 17.1 70.0 127.0 \n",
"\n",
" gripForce sit and bend forward_cm sit-ups counts broad jump_cm class \\\n",
"0 54.9 18.4 60.0 217.0 C \n",
"1 36.4 16.3 53.0 229.0 A \n",
"2 44.8 12.0 49.0 181.0 C \n",
"3 41.4 15.2 53.0 219.0 B \n",
"4 43.5 27.1 45.0 217.0 B \n",
"\n",
" BMI \n",
"0 25.344179 \n",
"1 20.495868 \n",
"2 24.181428 \n",
"3 23.349562 \n",
"4 22.412439 \n"
]
}
],
"source": [
"import pandas as pd\n",
"import plotly.express as px\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# Load the raw measurements from the notebook's working directory. A bare\n",
"# relative file name resolves on every OS, unlike a backslash-separated path.\n",
"df = pd.read_csv('body_performance.csv')\n",
"\n",
"# BMI = weight [kg] / height [m]^2; the 0.0001 factor converts cm^2 to m^2.\n",
"df['BMI'] = df['weight_kg']/(0.0001*df['height_cm']*df['height_cm'])\n",
"print(df.head())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0177f243",
"metadata": {},
"outputs": [],
"source": [
"# Report how many fully duplicated rows exist; as a bare mid-cell expression\n",
"# this value was computed but never shown.\n",
"print(f'duplicated rows:{df.duplicated().sum()}')\n",
"print(f'with duplicates:{df.shape}')\n",
"# Drop exact duplicate rows, rebinding df rather than mutating it in place.\n",
"df = df.drop_duplicates()\n",
"print(f'without duplicates:{df.shape}')\n",
"# Snapshot of the de-duplicated frame, taken before any later transformation.\n",
"df_copy = df.copy()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "05f9442a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of elements in data frame: 13393\n",
"train: 10715\n",
"test: 1339\n",
"valid: 1339\n"
]
}
],
"source": [
"# Hold out 20% of the rows, then halve the hold-out into test and validation parts.\n",
"# Sizes come from len() (total rows) rather than a per-column non-null count,\n",
"# so a missing value in 'age' cannot skew the split sizes or the report below.\n",
"holdout_size = int(len(df)*0.2)\n",
"body_train, body_holdout = train_test_split(df, test_size=holdout_size, random_state=1)\n",
"body_test, body_valid = train_test_split(body_holdout, test_size=int(len(body_holdout)*0.5), random_state=1)\n",
"\n",
"# The three parts must add up to the full frame.\n",
"assert len(body_train) + len(body_test) + len(body_valid) == len(df)\n",
"\n",
"print(\"number of elements in data frame: {}\".format(len(df)))\n",
"print(\"train: {}\".format(len(body_train)))\n",
"print(\"test: {}\".format(len(body_test)))\n",
"print(\"valid: {}\".format(len(body_valid)))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "0f3ad57a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" age gender height_cm weight_kg body fat_% \\\n",
"count 13393.000000 13393 13393.000000 13393.000000 13393.000000 \n",
"unique NaN 2 NaN NaN NaN \n",
"top NaN M NaN NaN NaN \n",
"freq NaN 8467 NaN NaN NaN \n",
"mean 36.775106 NaN 168.559807 67.447316 23.240165 \n",
"std 13.625639 NaN 8.426583 11.949666 7.256844 \n",
"min 21.000000 NaN 125.000000 26.300000 3.000000 \n",
"25% 25.000000 NaN 162.400000 58.200000 18.000000 \n",
"50% 32.000000 NaN 169.200000 67.400000 22.800000 \n",
"75% 48.000000 NaN 174.800000 75.300000 28.000000 \n",
"max 64.000000 NaN 193.800000 138.100000 78.400000 \n",
"\n",
" diastolic systolic gripForce sit and bend forward_cm \\\n",
"count 13393.000000 13393.000000 13393.000000 13393.000000 \n",
"unique NaN NaN NaN NaN \n",
"top NaN NaN NaN NaN \n",
"freq NaN NaN NaN NaN \n",
"mean 78.796842 130.234817 36.963877 15.209268 \n",
"std 10.742033 14.713954 10.624864 8.456677 \n",
"min 0.000000 0.000000 0.000000 -25.000000 \n",
"25% 71.000000 120.000000 27.500000 10.900000 \n",
"50% 79.000000 130.000000 37.900000 16.200000 \n",
"75% 86.000000 141.000000 45.200000 20.700000 \n",
"max 156.200000 201.000000 70.500000 213.000000 \n",
"\n",
" sit-ups counts broad jump_cm class BMI \n",
"count 13393.000000 13393.000000 13393 13393.000000 \n",
"unique NaN NaN 4 NaN \n",
"top NaN NaN C NaN \n",
"freq NaN NaN 3349 NaN \n",
"mean 39.771224 190.129627 NaN 23.606014 \n",
"std 14.276698 39.868000 NaN 2.940936 \n",
"min 0.000000 0.000000 NaN 11.103976 \n",
"25% 30.000000 162.000000 NaN 21.612812 \n",
"50% 41.000000 193.000000 NaN 23.463513 \n",
"75% 50.000000 221.000000 NaN 25.341367 \n",
"max 80.000000 303.000000 NaN 42.906509 \n"
]
}
],
"source": [
"# Summary statistics for every column (include='all' also covers the non-numeric ones).\n",
"# Note: 'sit and bend forward_cm' takes negative values (see its min)!\n",
"summary = df.describe(include='all')\n",
"print(summary)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "dacdd816",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>gender</th>\n",
" <th>height_cm</th>\n",
" <th>weight_kg</th>\n",
" <th>body fat_%</th>\n",
" <th>diastolic</th>\n",
" <th>systolic</th>\n",
" <th>gripForce</th>\n",
" <th>sit and bend forward_cm</th>\n",
" <th>sit-ups counts</th>\n",
" <th>broad jump_cm</th>\n",
" <th>class</th>\n",
" <th>BMI</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>13393.000000</td>\n",
" <td>13393</td>\n",
" <td>13393.000000</td>\n",
" <td>13393.000000</td>\n",
" <td>13393.000000</td>\n",
" <td>13393.000000</td>\n",
" <td>13393.000000</td>\n",
" <td>13393.000000</td>\n",
" <td>13393.000000</td>\n",
" <td>13393.000000</td>\n",
" <td>13393.000000</td>\n",
" <td>13393</td>\n",
" <td>13393.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>C</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>8467</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3349</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.366863</td>\n",
" <td>NaN</td>\n",
" <td>0.633137</td>\n",
" <td>0.368044</td>\n",
" <td>0.268437</td>\n",
" <td>0.504461</td>\n",
" <td>0.647934</td>\n",
" <td>0.524310</td>\n",
" <td>-0.662107</td>\n",
" <td>0.497140</td>\n",
" <td>0.627491</td>\n",
" <td>NaN</td>\n",
" <td>0.393115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.316875</td>\n",
" <td>NaN</td>\n",
" <td>0.122479</td>\n",
" <td>0.106884</td>\n",
" <td>0.096245</td>\n",
" <td>0.068771</td>\n",
" <td>0.073204</td>\n",
" <td>0.150707</td>\n",
" <td>0.071065</td>\n",
" <td>0.178459</td>\n",
" <td>0.131578</td>\n",
" <td>NaN</td>\n",
" <td>0.092475</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>-1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.093023</td>\n",
" <td>NaN</td>\n",
" <td>0.543605</td>\n",
" <td>0.285331</td>\n",
" <td>0.198939</td>\n",
" <td>0.454545</td>\n",
" <td>0.597015</td>\n",
" <td>0.390071</td>\n",
" <td>-0.698319</td>\n",
" <td>0.375000</td>\n",
" <td>0.534653</td>\n",
" <td>NaN</td>\n",
" <td>0.330440</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.255814</td>\n",
" <td>NaN</td>\n",
" <td>0.642442</td>\n",
" <td>0.367621</td>\n",
" <td>0.262599</td>\n",
" <td>0.505762</td>\n",
" <td>0.646766</td>\n",
" <td>0.537589</td>\n",
" <td>-0.653782</td>\n",
" <td>0.512500</td>\n",
" <td>0.636964</td>\n",
" <td>NaN</td>\n",
" <td>0.388634</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>0.627907</td>\n",
" <td>NaN</td>\n",
" <td>0.723837</td>\n",
" <td>0.438283</td>\n",
" <td>0.331565</td>\n",
" <td>0.550576</td>\n",
" <td>0.701493</td>\n",
" <td>0.641135</td>\n",
" <td>-0.615966</td>\n",
" <td>0.625000</td>\n",
" <td>0.729373</td>\n",
" <td>NaN</td>\n",
" <td>0.447681</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age gender height_cm weight_kg body fat_% \\\n",
"count 13393.000000 13393 13393.000000 13393.000000 13393.000000 \n",
"unique NaN 2 NaN NaN NaN \n",
"top NaN M NaN NaN NaN \n",
"freq NaN 8467 NaN NaN NaN \n",
"mean 0.366863 NaN 0.633137 0.368044 0.268437 \n",
"std 0.316875 NaN 0.122479 0.106884 0.096245 \n",
"min 0.000000 NaN 0.000000 0.000000 0.000000 \n",
"25% 0.093023 NaN 0.543605 0.285331 0.198939 \n",
"50% 0.255814 NaN 0.642442 0.367621 0.262599 \n",
"75% 0.627907 NaN 0.723837 0.438283 0.331565 \n",
"max 1.000000 NaN 1.000000 1.000000 1.000000 \n",
"\n",
" diastolic systolic gripForce sit and bend forward_cm \\\n",
"count 13393.000000 13393.000000 13393.000000 13393.000000 \n",
"unique NaN NaN NaN NaN \n",
"top NaN NaN NaN NaN \n",
"freq NaN NaN NaN NaN \n",
"mean 0.504461 0.647934 0.524310 -0.662107 \n",
"std 0.068771 0.073204 0.150707 0.071065 \n",
"min 0.000000 0.000000 0.000000 -1.000000 \n",
"25% 0.454545 0.597015 0.390071 -0.698319 \n",
"50% 0.505762 0.646766 0.537589 -0.653782 \n",
"75% 0.550576 0.701493 0.641135 -0.615966 \n",
"max 1.000000 1.000000 1.000000 1.000000 \n",
"\n",
" sit-ups counts broad jump_cm class BMI \n",
"count 13393.000000 13393.000000 13393 13393.000000 \n",
"unique NaN NaN 4 NaN \n",
"top NaN NaN C NaN \n",
"freq NaN NaN 3349 NaN \n",
"mean 0.497140 0.627491 NaN 0.393115 \n",
"std 0.178459 0.131578 NaN 0.092475 \n",
"min 0.000000 0.000000 NaN 0.000000 \n",
"25% 0.375000 0.534653 NaN 0.330440 \n",
"50% 0.512500 0.636964 NaN 0.388634 \n",
"75% 0.625000 0.729373 NaN 0.447681 \n",
"max 1.000000 1.000000 NaN 1.000000 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE(review): both scalers are fit on the full frame, and this cell sits after the\n",
"# train/test/valid split cell - confirm whether the splits should be scaled as well.\n",
"\n",
"# Columns rescaled to MinMaxScaler's default output range.\n",
"unit_range_cols = [\n",
"    'age', 'height_cm', 'weight_kg', 'body fat_%',\n",
"    'diastolic', 'systolic', 'gripForce', 'sit-ups counts',\n",
"    'broad jump_cm', 'BMI',\n",
"]\n",
"scaler = MinMaxScaler()\n",
"df[unit_range_cols] = scaler.fit_transform(df[unit_range_cols])\n",
"\n",
"# This column is mapped onto (-1, 1) instead, using a separately fitted scaler.\n",
"scaler = MinMaxScaler(feature_range=(-1, 1))\n",
"flexibility = df[['sit and bend forward_cm']]\n",
"df['sit and bend forward_cm'] = scaler.fit_transform(flexibility)\n",
"\n",
"# Final expression of the cell: summary of the rescaled frame.\n",
"df.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "5cd376cf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 13393 entries, 0 to 13392\n",
"Data columns (total 13 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 age 13393 non-null float64\n",
" 1 gender 13393 non-null object \n",
" 2 height_cm 13393 non-null float64\n",
" 3 weight_kg 13393 non-null float64\n",
" 4 body fat_% 13393 non-null float64\n",
" 5 diastolic 13393 non-null float64\n",
" 6 systolic 13393 non-null float64\n",
" 7 gripForce 13393 non-null float64\n",
" 8 sit and bend forward_cm 13393 non-null float64\n",
" 9 sit-ups counts 13393 non-null float64\n",
" 10 broad jump_cm 13393 non-null float64\n",
" 11 class 13393 non-null object \n",
" 12 BMI 13393 non-null float64\n",
"dtypes: float64(11), object(2)\n",
"memory usage: 1.3+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "93dcf330",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Each class in data frame: \n",
"C 3349\n",
"D 3349\n",
"A 3348\n",
"B 3347\n",
"Name: class, dtype: int64\n",
"Each class in train data: \n",
"A 2703\n",
"B 2681\n",
"C 2671\n",
"D 2660\n",
"Name: class, dtype: int64\n",
"Each class in test data: \n",
"D 353\n",
"C 332\n",
"B 328\n",
"A 326\n",
"Name: class, dtype: int64\n",
"Each class in valid data: \n",
"C 346\n",
"B 338\n",
"D 336\n",
"A 319\n",
"Name: class, dtype: int64\n"
]
}
],
"source": [
"# Print the class distribution of the full frame and of each split.\n",
"class_reports = (\n",
"    ('Each class in data frame: \\n{}', df),\n",
"    ('Each class in train data: \\n{}', body_train),\n",
"    ('Each class in test data: \\n{}', body_test),\n",
"    ('Each class in valid data: \\n{}', body_valid),\n",
")\n",
"for template, frame in class_reports:\n",
"    print(template.format(frame['class'].value_counts()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5620509",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e9bbbe7",
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4857a167",
"metadata": {},
"outputs": [],
"source": [
"#df[\"class\"].value_counts().plot(kind=\"bar\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "779157c0",
"metadata": {},
"outputs": [],
"source": [
"#df[[\"class\",\"body fat_%\"]].groupby(\"class\").mean().plot(kind=\"bar\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "da14bf43",
"metadata": {},
"outputs": [],
"source": [
"#sns.set_theme()\n",
"\n",
"#sns.relplot(data = df.head(200), x = 'broad jump_cm', y = 'sit-ups counts', hue = 'class')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6597e57c",
"metadata": {},
"outputs": [],
"source": [
"#sns.relplot(data = df[df['gender'] == 'M'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "957e1b2e",
"metadata": {},
"outputs": [],
"source": [
"#sns.relplot(data = df[df['gender'] == 'F'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f0394f0",
"metadata": {},
"outputs": [],
"source": [
"#px.box(df, y=['height_cm',\n",
"# 'weight_kg',\n",
"# 'body fat_%',\n",
"# 'diastolic',\n",
"# 'systolic',\n",
"# 'gripForce',\n",
"# 'sit and bend forward_cm',\n",
"# 'sit-ups counts',\n",
"# 'broad jump_cm',\n",
"# 'BMI'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22542bba",
"metadata": {},
"outputs": [],
"source": [
"# this takes too long to run\n",
"#sns.pairplot(data=df.drop(columns=[\"gender\"]).head(500), hue=\"class\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29730d20",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc21a9cb",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}