ium_464937/02.ipynb

554 lines
43 KiB
Plaintext
Raw Normal View History

2024-03-18 00:22:51 +01:00
{
"cells": [
{
"cell_type": "markdown",
"source": [
"## 1. Pobranie zbioru"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 34,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: kaggle in ./venv/lib/python3.11/site-packages (1.6.6)\r\n",
"Requirement already satisfied: six>=1.10 in ./venv/lib/python3.11/site-packages (from kaggle) (1.16.0)\r\n",
"Requirement already satisfied: certifi in ./venv/lib/python3.11/site-packages (from kaggle) (2024.2.2)\r\n",
"Requirement already satisfied: python-dateutil in ./venv/lib/python3.11/site-packages (from kaggle) (2.9.0.post0)\r\n",
"Requirement already satisfied: requests in ./venv/lib/python3.11/site-packages (from kaggle) (2.31.0)\r\n",
"Requirement already satisfied: tqdm in ./venv/lib/python3.11/site-packages (from kaggle) (4.66.2)\r\n",
"Requirement already satisfied: python-slugify in ./venv/lib/python3.11/site-packages (from kaggle) (8.0.4)\r\n",
"Requirement already satisfied: urllib3 in ./venv/lib/python3.11/site-packages (from kaggle) (2.2.1)\r\n",
"Requirement already satisfied: bleach in ./venv/lib/python3.11/site-packages (from kaggle) (6.1.0)\r\n",
"Requirement already satisfied: webencodings in ./venv/lib/python3.11/site-packages (from bleach->kaggle) (0.5.1)\r\n",
"Requirement already satisfied: text-unidecode>=1.3 in ./venv/lib/python3.11/site-packages (from python-slugify->kaggle) (1.3)\r\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in ./venv/lib/python3.11/site-packages (from requests->kaggle) (3.3.2)\r\n",
"Requirement already satisfied: idna<4,>=2.5 in ./venv/lib/python3.11/site-packages (from requests->kaggle) (3.6)\r\n",
"\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.0\u001B[0m\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n",
"Requirement already satisfied: pandas in ./venv/lib/python3.11/site-packages (2.2.1)\r\n",
"Requirement already satisfied: numpy<2,>=1.23.2 in ./venv/lib/python3.11/site-packages (from pandas) (1.26.4)\r\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in ./venv/lib/python3.11/site-packages (from pandas) (2.9.0.post0)\r\n",
"Requirement already satisfied: pytz>=2020.1 in ./venv/lib/python3.11/site-packages (from pandas) (2024.1)\r\n",
"Requirement already satisfied: tzdata>=2022.7 in ./venv/lib/python3.11/site-packages (from pandas) (2024.1)\r\n",
"Requirement already satisfied: six>=1.5 in ./venv/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\r\n",
"\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.0\u001B[0m\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n"
]
}
],
"source": [
"# %pip installs into the environment of the running kernel; versions are the ones this notebook was executed with.\n",
"%pip install kaggle==1.6.6 pandas==2.2.1"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T20:46:24.425690Z",
"start_time": "2024-03-17T20:46:20.152437Z"
}
}
},
{
"cell_type": "code",
"execution_count": 35,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"powerlifting-database.zip: Skipping, found more recently modified local copy (use --force to force download)\r\n"
]
}
],
"source": [
"!kaggle datasets download -d open-powerlifting/powerlifting-database"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T20:46:25.992367Z",
"start_time": "2024-03-17T20:46:24.434212Z"
}
}
},
{
"cell_type": "code",
"execution_count": 36,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: powerlifting-database.zip\r\n",
" inflating: openpowerlifting-2024-01-06-4c732975.csv \r\n",
" inflating: openpowerlifting.csv \r\n"
]
}
],
"source": [
"!unzip -o powerlifting-database.zip"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T20:46:33.195703Z",
"start_time": "2024-03-17T20:46:25.990819Z"
}
}
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in ./venv/lib/python3.11/site-packages (2.2.1)\r\n",
"Requirement already satisfied: numpy<2,>=1.23.2 in ./venv/lib/python3.11/site-packages (from pandas) (1.26.4)\r\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in ./venv/lib/python3.11/site-packages (from pandas) (2.9.0.post0)\r\n",
"Requirement already satisfied: pytz>=2020.1 in ./venv/lib/python3.11/site-packages (from pandas) (2024.1)\r\n",
"Requirement already satisfied: tzdata>=2022.7 in ./venv/lib/python3.11/site-packages (from pandas) (2024.1)\r\n",
"Requirement already satisfied: six>=1.5 in ./venv/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\r\n",
"\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.0\u001B[0m\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n",
"Requirement already satisfied: seaborn in ./venv/lib/python3.11/site-packages (0.13.2)\r\n",
"Requirement already satisfied: numpy!=1.24.0,>=1.20 in ./venv/lib/python3.11/site-packages (from seaborn) (1.26.4)\r\n",
"Requirement already satisfied: pandas>=1.2 in ./venv/lib/python3.11/site-packages (from seaborn) (2.2.1)\r\n",
"Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in ./venv/lib/python3.11/site-packages (from seaborn) (3.8.3)\r\n",
"Requirement already satisfied: contourpy>=1.0.1 in ./venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.2.0)\r\n",
"Requirement already satisfied: cycler>=0.10 in ./venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)\r\n",
"Requirement already satisfied: fonttools>=4.22.0 in ./venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.50.0)\r\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in ./venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.5)\r\n",
"Requirement already satisfied: packaging>=20.0 in ./venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (24.0)\r\n",
"Requirement already satisfied: pillow>=8 in ./venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (10.2.0)\r\n",
"Requirement already satisfied: pyparsing>=2.3.1 in ./venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.1.2)\r\n",
"Requirement already satisfied: python-dateutil>=2.7 in ./venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)\r\n",
"Requirement already satisfied: pytz>=2020.1 in ./venv/lib/python3.11/site-packages (from pandas>=1.2->seaborn) (2024.1)\r\n",
"Requirement already satisfied: tzdata>=2022.7 in ./venv/lib/python3.11/site-packages (from pandas>=1.2->seaborn) (2024.1)\r\n",
"Requirement already satisfied: six>=1.5 in ./venv/lib/python3.11/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)\r\n",
"\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.0\u001B[0m\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n"
]
}
],
"source": [
"# pandas is already installed by the first cell, so only seaborn is installed here.\n",
"%pip install seaborn==0.13.2"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T20:46:37.684925Z",
"start_time": "2024-03-17T20:46:33.204291Z"
}
}
},
{
"cell_type": "code",
"execution_count": 62,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/82/g0638vys2hs3rk916hlpkdrr0000gn/T/ipykernel_47077/3909872695.py:2: DtypeWarning: Columns (35) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = pd.read_csv('openpowerlifting.csv')\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# low_memory=False lets pandas infer each column's dtype from the whole file,\n",
"# which avoids the mixed-type DtypeWarning raised for column 35 (MeetState).\n",
"data = pd.read_csv('openpowerlifting.csv', low_memory=False)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T21:22:11.363229Z",
"start_time": "2024-03-17T21:22:03.759530Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 2. Statystyki"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1423354 entries, 0 to 1423353\n",
"Data columns (total 37 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Name 1423354 non-null object \n",
" 1 Sex 1423354 non-null object \n",
" 2 Event 1423354 non-null object \n",
" 3 Equipment 1423354 non-null object \n",
" 4 Age 757527 non-null float64\n",
" 5 AgeClass 786800 non-null object \n",
" 6 Division 1415176 non-null object \n",
" 7 BodyweightKg 1406622 non-null float64\n",
" 8 WeightClassKg 1410042 non-null object \n",
" 9 Squat1Kg 337580 non-null float64\n",
" 10 Squat2Kg 333349 non-null float64\n",
" 11 Squat3Kg 323842 non-null float64\n",
" 12 Squat4Kg 3696 non-null float64\n",
" 13 Best3SquatKg 1031450 non-null float64\n",
" 14 Bench1Kg 499779 non-null float64\n",
" 15 Bench2Kg 493486 non-null float64\n",
" 16 Bench3Kg 478485 non-null float64\n",
" 17 Bench4Kg 9505 non-null float64\n",
" 18 Best3BenchKg 1276181 non-null float64\n",
" 19 Deadlift1Kg 363544 non-null float64\n",
" 20 Deadlift2Kg 356023 non-null float64\n",
" 21 Deadlift3Kg 339947 non-null float64\n",
" 22 Deadlift4Kg 9246 non-null float64\n",
" 23 Best3DeadliftKg 1081808 non-null float64\n",
" 24 TotalKg 1313184 non-null float64\n",
" 25 Place 1423354 non-null object \n",
" 26 Wilks 1304407 non-null float64\n",
" 27 McCulloch 1304254 non-null float64\n",
" 28 Glossbrenner 1304407 non-null float64\n",
" 29 IPFPoints 1273286 non-null float64\n",
" 30 Tested 1093892 non-null object \n",
" 31 Country 388884 non-null object \n",
" 32 Federation 1423354 non-null object \n",
" 33 Date 1423354 non-null object \n",
" 34 MeetCountry 1423354 non-null object \n",
" 35 MeetState 941545 non-null object \n",
" 36 MeetName 1423354 non-null object \n",
"dtypes: float64(22), object(15)\n",
"memory usage: 401.8+ MB\n"
]
}
],
"source": [
"# Only the last expression of a cell is rendered automatically, so the preview\n",
"# has to be displayed explicitly before info() prints the column summary.\n",
"display(data.head())\n",
"data.info()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T20:46:45.574171Z",
"start_time": "2024-03-17T20:46:44.513858Z"
}
}
},
{
"cell_type": "code",
"execution_count": 40,
"outputs": [
{
"data": {
"text/plain": "Sex\nM 1060189\nF 363165\nName: count, dtype: int64"
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['Sex'].value_counts()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T20:46:45.647881Z",
"start_time": "2024-03-17T20:46:45.644775Z"
}
}
},
{
"cell_type": "code",
"execution_count": 41,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Minimum: -477.5\n",
"Maksimum: 575.0\n",
"Odchylenie standardowe: 69.23931149707244\n",
"Mediana: 167.83\n"
]
},
{
"data": {
"text/plain": "Best3SquatKg\n200.00 15211\n136.08 12626\n190.00 12044\n160.00 12043\n170.00 11993\n ... \n277.30 1\n143.20 1\n129.60 1\n131.80 1\n309.58 1\nName: count, Length: 1907, dtype: int64"
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Basic descriptive statistics of the best squat attempt, followed by its value frequencies.\n",
"best_squat = data['Best3SquatKg']\n",
"print(f\"Minimum: {best_squat.min()}\")\n",
"print(f\"Maksimum: {best_squat.max()}\")\n",
"print(f\"Odchylenie standardowe: {best_squat.std()}\")\n",
"print(f\"Mediana: {best_squat.median()}\")\n",
"best_squat.value_counts()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T20:46:45.729691Z",
"start_time": "2024-03-17T20:46:45.650117Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 3. Czyszczenie zbioru"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Kolumna `Country` w 73% przypadków jest pusta, dlatego ją usuwam. Podobnie z kolumnami `Squat4Kg`, `Bench4Kg` oraz `Deadlift4Kg`."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 61,
"outputs": [
{
"data": {
"text/plain": " Name Sex Event Equipment Age AgeClass \\\n63986 Kylie Beutler F SBD Wraps 23.0 20-23 \n66457 Kaitlynn Naert F SBD Wraps 13.0 13-15 \n67030 Carol Moorhead F SBD Wraps 55.0 55-59 \n67031 Nancy Lowther F SBD Wraps 58.0 55-59 \n69557 Roger Shaw M SBD Wraps 73.0 70-74 \n646493 Ryan Lapadat M SBD Raw 26.0 24-34 \n646495 Denis Pronin M SBD Multi-ply 20.0 20-23 \n652230 Brooke Zak F SBD Raw 12.5 13-15 \n658136 Brooke Zak F SBD Raw 12.0 5-12 \n658137 Brooke Zak F SBD Raw 12.0 5-12 \n658150 Frank Ferchland M SBD Raw 50.5 50-54 \n658151 Frank Ferchland M SBD Raw 50.5 50-54 \n658152 Frank Ferchland M SBD Raw 50.5 50-54 \n919233 Michael Trentin M SBD Multi-ply 18.0 18-19 \n\n Division BodyweightKg WeightClassKg Squat1Kg ... Wilks \\\n63986 Juniors 20-23 56.00 56 83.91 ... 338.90 \n66457 Teen 13-15 103.69 90+ 43.09 ... 205.66 \n67030 Open 74.39 75 79.38 ... 223.22 \n67031 Open 87.09 90 90.72 ... 260.41 \n69557 Masters 70-79 74.12 75 147.42 ... 311.23 \n646493 Open 81.70 82.5 142.50 ... 347.04 \n646495 Juniors 20-23 76.30 82.5 125.00 ... 313.37 \n652230 Teen 12-13 44.54 48 40.00 ... 223.45 \n658136 Teen 12-13 43.27 44 37.50 ... 234.91 \n658137 Open 43.27 44 37.50 ... 234.91 \n658150 Open 106.87 110 142.50 ... 286.58 \n658151 Law/Fire/Military 106.87 110 142.50 ... 286.58 \n658152 Masters 50-54 106.87 110 142.50 ... 286.58 \n919233 MO-MP 64.45 67.5 117.50 ... 
298.34 \n\n McCulloch Glossbrenner IPFPoints Tested Federation Date \\\n63986 338.90 299.77 523.61 Yes WPA 2011-05-21 \n66457 263.24 175.84 359.25 Yes APA 2015-09-19 \n67030 273.44 196.41 374.65 Yes APA 2017-04-22 \n67031 336.19 227.02 449.73 Yes APA 2017-04-22 \n69557 546.53 301.14 443.10 Yes APA 2018-11-17 \n646493 347.04 335.10 505.60 Yes CPO 2008-05-17 \n646495 322.77 303.01 411.52 Yes CPO 2008-05-17 \n652230 286.01 200.16 303.20 Yes RAW 2018-11-09 \n658136 312.43 211.11 318.59 Yes RAW 2018-08-03 \n658137 312.43 211.11 318.59 Yes RAW 2018-08-03 \n658150 323.84 274.82 390.62 Yes RAW 2018-08-03 \n658151 323.84 274.82 390.62 Yes RAW 2018-08-03 \n658152 323.84 274.82 390.62 Yes RAW 2018-08-03 \n919233 316.24 289.83 394.62 Yes CAPO 2001-08-18 \n\n MeetCountry MeetState MeetName \n63986 USA CA World Championships \n66457 USA MI Wolverine Open \n67030 USA MO ShowMe State Raw Championships \n67031 USA MO ShowMe State Raw Championships \n69557 USA MO Midwest Raw Championships \n646493 Canada ON Canadian Championships \n646495 Canada ON Canadian Championships \n652230 USA NC OBX Open \n658136 USA NC Southern Open \n658137 USA NC
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Name</th>\n <th>Sex</th>\n <th>Event</th>\n <th>Equipment</th>\n <th>Age</th>\n <th>AgeClass</th>\n <th>Division</th>\n <th>BodyweightKg</th>\n <th>WeightClassKg</th>\n <th>Squat1Kg</th>\n <th>...</th>\n <th>Wilks</th>\n <th>McCulloch</th>\n <th>Glossbrenner</th>\n <th>IPFPoints</th>\n <th>Tested</th>\n <th>Federation</th>\n <th>Date</th>\n <th>MeetCountry</th>\n <th>MeetState</th>\n <th>MeetName</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>63986</th>\n <td>Kylie Beutler</td>\n <td>F</td>\n <td>SBD</td>\n <td>Wraps</td>\n <td>23.0</td>\n <td>20-23</td>\n <td>Juniors 20-23</td>\n <td>56.00</td>\n <td>56</td>\n <td>83.91</td>\n <td>...</td>\n <td>338.90</td>\n <td>338.90</td>\n <td>299.77</td>\n <td>523.61</td>\n <td>Yes</td>\n <td>WPA</td>\n <td>2011-05-21</td>\n <td>USA</td>\n <td>CA</td>\n <td>World Championships</td>\n </tr>\n <tr>\n <th>66457</th>\n <td>Kaitlynn Naert</td>\n <td>F</td>\n <td>SBD</td>\n <td>Wraps</td>\n <td>13.0</td>\n <td>13-15</td>\n <td>Teen 13-15</td>\n <td>103.69</td>\n <td>90+</td>\n <td>43.09</td>\n <td>...</td>\n <td>205.66</td>\n <td>263.24</td>\n <td>175.84</td>\n <td>359.25</td>\n <td>Yes</td>\n <td>APA</td>\n <td>2015-09-19</td>\n <td>USA</td>\n <td>MI</td>\n <td>Wolverine Open</td>\n </tr>\n <tr>\n <th>67030</th>\n <td>Carol Moorhead</td>\n <td>F</td>\n <td>SBD</td>\n <td>Wraps</td>\n <td>55.0</td>\n <td>55-59</td>\n <td>Open</td>\n <td>74.39</td>\n <td>75</td>\n <td>79.38</td>\n <td>...</td>\n <td>223.22</td>\n <td>273.44</td>\n <td>196.41</td>\n <td>374.65</td>\n <td>Yes</td>\n <td>APA</td>\n <td>2017-04-22</td>\n <td>USA</td>\n <td>MO</td>\n <td>ShowMe State Raw Championships</td>\n </tr>\n <tr>\n 
<th>67031</th>\n <td>Nancy Lowther</td>\n <td>F</td>\n <td>SBD</td>\n <td>Wraps</td>\n <td>58.0</td>\n <td>55-59</td>\n <td>Open</td>\n <td>87.09</td>\n <td>90</td>\n <td>90.72</td>\n <td>...</td>\n <td>260.41</td>\n <td>336.19</td>\n <td>227.02</td>\n <td>449.73</td>\n <td>Yes</td>\n <td>APA</td>\n <td>2017-04-22</td>\n <td>USA</td>\n <td>MO</td>\n <td>ShowMe State Raw Championships</td>\n </tr>\n <tr>\n <th>69557</th>\n <td>Roger Shaw</td>\n <td>M</td>\n <td>SBD</td>\n <td>Wraps</td>\n <td>73.0</td>\n <td>70-74</td>\n <td>Masters 70-79</td>\n <td>74.12</td>\n <td>75</td>\n <td>147.42</td>\n <td>...</td>\n <td>311.23</td>\n <td>546.53</td>\n <td>301.14</td>\n <td>443.10</td>\n <td>Yes</td>\n <td>APA</td>\n <td>2018-11-17</td>\n <td>USA</td>\n <td>MO</td>\n <td>Midwest Raw Championships</td>\n </tr>\n <tr>\n <th>646493</th>\n <td>Ryan Lapadat</td>\n <td>M</td>\n <td>SBD</td>\n <td>Raw</td>\n <td>26.0</td>\n <td>24-34</td>\n <td>Open</td>\n <td>81.70</td>\n <td>82.5</td>\n <td>142.50</td>\n <td>...</td>\n <td>347.04</td>\n <td>347.04</td>\n <td>335.10</td>\n <td>505.60</td>\n <td>Yes</td>\n <td>CPO</td>\n <td>2008-05-17</td>\n <td>Canada</td>\n <td>ON</td>\n <td>Canadian Championships</td>\n </tr>\n <tr>\n <th>646495<
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# drop() returns a new frame, so the result must be assigned back to take effect.\n",
"# errors='ignore' keeps the cell re-runnable once the column is already gone.\n",
"# NOTE: Squat4Kg, Bench4Kg and Deadlift4Kg are kept because the normalisation cell\n",
"# still lists them in col_names.\n",
"data = data.drop(columns=['Country'], errors='ignore')\n",
"data.head()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T21:21:57.260136Z",
"start_time": "2024-03-17T21:21:57.252428Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"Wartości NaN zamieniam na 0."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 63,
"outputs": [],
"source": [
"# Replace every missing value with 0, rebinding the name instead of mutating in place.\n",
"data = data.fillna(value=0)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T21:22:23.496536Z",
"start_time": "2024-03-17T21:22:20.502125Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 4. Podział zbioru na podzbiory\n",
"Używam proporcji 8:1:1 (train:dev:test)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 67,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting scikit-learn\r\n",
" Downloading scikit_learn-1.4.1.post1-cp311-cp311-macosx_10_9_x86_64.whl (11.6 MB)\r\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m11.6/11.6 MB\u001B[0m \u001B[31m33.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m0:01\u001B[0m\r\n",
"\u001B[?25hRequirement already satisfied: numpy<2.0,>=1.19.5 in ./venv/lib/python3.11/site-packages (from scikit-learn) (1.26.4)\r\n",
"Collecting scipy>=1.6.0\r\n",
" Downloading scipy-1.12.0-cp311-cp311-macosx_10_9_x86_64.whl (38.9 MB)\r\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m38.9/38.9 MB\u001B[0m \u001B[31m14.1 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
"\u001B[?25hCollecting joblib>=1.2.0\r\n",
" Using cached joblib-1.3.2-py3-none-any.whl (302 kB)\r\n",
"Collecting threadpoolctl>=2.0.0\r\n",
" Downloading threadpoolctl-3.3.0-py3-none-any.whl (17 kB)\r\n",
"Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn\r\n",
"Successfully installed joblib-1.3.2 scikit-learn-1.4.1.post1 scipy-1.12.0 threadpoolctl-3.3.0\r\n",
"\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.0\u001B[0m\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n"
]
}
],
"source": [
"# %pip installs into the environment of the running kernel; the version is the one this notebook was executed with.\n",
"%pip install scikit-learn==1.4.1.post1"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T21:25:18.706896Z",
"start_time": "2024-03-17T21:25:03.710247Z"
}
}
},
{
"cell_type": "code",
"execution_count": 70,
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 10% for test, then 1/9 of the remaining 90% (10% of the total) for dev.\n",
"train_and_dev, openpowerlifting_test = train_test_split(data, test_size=0.1, random_state=1)\n",
"openpowerlifting_train, openpowerlifting_dev = train_test_split(train_and_dev, test_size=1/9, random_state=1)\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T21:34:45.563021Z",
"start_time": "2024-03-17T21:34:41.224064Z"
}
}
},
{
"cell_type": "code",
"execution_count": 72,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wielkość zbioru train: 1138682\n",
"Wielkość zbioru dev: 142336\n",
"Wielkość zbioru test: 142336\n"
]
}
],
"source": [
"# Report the number of rows in each split.\n",
"split_frames = (\n",
"    ('train', openpowerlifting_train),\n",
"    ('dev', openpowerlifting_dev),\n",
"    ('test', openpowerlifting_test),\n",
")\n",
"for split_name, split_frame in split_frames:\n",
"    print(f\"Wielkość zbioru {split_name}: \", len(split_frame))"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T21:35:31.899647Z",
"start_time": "2024-03-17T21:35:31.894090Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 5. Normalizacja"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 83,
"outputs": [
{
"data": {
"text/plain": " Name Sex Event Equipment Age AgeClass Division \\\n0 Abbie Murphy F SBD Wraps 0.661353 24-34 F-OR \n1 Abbie Tuong F SBD Wraps 0.661353 24-34 F-OR \n2 Ainslee Hooper F B Raw 1.255975 40-44 F-OR \n3 Amy Moldenhauer F SBD Wraps 0.337014 20-23 F-OR \n4 Andrea Rowan F SBD Wraps 1.526258 45-49 F-OR \n... ... .. ... ... ... ... ... \n1423349 Marian Cafalik M SBD Raw 2.364134 60-64 Masters 2 \n1423350 Marian Piwowarczyk M SBD Raw 2.093852 55-59 Masters 2 \n1423351 Andrzej Bryniarski M SBD Raw 2.472248 60-64 Masters 2 \n1423352 Stanisław Goroczko M SBD Raw 2.526304 60-64 Masters 2 \n1423353 Jan Sowa M SBD Raw 2.904700 70-74 Masters 2 \n\n BodyweightKg WeightClassKg Squat1Kg ... McCulloch Glossbrenner \\\n0 -0.944800 60 0.611664 ... 324.16 286.42 \n1 -0.997210 60 0.842750 ... 378.07 334.16 \n2 -1.122189 56 -0.312682 ... 38.56 34.12 \n3 -0.936736 60 -1.525886 ... 345.61 305.37 \n4 0.837161 110 1.073837 ... 338.91 274.56 \n... ... ... ... ... ... ... \n1423349 -0.392472 74 1.536010 ... 438.27 316.52 \n1423350 -0.795631 66 0.727207 ... 372.60 295.66 \n1423351 0.450129 105 1.304923 ... 382.36 264.22 \n1423352 -0.098167 83 -2.219146 ... 0.00 0.00 \n1423353 -0.049788 83 -1.641430 ... 0.00 0.00 \n\n IPFPoints Tested Country Federation Date MeetCountry \\\n0 511.15 0 0 GPC-AUS 2018-10-27 Australia \n1 595.65 0 0 GPC-AUS 2018-10-27 Australia \n2 313.97 0 0 GPC-AUS 2018-10-27 Australia \n3 547.04 0 0 GPC-AUS 2018-10-27 Australia \n4 550.08 0 0 GPC-AUS 2018-10-27 Australia \n... ... ... ... ... ... ... \n1423349 469.67 Yes 0 PZKFiTS 2017-04-01 Poland \n1423350 423.03 Yes Poland PZKFiTS 2017-04-01 Poland \n1423351 378.84 Yes 0 PZKFiTS 2017-04-01 Poland \n1423352 0.00 Yes 0 PZKFiTS 2017-04-01 Poland \n1423353 0.00 Yes 0 PZKFiTS 2017-04-01 Poland \n\n MeetState MeetName \n0 VIC Melbourne Cup \n1 VIC Melbourne Cup \n2 VIC Melbourne Cup \n3 VIC Melbourne Cup \n4 VIC Melbourne Cup \n... ... ... 
\n1423349 0 Polish Classic Powerlifting Cup \n1423350 0 Polish Classic Powerlifting Cup \n1423351 0 Polish Classic Powerlifting Cup \n1423352 0 Polish Classic Powerlifting Cup \n1423353 0 Polish Classic Powerlifting Cup \n\n[1423354 rows x 37 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Name</th>\n <th>Sex</th>\n <th>Event</th>\n <th>Equipment</th>\n <th>Age</th>\n <th>AgeClass</th>\n <th>Division</th>\n <th>BodyweightKg</th>\n <th>WeightClassKg</th>\n <th>Squat1Kg</th>\n <th>...</th>\n <th>McCulloch</th>\n <th>Glossbrenner</th>\n <th>IPFPoints</th>\n <th>Tested</th>\n <th>Country</th>\n <th>Federation</th>\n <th>Date</th>\n <th>MeetCountry</th>\n <th>MeetState</th>\n <th>MeetName</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Abbie Murphy</td>\n <td>F</td>\n <td>SBD</td>\n <td>Wraps</td>\n <td>0.661353</td>\n <td>24-34</td>\n <td>F-OR</td>\n <td>-0.944800</td>\n <td>60</td>\n <td>0.611664</td>\n <td>...</td>\n <td>324.16</td>\n <td>286.42</td>\n <td>511.15</td>\n <td>0</td>\n <td>0</td>\n <td>GPC-AUS</td>\n <td>2018-10-27</td>\n <td>Australia</td>\n <td>VIC</td>\n <td>Melbourne Cup</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Abbie Tuong</td>\n <td>F</td>\n <td>SBD</td>\n <td>Wraps</td>\n <td>0.661353</td>\n <td>24-34</td>\n <td>F-OR</td>\n <td>-0.997210</td>\n <td>60</td>\n <td>0.842750</td>\n <td>...</td>\n <td>378.07</td>\n <td>334.16</td>\n <td>595.65</td>\n <td>0</td>\n <td>0</td>\n <td>GPC-AUS</td>\n <td>2018-10-27</td>\n <td>Australia</td>\n <td>VIC</td>\n <td>Melbourne Cup</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Ainslee Hooper</td>\n <td>F</td>\n <td>B</td>\n <td>Raw</td>\n <td>1.255975</td>\n <td>40-44</td>\n <td>F-OR</td>\n <td>-1.122189</td>\n <td>56</td>\n <td>-0.312682</td>\n <td>...</td>\n <td>38.56</td>\n <td>34.12</td>\n <td>313.97</td>\n <td>0</td>\n <td>0</td>\n <td>GPC-AUS</td>\n <td>2018-10-27</td>\n <td>Australia</td>\n <td>VIC</td>\n <td>Melbourne Cup</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Amy 
Moldenhauer</td>\n <td>F</td>\n <td>SBD</td>\n <td>Wraps</td>\n <td>0.337014</td>\n <td>20-23</td>\n <td>F-OR</td>\n <td>-0.936736</td>\n <td>60</td>\n <td>-1.525886</td>\n <td>...</td>\n <td>345.61</td>\n <td>305.37</td>\n <td>547.04</td>\n <td>0</td>\n <td>0</td>\n <td>GPC-AUS</td>\n <td>2018-10-27</td>\n <td>Australia</td>\n <td>VIC</td>\n <td>Melbourne Cup</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Andrea Rowan</td>\n <td>F</td>\n <td>SBD</td>\n <td>Wraps</td>\n <td>1.526258</td>\n <td>45-49</td>\n <td>F-OR</td>\n <td>0.837161</td>\n <td>110</td>\n <td>1.073837</td>\n <td>...</td>\n <td>338.91</td>\n <td>274.56</td>\n <td>550.08</td>\n <td>0</td>\n <td>0</td>\n <td>GPC-AUS</td>\n <td>2018-10-27</td>\n <td>Australia</td>\n <td>VIC</td>\n <td>Melbourne Cup</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1423349</th>\n <td>Marian Cafalik</td>\n <td>M</td>\n <td>SBD</td>\n <td>R
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"col_names = ['Age', 'BodyweightKg', 'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 'Squat4Kg', 'Best3SquatKg', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Bench4Kg', 'Best3BenchKg', 'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Deadlift4Kg', 'Best3DeadliftKg', 'TotalKg']\n",
"\n",
"# Fit on the training split only, so that dev/test statistics do not leak into the scaler.\n",
"scaler = StandardScaler().fit(openpowerlifting_train[col_names].values)\n",
"\n",
"\n",
"def scale_numeric(frame):\n",
"    \"\"\"Return a copy of `frame` whose `col_names` columns are standardised with `scaler`.\"\"\"\n",
"    scaled = frame.copy()\n",
"    scaled[col_names] = scaler.transform(frame[col_names].values)\n",
"    return scaled\n",
"\n",
"\n",
"openpowerlifting_train_scaled = scale_numeric(openpowerlifting_train)\n",
"openpowerlifting_dev_scaled = scale_numeric(openpowerlifting_dev)\n",
"openpowerlifting_test_scaled = scale_numeric(openpowerlifting_test)\n",
"\n",
"# Kept for backward compatibility: the full dataset scaled with the train-fitted scaler.\n",
"scaled_features = scale_numeric(data)\n",
"scaled_features.head()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-17T23:21:17.202879Z",
"start_time": "2024-03-17T23:21:12.896847Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}