ium_464962/ium_01.ipynb
Krzysztof Raczyński 3dbe94f3bc added ium01
2024-03-19 15:42:16 +01:00

1135 lines
36 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "51d5eebd",
"metadata": {},
"source": [
"### Instalacja i import bibliotek"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "300962ca",
"metadata": {},
"outputs": [],
"source": [
"# %pip (unlike !pip) always installs into the environment of the running kernel.\n",
"# scikit-learn is listed because the import cell below needs sklearn.\n",
"%pip install -q kaggle pandas scikit-learn"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "ace0fd9d",
"metadata": {},
"outputs": [],
"source": [
"import zipfile\n",
"\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import MinMaxScaler"
]
},
{
"cell_type": "markdown",
"id": "8063f07b",
"metadata": {},
"source": [
"### Pobranie zbioru danych"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "13978bf5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"vehicle-sales-data.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
]
}
],
"source": [
"!kaggle datasets download -d syedanwarafridi/vehicle-sales-data"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "079faa4e",
"metadata": {},
"outputs": [],
"source": [
"#conda install git pip"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "de0ab590",
"metadata": {},
"outputs": [],
"source": [
"#!pip install unzip"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "265ecdca",
"metadata": {},
"outputs": [],
"source": [
"# Extract with the standard library instead of the external `unzip` binary,\n",
"# which is not available by default on every platform (e.g. Windows).\n",
"# extractall() overwrites files that already exist, matching `unzip -o`.\n",
"with zipfile.ZipFile('vehicle-sales-data.zip') as archive:\n",
"    archive.extractall()"
]
},
{
"cell_type": "markdown",
"id": "2bf18b9a",
"metadata": {},
"source": [
"### Opis i czyszczenie danych"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "145d6d72",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>make</th>\n",
" <th>model</th>\n",
" <th>trim</th>\n",
" <th>body</th>\n",
" <th>transmission</th>\n",
" <th>vin</th>\n",
" <th>state</th>\n",
" <th>condition</th>\n",
" <th>odometer</th>\n",
" <th>color</th>\n",
" <th>interior</th>\n",
" <th>seller</th>\n",
" <th>mmr</th>\n",
" <th>sellingprice</th>\n",
" <th>saledate</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2015</td>\n",
" <td>Kia</td>\n",
" <td>Sorento</td>\n",
" <td>LX</td>\n",
" <td>SUV</td>\n",
" <td>automatic</td>\n",
" <td>5xyktca69fg566472</td>\n",
" <td>ca</td>\n",
" <td>5.0</td>\n",
" <td>16639.0</td>\n",
" <td>white</td>\n",
" <td>black</td>\n",
" <td>kia motors america inc</td>\n",
" <td>20500.0</td>\n",
" <td>21500.0</td>\n",
" <td>Tue Dec 16 2014 12:30:00 GMT-0800 (PST)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2015</td>\n",
" <td>Kia</td>\n",
" <td>Sorento</td>\n",
" <td>LX</td>\n",
" <td>SUV</td>\n",
" <td>automatic</td>\n",
" <td>5xyktca69fg561319</td>\n",
" <td>ca</td>\n",
" <td>5.0</td>\n",
" <td>9393.0</td>\n",
" <td>white</td>\n",
" <td>beige</td>\n",
" <td>kia motors america inc</td>\n",
" <td>20800.0</td>\n",
" <td>21500.0</td>\n",
" <td>Tue Dec 16 2014 12:30:00 GMT-0800 (PST)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2014</td>\n",
" <td>BMW</td>\n",
" <td>3 Series</td>\n",
" <td>328i SULEV</td>\n",
" <td>Sedan</td>\n",
" <td>automatic</td>\n",
" <td>wba3c1c51ek116351</td>\n",
" <td>ca</td>\n",
" <td>45.0</td>\n",
" <td>1331.0</td>\n",
" <td>gray</td>\n",
" <td>black</td>\n",
" <td>financial services remarketing (lease)</td>\n",
" <td>31900.0</td>\n",
" <td>30000.0</td>\n",
" <td>Thu Jan 15 2015 04:30:00 GMT-0800 (PST)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2015</td>\n",
" <td>Volvo</td>\n",
" <td>S60</td>\n",
" <td>T5</td>\n",
" <td>Sedan</td>\n",
" <td>automatic</td>\n",
" <td>yv1612tb4f1310987</td>\n",
" <td>ca</td>\n",
" <td>41.0</td>\n",
" <td>14282.0</td>\n",
" <td>white</td>\n",
" <td>black</td>\n",
" <td>volvo na rep/world omni</td>\n",
" <td>27500.0</td>\n",
" <td>27750.0</td>\n",
" <td>Thu Jan 29 2015 04:30:00 GMT-0800 (PST)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2014</td>\n",
" <td>BMW</td>\n",
" <td>6 Series Gran Coupe</td>\n",
" <td>650i</td>\n",
" <td>Sedan</td>\n",
" <td>automatic</td>\n",
" <td>wba6b2c57ed129731</td>\n",
" <td>ca</td>\n",
" <td>43.0</td>\n",
" <td>2641.0</td>\n",
" <td>gray</td>\n",
" <td>black</td>\n",
" <td>financial services remarketing (lease)</td>\n",
" <td>66000.0</td>\n",
" <td>67000.0</td>\n",
" <td>Thu Dec 18 2014 12:30:00 GMT-0800 (PST)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" year make model trim body transmission \\\n",
"0 2015 Kia Sorento LX SUV automatic \n",
"1 2015 Kia Sorento LX SUV automatic \n",
"2 2014 BMW 3 Series 328i SULEV Sedan automatic \n",
"3 2015 Volvo S60 T5 Sedan automatic \n",
"4 2014 BMW 6 Series Gran Coupe 650i Sedan automatic \n",
"\n",
" vin state condition odometer color interior \\\n",
"0 5xyktca69fg566472 ca 5.0 16639.0 white black \n",
"1 5xyktca69fg561319 ca 5.0 9393.0 white beige \n",
"2 wba3c1c51ek116351 ca 45.0 1331.0 gray black \n",
"3 yv1612tb4f1310987 ca 41.0 14282.0 white black \n",
"4 wba6b2c57ed129731 ca 43.0 2641.0 gray black \n",
"\n",
" seller mmr sellingprice \\\n",
"0 kia motors america inc 20500.0 21500.0 \n",
"1 kia motors america inc 20800.0 21500.0 \n",
"2 financial services remarketing (lease) 31900.0 30000.0 \n",
"3 volvo na rep/world omni 27500.0 27750.0 \n",
"4 financial services remarketing (lease) 66000.0 67000.0 \n",
"\n",
" saledate \n",
"0 Tue Dec 16 2014 12:30:00 GMT-0800 (PST) \n",
"1 Tue Dec 16 2014 12:30:00 GMT-0800 (PST) \n",
"2 Thu Jan 15 2015 04:30:00 GMT-0800 (PST) \n",
"3 Thu Jan 29 2015 04:30:00 GMT-0800 (PST) \n",
"4 Thu Dec 18 2014 12:30:00 GMT-0800 (PST) "
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('car_prices.csv')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "5fb29c05",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(558837, 16)"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "89130732",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 558837 entries, 0 to 558836\n",
"Data columns (total 16 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 year 558837 non-null int64 \n",
" 1 make 548536 non-null object \n",
" 2 model 548438 non-null object \n",
" 3 trim 548186 non-null object \n",
" 4 body 545642 non-null object \n",
" 5 transmission 493485 non-null object \n",
" 6 vin 558833 non-null object \n",
" 7 state 558837 non-null object \n",
" 8 condition 547017 non-null float64\n",
" 9 odometer 558743 non-null float64\n",
" 10 color 558088 non-null object \n",
" 11 interior 558088 non-null object \n",
" 12 seller 558837 non-null object \n",
" 13 mmr 558799 non-null float64\n",
" 14 sellingprice 558825 non-null float64\n",
" 15 saledate 558825 non-null object \n",
"dtypes: float64(4), int64(1), object(11)\n",
"memory usage: 68.2+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "242cd7f0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>make</th>\n",
" <th>model</th>\n",
" <th>trim</th>\n",
" <th>body</th>\n",
" <th>transmission</th>\n",
" <th>vin</th>\n",
" <th>state</th>\n",
" <th>condition</th>\n",
" <th>odometer</th>\n",
" <th>color</th>\n",
" <th>interior</th>\n",
" <th>seller</th>\n",
" <th>mmr</th>\n",
" <th>sellingprice</th>\n",
" <th>saledate</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>558837.000000</td>\n",
" <td>548536</td>\n",
" <td>548438</td>\n",
" <td>548186</td>\n",
" <td>545642</td>\n",
" <td>493485</td>\n",
" <td>558833</td>\n",
" <td>558837</td>\n",
" <td>547017.000000</td>\n",
" <td>558743.000000</td>\n",
" <td>558088</td>\n",
" <td>558088</td>\n",
" <td>558837</td>\n",
" <td>558799.000000</td>\n",
" <td>558825.000000</td>\n",
" <td>558825</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>96</td>\n",
" <td>973</td>\n",
" <td>1963</td>\n",
" <td>87</td>\n",
" <td>4</td>\n",
" <td>550297</td>\n",
" <td>64</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>46</td>\n",
" <td>17</td>\n",
" <td>14263</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3766</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>Ford</td>\n",
" <td>Altima</td>\n",
" <td>Base</td>\n",
" <td>Sedan</td>\n",
" <td>automatic</td>\n",
" <td>automatic</td>\n",
" <td>fl</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>black</td>\n",
" <td>black</td>\n",
" <td>nissan-infiniti lt</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Tue Feb 10 2015 01:30:00 GMT-0800 (PST)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>93554</td>\n",
" <td>19349</td>\n",
" <td>55817</td>\n",
" <td>199437</td>\n",
" <td>475915</td>\n",
" <td>22</td>\n",
" <td>82945</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>110970</td>\n",
" <td>244329</td>\n",
" <td>19693</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>2010.038927</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>30.672365</td>\n",
" <td>68320.017767</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>13769.377495</td>\n",
" <td>13611.358810</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>3.966864</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>13.402832</td>\n",
" <td>53398.542821</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9679.967174</td>\n",
" <td>9749.501628</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1982.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>25.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>2007.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>23.000000</td>\n",
" <td>28371.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7100.000000</td>\n",
" <td>6900.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>2012.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>35.000000</td>\n",
" <td>52254.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>12250.000000</td>\n",
" <td>12100.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>2013.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>42.000000</td>\n",
" <td>99109.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>18300.000000</td>\n",
" <td>18200.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>2015.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>49.000000</td>\n",
" <td>999999.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>182000.000000</td>\n",
" <td>230000.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" year make model trim body transmission vin \\\n",
"count 558837.000000 548536 548438 548186 545642 493485 558833 \n",
"unique NaN 96 973 1963 87 4 550297 \n",
"top NaN Ford Altima Base Sedan automatic automatic \n",
"freq NaN 93554 19349 55817 199437 475915 22 \n",
"mean 2010.038927 NaN NaN NaN NaN NaN NaN \n",
"std 3.966864 NaN NaN NaN NaN NaN NaN \n",
"min 1982.000000 NaN NaN NaN NaN NaN NaN \n",
"25% 2007.000000 NaN NaN NaN NaN NaN NaN \n",
"50% 2012.000000 NaN NaN NaN NaN NaN NaN \n",
"75% 2013.000000 NaN NaN NaN NaN NaN NaN \n",
"max 2015.000000 NaN NaN NaN NaN NaN NaN \n",
"\n",
" state condition odometer color interior \\\n",
"count 558837 547017.000000 558743.000000 558088 558088 \n",
"unique 64 NaN NaN 46 17 \n",
"top fl NaN NaN black black \n",
"freq 82945 NaN NaN 110970 244329 \n",
"mean NaN 30.672365 68320.017767 NaN NaN \n",
"std NaN 13.402832 53398.542821 NaN NaN \n",
"min NaN 1.000000 1.000000 NaN NaN \n",
"25% NaN 23.000000 28371.000000 NaN NaN \n",
"50% NaN 35.000000 52254.000000 NaN NaN \n",
"75% NaN 42.000000 99109.000000 NaN NaN \n",
"max NaN 49.000000 999999.000000 NaN NaN \n",
"\n",
" seller mmr sellingprice \\\n",
"count 558837 558799.000000 558825.000000 \n",
"unique 14263 NaN NaN \n",
"top nissan-infiniti lt NaN NaN \n",
"freq 19693 NaN NaN \n",
"mean NaN 13769.377495 13611.358810 \n",
"std NaN 9679.967174 9749.501628 \n",
"min NaN 25.000000 1.000000 \n",
"25% NaN 7100.000000 6900.000000 \n",
"50% NaN 12250.000000 12100.000000 \n",
"75% NaN 18300.000000 18200.000000 \n",
"max NaN 182000.000000 230000.000000 \n",
"\n",
" saledate \n",
"count 558825 \n",
"unique 3766 \n",
"top Tue Feb 10 2015 01:30:00 GMT-0800 (PST) \n",
"freq 5334 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN "
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "75742f81",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(472325, 16)"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df.dropna()\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "93d5b418",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"year 0\n",
"make 0\n",
"model 0\n",
"trim 0\n",
"body 0\n",
"transmission 0\n",
"vin 0\n",
"state 0\n",
"condition 0\n",
"odometer 0\n",
"color 0\n",
"interior 0\n",
"seller 0\n",
"mmr 0\n",
"sellingprice 0\n",
"saledate 0\n",
"dtype: int64"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "b039be76",
"metadata": {},
"outputs": [],
"source": [
"# Unify the spelling variants of the Sedan and SUV body labels in one pass.\n",
"body_aliases = {'sedan': 'Sedan', 'Suv': 'SUV', 'suv': 'SUV'}\n",
"df['body'] = df['body'].replace(body_aliases)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "92787fae",
"metadata": {},
"outputs": [],
"source": [
"# Min-max scale every numeric column into [0, 1]; the other columns are copied as they are.\n",
"# NOTE(review): the scaler is fitted on all rows, i.e. before the train/dev/test split.\n",
"numeric_columns = df.select_dtypes(include=['int', 'float']).columns\n",
"scaler = MinMaxScaler(feature_range=(0, 1))\n",
"scaled_values = scaler.fit_transform(df[numeric_columns])\n",
"\n",
"df_scaled = df.copy()\n",
"df_scaled[numeric_columns] = scaled_values"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "147f68ca",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>make</th>\n",
" <th>model</th>\n",
" <th>trim</th>\n",
" <th>body</th>\n",
" <th>transmission</th>\n",
" <th>vin</th>\n",
" <th>state</th>\n",
" <th>condition</th>\n",
" <th>odometer</th>\n",
" <th>color</th>\n",
" <th>interior</th>\n",
" <th>seller</th>\n",
" <th>mmr</th>\n",
" <th>sellingprice</th>\n",
" <th>saledate</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.00</td>\n",
" <td>Kia</td>\n",
" <td>Sorento</td>\n",
" <td>LX</td>\n",
" <td>SUV</td>\n",
" <td>automatic</td>\n",
" <td>5xyktca69fg566472</td>\n",
" <td>ca</td>\n",
" <td>0.083333</td>\n",
" <td>0.016638</td>\n",
" <td>white</td>\n",
" <td>black</td>\n",
" <td>kia motors america inc</td>\n",
" <td>0.112515</td>\n",
" <td>0.093474</td>\n",
" <td>Tue Dec 16 2014 12:30:00 GMT-0800 (PST)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.00</td>\n",
" <td>Kia</td>\n",
" <td>Sorento</td>\n",
" <td>LX</td>\n",
" <td>SUV</td>\n",
" <td>automatic</td>\n",
" <td>5xyktca69fg561319</td>\n",
" <td>ca</td>\n",
" <td>0.083333</td>\n",
" <td>0.009392</td>\n",
" <td>white</td>\n",
" <td>beige</td>\n",
" <td>kia motors america inc</td>\n",
" <td>0.114164</td>\n",
" <td>0.093474</td>\n",
" <td>Tue Dec 16 2014 12:30:00 GMT-0800 (PST)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.96</td>\n",
" <td>BMW</td>\n",
" <td>3 Series</td>\n",
" <td>328i SULEV</td>\n",
" <td>Sedan</td>\n",
" <td>automatic</td>\n",
" <td>wba3c1c51ek116351</td>\n",
" <td>ca</td>\n",
" <td>0.916667</td>\n",
" <td>0.001330</td>\n",
" <td>gray</td>\n",
" <td>black</td>\n",
" <td>financial services remarketing (lease)</td>\n",
" <td>0.175161</td>\n",
" <td>0.130431</td>\n",
" <td>Thu Jan 15 2015 04:30:00 GMT-0800 (PST)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.00</td>\n",
" <td>Volvo</td>\n",
" <td>S60</td>\n",
" <td>T5</td>\n",
" <td>Sedan</td>\n",
" <td>automatic</td>\n",
" <td>yv1612tb4f1310987</td>\n",
" <td>ca</td>\n",
" <td>0.833333</td>\n",
" <td>0.014281</td>\n",
" <td>white</td>\n",
" <td>black</td>\n",
" <td>volvo na rep/world omni</td>\n",
" <td>0.150982</td>\n",
" <td>0.120648</td>\n",
" <td>Thu Jan 29 2015 04:30:00 GMT-0800 (PST)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.96</td>\n",
" <td>BMW</td>\n",
" <td>6 Series Gran Coupe</td>\n",
" <td>650i</td>\n",
" <td>Sedan</td>\n",
" <td>automatic</td>\n",
" <td>wba6b2c57ed129731</td>\n",
" <td>ca</td>\n",
" <td>0.875000</td>\n",
" <td>0.002640</td>\n",
" <td>gray</td>\n",
" <td>black</td>\n",
" <td>financial services remarketing (lease)</td>\n",
" <td>0.362550</td>\n",
" <td>0.291301</td>\n",
" <td>Thu Dec 18 2014 12:30:00 GMT-0800 (PST)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" year make model trim body transmission \\\n",
"0 1.00 Kia Sorento LX SUV automatic \n",
"1 1.00 Kia Sorento LX SUV automatic \n",
"2 0.96 BMW 3 Series 328i SULEV Sedan automatic \n",
"3 1.00 Volvo S60 T5 Sedan automatic \n",
"4 0.96 BMW 6 Series Gran Coupe 650i Sedan automatic \n",
"\n",
" vin state condition odometer color interior \\\n",
"0 5xyktca69fg566472 ca 0.083333 0.016638 white black \n",
"1 5xyktca69fg561319 ca 0.083333 0.009392 white beige \n",
"2 wba3c1c51ek116351 ca 0.916667 0.001330 gray black \n",
"3 yv1612tb4f1310987 ca 0.833333 0.014281 white black \n",
"4 wba6b2c57ed129731 ca 0.875000 0.002640 gray black \n",
"\n",
" seller mmr sellingprice \\\n",
"0 kia motors america inc 0.112515 0.093474 \n",
"1 kia motors america inc 0.114164 0.093474 \n",
"2 financial services remarketing (lease) 0.175161 0.130431 \n",
"3 volvo na rep/world omni 0.150982 0.120648 \n",
"4 financial services remarketing (lease) 0.362550 0.291301 \n",
"\n",
" saledate \n",
"0 Tue Dec 16 2014 12:30:00 GMT-0800 (PST) \n",
"1 Tue Dec 16 2014 12:30:00 GMT-0800 (PST) \n",
"2 Thu Jan 15 2015 04:30:00 GMT-0800 (PST) \n",
"3 Thu Jan 29 2015 04:30:00 GMT-0800 (PST) \n",
"4 Thu Dec 18 2014 12:30:00 GMT-0800 (PST) "
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_scaled.head()"
]
},
{
"cell_type": "markdown",
"id": "3df15219",
"metadata": {},
"source": [
"### Podział danych na podzbiory"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "8589727c",
"metadata": {},
"outputs": [],
"source": [
"# 80% train, 10% dev, 10% test; random_state pins the shuffle so the split is reproducible.\n",
"car_train, car_dev_test = train_test_split(df, train_size=0.8, random_state=0)\n",
"car_dev, car_test = train_test_split(car_dev_test, train_size=0.5, random_state=0)\n",
"\n",
"# Normalised copies of the three subsets. The scaler is fitted on the training\n",
"# subset only, so dev/test statistics never leak into the preprocessing; as a\n",
"# consequence dev/test values may fall slightly outside [0, 1].\n",
"split_numeric_columns = car_train.select_dtypes(include='number').columns\n",
"split_scaler = MinMaxScaler(feature_range=(0, 1)).fit(car_train[split_numeric_columns])\n",
"\n",
"\n",
"def scale_subset(subset):\n",
"    # Return a copy of `subset` whose numeric columns are min-max scaled\n",
"    # with the parameters learned from the training subset.\n",
"    scaled = subset.copy()\n",
"    scaled[split_numeric_columns] = split_scaler.transform(subset[split_numeric_columns])\n",
"    return scaled\n",
"\n",
"\n",
"car_train_scaled = scale_subset(car_train)\n",
"car_dev_scaled = scale_subset(car_dev)\n",
"car_test_scaled = scale_subset(car_test)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "728bfc11",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(377860, 16)\n",
"(47232, 16)\n",
"(47233, 16)\n"
]
}
],
"source": [
"# Report (rows, columns) of the train, dev and test subsets, in that order.\n",
"for subset in (car_train, car_dev, car_test):\n",
"    print(subset.shape)"
]
},
{
"cell_type": "markdown",
"id": "632d1ab8",
"metadata": {},
"source": [
"### Statystyki zbioru"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "edca0016",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Ford 81013\n",
"Chevrolet 54150\n",
"Nissan 44043\n",
"Toyota 35313\n",
"Dodge 27181\n",
"Honda 24781\n",
"Hyundai 18659\n",
"BMW 17509\n",
"Kia 15828\n",
"Chrysler 15133\n",
"Name: make, dtype: int64"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['make'].value_counts().head(10)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "429a052f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Sedan 211298\n",
"SUV 120968\n",
"Hatchback 19351\n",
"Minivan 18305\n",
"Coupe 13121\n",
"Wagon 12023\n",
"Crew Cab 11508\n",
"Convertible 7725\n",
"SuperCrew 6195\n",
"G Sedan 5644\n",
"Name: body, dtype: int64"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['body'].value_counts().head(10)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "4c247aca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"automatic 455963\n",
"manual 16362\n",
"Name: transmission, dtype: int64"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['transmission'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce3e9252",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}