1135 lines
36 KiB
Plaintext
1135 lines
36 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "51d5eebd",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Instalacja i import bibliotek"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 74,
|
|
"id": "300962ca",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Requirement already satisfied: kaggle in c:\\users\\krzys\\anaconda3\\lib\\site-packages (1.6.6)\n",
|
|
"Requirement already satisfied: bleach in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from kaggle) (6.1.0)\n",
|
|
"Requirement already satisfied: python-dateutil in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from kaggle) (2.9.0.post0)\n",
|
|
"Requirement already satisfied: six>=1.10 in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from kaggle) (1.16.0)\n",
|
|
"Requirement already satisfied: requests in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from kaggle) (2.31.0)\n",
|
|
"Requirement already satisfied: certifi in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from kaggle) (2024.2.2)\n",
|
|
"Requirement already satisfied: tqdm in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from kaggle) (4.66.2)\n",
|
|
"Requirement already satisfied: python-slugify in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from kaggle) (8.0.4)\n",
|
|
"Requirement already satisfied: urllib3 in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from kaggle) (2.2.1)\n",
|
|
"Requirement already satisfied: webencodings in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from bleach->kaggle) (0.5.1)\n",
|
|
"Requirement already satisfied: text-unidecode>=1.3 in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from python-slugify->kaggle) (1.3)\n",
|
|
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from requests->kaggle) (3.3.2)\n",
|
|
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from requests->kaggle) (3.6)\n",
|
|
"Requirement already satisfied: colorama in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from tqdm->kaggle) (0.4.6)\n",
|
|
"Requirement already satisfied: pandas in c:\\users\\krzys\\anaconda3\\lib\\site-packages (1.4.2)\n",
|
|
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from pandas) (2021.3)\n",
|
|
"Requirement already satisfied: numpy>=1.18.5 in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from pandas) (1.21.5)\n",
|
|
"Requirement already satisfied: python-dateutil>=2.8.1 in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from pandas) (2.9.0.post0)\n",
|
|
"Requirement already satisfied: six>=1.5 in c:\\users\\krzys\\anaconda3\\lib\\site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!pip install kaggle\n",
|
|
"!pip install pandas"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 75,
|
|
"id": "ace0fd9d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"from sklearn.preprocessing import MinMaxScaler"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8063f07b",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Pobranie zbioru danych"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 76,
|
|
"id": "13978bf5",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"vehicle-sales-data.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!kaggle datasets download -d syedanwarafridi/vehicle-sales-data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 77,
|
|
"id": "079faa4e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#conda install git pip"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 78,
|
|
"id": "de0ab590",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#!pip install unzip"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 79,
|
|
"id": "265ecdca",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Archive: vehicle-sales-data.zip\n",
|
|
" inflating: car_prices.csv \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!unzip -o vehicle-sales-data.zip"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "2bf18b9a",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Opis i czyszczenie danych"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 80,
|
|
"id": "145d6d72",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>year</th>\n",
|
|
" <th>make</th>\n",
|
|
" <th>model</th>\n",
|
|
" <th>trim</th>\n",
|
|
" <th>body</th>\n",
|
|
" <th>transmission</th>\n",
|
|
" <th>vin</th>\n",
|
|
" <th>state</th>\n",
|
|
" <th>condition</th>\n",
|
|
" <th>odometer</th>\n",
|
|
" <th>color</th>\n",
|
|
" <th>interior</th>\n",
|
|
" <th>seller</th>\n",
|
|
" <th>mmr</th>\n",
|
|
" <th>sellingprice</th>\n",
|
|
" <th>saledate</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>2015</td>\n",
|
|
" <td>Kia</td>\n",
|
|
" <td>Sorento</td>\n",
|
|
" <td>LX</td>\n",
|
|
" <td>SUV</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>5xyktca69fg566472</td>\n",
|
|
" <td>ca</td>\n",
|
|
" <td>5.0</td>\n",
|
|
" <td>16639.0</td>\n",
|
|
" <td>white</td>\n",
|
|
" <td>black</td>\n",
|
|
" <td>kia motors america inc</td>\n",
|
|
" <td>20500.0</td>\n",
|
|
" <td>21500.0</td>\n",
|
|
" <td>Tue Dec 16 2014 12:30:00 GMT-0800 (PST)</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>2015</td>\n",
|
|
" <td>Kia</td>\n",
|
|
" <td>Sorento</td>\n",
|
|
" <td>LX</td>\n",
|
|
" <td>SUV</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>5xyktca69fg561319</td>\n",
|
|
" <td>ca</td>\n",
|
|
" <td>5.0</td>\n",
|
|
" <td>9393.0</td>\n",
|
|
" <td>white</td>\n",
|
|
" <td>beige</td>\n",
|
|
" <td>kia motors america inc</td>\n",
|
|
" <td>20800.0</td>\n",
|
|
" <td>21500.0</td>\n",
|
|
" <td>Tue Dec 16 2014 12:30:00 GMT-0800 (PST)</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>2014</td>\n",
|
|
" <td>BMW</td>\n",
|
|
" <td>3 Series</td>\n",
|
|
" <td>328i SULEV</td>\n",
|
|
" <td>Sedan</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>wba3c1c51ek116351</td>\n",
|
|
" <td>ca</td>\n",
|
|
" <td>45.0</td>\n",
|
|
" <td>1331.0</td>\n",
|
|
" <td>gray</td>\n",
|
|
" <td>black</td>\n",
|
|
" <td>financial services remarketing (lease)</td>\n",
|
|
" <td>31900.0</td>\n",
|
|
" <td>30000.0</td>\n",
|
|
" <td>Thu Jan 15 2015 04:30:00 GMT-0800 (PST)</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>2015</td>\n",
|
|
" <td>Volvo</td>\n",
|
|
" <td>S60</td>\n",
|
|
" <td>T5</td>\n",
|
|
" <td>Sedan</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>yv1612tb4f1310987</td>\n",
|
|
" <td>ca</td>\n",
|
|
" <td>41.0</td>\n",
|
|
" <td>14282.0</td>\n",
|
|
" <td>white</td>\n",
|
|
" <td>black</td>\n",
|
|
" <td>volvo na rep/world omni</td>\n",
|
|
" <td>27500.0</td>\n",
|
|
" <td>27750.0</td>\n",
|
|
" <td>Thu Jan 29 2015 04:30:00 GMT-0800 (PST)</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>2014</td>\n",
|
|
" <td>BMW</td>\n",
|
|
" <td>6 Series Gran Coupe</td>\n",
|
|
" <td>650i</td>\n",
|
|
" <td>Sedan</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>wba6b2c57ed129731</td>\n",
|
|
" <td>ca</td>\n",
|
|
" <td>43.0</td>\n",
|
|
" <td>2641.0</td>\n",
|
|
" <td>gray</td>\n",
|
|
" <td>black</td>\n",
|
|
" <td>financial services remarketing (lease)</td>\n",
|
|
" <td>66000.0</td>\n",
|
|
" <td>67000.0</td>\n",
|
|
" <td>Thu Dec 18 2014 12:30:00 GMT-0800 (PST)</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" year make model trim body transmission \\\n",
|
|
"0 2015 Kia Sorento LX SUV automatic \n",
|
|
"1 2015 Kia Sorento LX SUV automatic \n",
|
|
"2 2014 BMW 3 Series 328i SULEV Sedan automatic \n",
|
|
"3 2015 Volvo S60 T5 Sedan automatic \n",
|
|
"4 2014 BMW 6 Series Gran Coupe 650i Sedan automatic \n",
|
|
"\n",
|
|
" vin state condition odometer color interior \\\n",
|
|
"0 5xyktca69fg566472 ca 5.0 16639.0 white black \n",
|
|
"1 5xyktca69fg561319 ca 5.0 9393.0 white beige \n",
|
|
"2 wba3c1c51ek116351 ca 45.0 1331.0 gray black \n",
|
|
"3 yv1612tb4f1310987 ca 41.0 14282.0 white black \n",
|
|
"4 wba6b2c57ed129731 ca 43.0 2641.0 gray black \n",
|
|
"\n",
|
|
" seller mmr sellingprice \\\n",
|
|
"0 kia motors america inc 20500.0 21500.0 \n",
|
|
"1 kia motors america inc 20800.0 21500.0 \n",
|
|
"2 financial services remarketing (lease) 31900.0 30000.0 \n",
|
|
"3 volvo na rep/world omni 27500.0 27750.0 \n",
|
|
"4 financial services remarketing (lease) 66000.0 67000.0 \n",
|
|
"\n",
|
|
" saledate \n",
|
|
"0 Tue Dec 16 2014 12:30:00 GMT-0800 (PST) \n",
|
|
"1 Tue Dec 16 2014 12:30:00 GMT-0800 (PST) \n",
|
|
"2 Thu Jan 15 2015 04:30:00 GMT-0800 (PST) \n",
|
|
"3 Thu Jan 29 2015 04:30:00 GMT-0800 (PST) \n",
|
|
"4 Thu Dec 18 2014 12:30:00 GMT-0800 (PST) "
|
|
]
|
|
},
|
|
"execution_count": 80,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df = pd.read_csv('car_prices.csv')\n",
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 81,
|
|
"id": "5fb29c05",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(558837, 16)"
|
|
]
|
|
},
|
|
"execution_count": 81,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 82,
|
|
"id": "89130732",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|
"RangeIndex: 558837 entries, 0 to 558836\n",
|
|
"Data columns (total 16 columns):\n",
|
|
" # Column Non-Null Count Dtype \n",
|
|
"--- ------ -------------- ----- \n",
|
|
" 0 year 558837 non-null int64 \n",
|
|
" 1 make 548536 non-null object \n",
|
|
" 2 model 548438 non-null object \n",
|
|
" 3 trim 548186 non-null object \n",
|
|
" 4 body 545642 non-null object \n",
|
|
" 5 transmission 493485 non-null object \n",
|
|
" 6 vin 558833 non-null object \n",
|
|
" 7 state 558837 non-null object \n",
|
|
" 8 condition 547017 non-null float64\n",
|
|
" 9 odometer 558743 non-null float64\n",
|
|
" 10 color 558088 non-null object \n",
|
|
" 11 interior 558088 non-null object \n",
|
|
" 12 seller 558837 non-null object \n",
|
|
" 13 mmr 558799 non-null float64\n",
|
|
" 14 sellingprice 558825 non-null float64\n",
|
|
" 15 saledate 558825 non-null object \n",
|
|
"dtypes: float64(4), int64(1), object(11)\n",
|
|
"memory usage: 68.2+ MB\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"df.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 83,
|
|
"id": "242cd7f0",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>year</th>\n",
|
|
" <th>make</th>\n",
|
|
" <th>model</th>\n",
|
|
" <th>trim</th>\n",
|
|
" <th>body</th>\n",
|
|
" <th>transmission</th>\n",
|
|
" <th>vin</th>\n",
|
|
" <th>state</th>\n",
|
|
" <th>condition</th>\n",
|
|
" <th>odometer</th>\n",
|
|
" <th>color</th>\n",
|
|
" <th>interior</th>\n",
|
|
" <th>seller</th>\n",
|
|
" <th>mmr</th>\n",
|
|
" <th>sellingprice</th>\n",
|
|
" <th>saledate</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>558837.000000</td>\n",
|
|
" <td>548536</td>\n",
|
|
" <td>548438</td>\n",
|
|
" <td>548186</td>\n",
|
|
" <td>545642</td>\n",
|
|
" <td>493485</td>\n",
|
|
" <td>558833</td>\n",
|
|
" <td>558837</td>\n",
|
|
" <td>547017.000000</td>\n",
|
|
" <td>558743.000000</td>\n",
|
|
" <td>558088</td>\n",
|
|
" <td>558088</td>\n",
|
|
" <td>558837</td>\n",
|
|
" <td>558799.000000</td>\n",
|
|
" <td>558825.000000</td>\n",
|
|
" <td>558825</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>96</td>\n",
|
|
" <td>973</td>\n",
|
|
" <td>1963</td>\n",
|
|
" <td>87</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>550297</td>\n",
|
|
" <td>64</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>46</td>\n",
|
|
" <td>17</td>\n",
|
|
" <td>14263</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3766</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>Ford</td>\n",
|
|
" <td>Altima</td>\n",
|
|
" <td>Base</td>\n",
|
|
" <td>Sedan</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>fl</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>black</td>\n",
|
|
" <td>black</td>\n",
|
|
" <td>nissan-infiniti lt</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>Tue Feb 10 2015 01:30:00 GMT-0800 (PST)</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>93554</td>\n",
|
|
" <td>19349</td>\n",
|
|
" <td>55817</td>\n",
|
|
" <td>199437</td>\n",
|
|
" <td>475915</td>\n",
|
|
" <td>22</td>\n",
|
|
" <td>82945</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>110970</td>\n",
|
|
" <td>244329</td>\n",
|
|
" <td>19693</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>5334</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>2010.038927</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>30.672365</td>\n",
|
|
" <td>68320.017767</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>13769.377495</td>\n",
|
|
" <td>13611.358810</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>3.966864</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>13.402832</td>\n",
|
|
" <td>53398.542821</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9679.967174</td>\n",
|
|
" <td>9749.501628</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>1982.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>25.000000</td>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>2007.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>23.000000</td>\n",
|
|
" <td>28371.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7100.000000</td>\n",
|
|
" <td>6900.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>2012.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>35.000000</td>\n",
|
|
" <td>52254.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>12250.000000</td>\n",
|
|
" <td>12100.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>2013.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>42.000000</td>\n",
|
|
" <td>99109.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>18300.000000</td>\n",
|
|
" <td>18200.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>2015.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>49.000000</td>\n",
|
|
" <td>999999.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>182000.000000</td>\n",
|
|
" <td>230000.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" year make model trim body transmission vin \\\n",
|
|
"count 558837.000000 548536 548438 548186 545642 493485 558833 \n",
|
|
"unique NaN 96 973 1963 87 4 550297 \n",
|
|
"top NaN Ford Altima Base Sedan automatic automatic \n",
|
|
"freq NaN 93554 19349 55817 199437 475915 22 \n",
|
|
"mean 2010.038927 NaN NaN NaN NaN NaN NaN \n",
|
|
"std 3.966864 NaN NaN NaN NaN NaN NaN \n",
|
|
"min 1982.000000 NaN NaN NaN NaN NaN NaN \n",
|
|
"25% 2007.000000 NaN NaN NaN NaN NaN NaN \n",
|
|
"50% 2012.000000 NaN NaN NaN NaN NaN NaN \n",
|
|
"75% 2013.000000 NaN NaN NaN NaN NaN NaN \n",
|
|
"max 2015.000000 NaN NaN NaN NaN NaN NaN \n",
|
|
"\n",
|
|
" state condition odometer color interior \\\n",
|
|
"count 558837 547017.000000 558743.000000 558088 558088 \n",
|
|
"unique 64 NaN NaN 46 17 \n",
|
|
"top fl NaN NaN black black \n",
|
|
"freq 82945 NaN NaN 110970 244329 \n",
|
|
"mean NaN 30.672365 68320.017767 NaN NaN \n",
|
|
"std NaN 13.402832 53398.542821 NaN NaN \n",
|
|
"min NaN 1.000000 1.000000 NaN NaN \n",
|
|
"25% NaN 23.000000 28371.000000 NaN NaN \n",
|
|
"50% NaN 35.000000 52254.000000 NaN NaN \n",
|
|
"75% NaN 42.000000 99109.000000 NaN NaN \n",
|
|
"max NaN 49.000000 999999.000000 NaN NaN \n",
|
|
"\n",
|
|
" seller mmr sellingprice \\\n",
|
|
"count 558837 558799.000000 558825.000000 \n",
|
|
"unique 14263 NaN NaN \n",
|
|
"top nissan-infiniti lt NaN NaN \n",
|
|
"freq 19693 NaN NaN \n",
|
|
"mean NaN 13769.377495 13611.358810 \n",
|
|
"std NaN 9679.967174 9749.501628 \n",
|
|
"min NaN 25.000000 1.000000 \n",
|
|
"25% NaN 7100.000000 6900.000000 \n",
|
|
"50% NaN 12250.000000 12100.000000 \n",
|
|
"75% NaN 18300.000000 18200.000000 \n",
|
|
"max NaN 182000.000000 230000.000000 \n",
|
|
"\n",
|
|
" saledate \n",
|
|
"count 558825 \n",
|
|
"unique 3766 \n",
|
|
"top Tue Feb 10 2015 01:30:00 GMT-0800 (PST) \n",
|
|
"freq 5334 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"execution_count": 83,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df.describe(include='all')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 84,
|
|
"id": "75742f81",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(472325, 16)"
|
|
]
|
|
},
|
|
"execution_count": 84,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df = df.dropna()\n",
|
|
"df.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 85,
|
|
"id": "93d5b418",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"year 0\n",
|
|
"make 0\n",
|
|
"model 0\n",
|
|
"trim 0\n",
|
|
"body 0\n",
|
|
"transmission 0\n",
|
|
"vin 0\n",
|
|
"state 0\n",
|
|
"condition 0\n",
|
|
"odometer 0\n",
|
|
"color 0\n",
|
|
"interior 0\n",
|
|
"seller 0\n",
|
|
"mmr 0\n",
|
|
"sellingprice 0\n",
|
|
"saledate 0\n",
|
|
"dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 85,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df.isna().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 86,
|
|
"id": "b039be76",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df['body'] = df['body'].replace({'sedan': 'Sedan'})\n",
|
|
"df['body'] = df['body'].replace({'Suv': 'SUV'})\n",
|
|
"df['body'] = df['body'].replace({'suv': 'SUV'})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 87,
|
|
"id": "92787fae",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"numeric_columns = df.select_dtypes(include=['int', 'float']).columns\n",
|
|
"scaler = MinMaxScaler(feature_range=(0, 1))\n",
|
|
"\n",
|
|
"df_scaled = df.copy()\n",
|
|
"df_scaled[numeric_columns] = scaler.fit_transform(df[numeric_columns])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 88,
|
|
"id": "147f68ca",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>year</th>\n",
|
|
" <th>make</th>\n",
|
|
" <th>model</th>\n",
|
|
" <th>trim</th>\n",
|
|
" <th>body</th>\n",
|
|
" <th>transmission</th>\n",
|
|
" <th>vin</th>\n",
|
|
" <th>state</th>\n",
|
|
" <th>condition</th>\n",
|
|
" <th>odometer</th>\n",
|
|
" <th>color</th>\n",
|
|
" <th>interior</th>\n",
|
|
" <th>seller</th>\n",
|
|
" <th>mmr</th>\n",
|
|
" <th>sellingprice</th>\n",
|
|
" <th>saledate</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>Kia</td>\n",
|
|
" <td>Sorento</td>\n",
|
|
" <td>LX</td>\n",
|
|
" <td>SUV</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>5xyktca69fg566472</td>\n",
|
|
" <td>ca</td>\n",
|
|
" <td>0.083333</td>\n",
|
|
" <td>0.016638</td>\n",
|
|
" <td>white</td>\n",
|
|
" <td>black</td>\n",
|
|
" <td>kia motors america inc</td>\n",
|
|
" <td>0.112515</td>\n",
|
|
" <td>0.093474</td>\n",
|
|
" <td>Tue Dec 16 2014 12:30:00 GMT-0800 (PST)</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>Kia</td>\n",
|
|
" <td>Sorento</td>\n",
|
|
" <td>LX</td>\n",
|
|
" <td>SUV</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>5xyktca69fg561319</td>\n",
|
|
" <td>ca</td>\n",
|
|
" <td>0.083333</td>\n",
|
|
" <td>0.009392</td>\n",
|
|
" <td>white</td>\n",
|
|
" <td>beige</td>\n",
|
|
" <td>kia motors america inc</td>\n",
|
|
" <td>0.114164</td>\n",
|
|
" <td>0.093474</td>\n",
|
|
" <td>Tue Dec 16 2014 12:30:00 GMT-0800 (PST)</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>0.96</td>\n",
|
|
" <td>BMW</td>\n",
|
|
" <td>3 Series</td>\n",
|
|
" <td>328i SULEV</td>\n",
|
|
" <td>Sedan</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>wba3c1c51ek116351</td>\n",
|
|
" <td>ca</td>\n",
|
|
" <td>0.916667</td>\n",
|
|
" <td>0.001330</td>\n",
|
|
" <td>gray</td>\n",
|
|
" <td>black</td>\n",
|
|
" <td>financial services remarketing (lease)</td>\n",
|
|
" <td>0.175161</td>\n",
|
|
" <td>0.130431</td>\n",
|
|
" <td>Thu Jan 15 2015 04:30:00 GMT-0800 (PST)</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>Volvo</td>\n",
|
|
" <td>S60</td>\n",
|
|
" <td>T5</td>\n",
|
|
" <td>Sedan</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>yv1612tb4f1310987</td>\n",
|
|
" <td>ca</td>\n",
|
|
" <td>0.833333</td>\n",
|
|
" <td>0.014281</td>\n",
|
|
" <td>white</td>\n",
|
|
" <td>black</td>\n",
|
|
" <td>volvo na rep/world omni</td>\n",
|
|
" <td>0.150982</td>\n",
|
|
" <td>0.120648</td>\n",
|
|
" <td>Thu Jan 29 2015 04:30:00 GMT-0800 (PST)</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>0.96</td>\n",
|
|
" <td>BMW</td>\n",
|
|
" <td>6 Series Gran Coupe</td>\n",
|
|
" <td>650i</td>\n",
|
|
" <td>Sedan</td>\n",
|
|
" <td>automatic</td>\n",
|
|
" <td>wba6b2c57ed129731</td>\n",
|
|
" <td>ca</td>\n",
|
|
" <td>0.875000</td>\n",
|
|
" <td>0.002640</td>\n",
|
|
" <td>gray</td>\n",
|
|
" <td>black</td>\n",
|
|
" <td>financial services remarketing (lease)</td>\n",
|
|
" <td>0.362550</td>\n",
|
|
" <td>0.291301</td>\n",
|
|
" <td>Thu Dec 18 2014 12:30:00 GMT-0800 (PST)</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" year make model trim body transmission \\\n",
|
|
"0 1.00 Kia Sorento LX SUV automatic \n",
|
|
"1 1.00 Kia Sorento LX SUV automatic \n",
|
|
"2 0.96 BMW 3 Series 328i SULEV Sedan automatic \n",
|
|
"3 1.00 Volvo S60 T5 Sedan automatic \n",
|
|
"4 0.96 BMW 6 Series Gran Coupe 650i Sedan automatic \n",
|
|
"\n",
|
|
" vin state condition odometer color interior \\\n",
|
|
"0 5xyktca69fg566472 ca 0.083333 0.016638 white black \n",
|
|
"1 5xyktca69fg561319 ca 0.083333 0.009392 white beige \n",
|
|
"2 wba3c1c51ek116351 ca 0.916667 0.001330 gray black \n",
|
|
"3 yv1612tb4f1310987 ca 0.833333 0.014281 white black \n",
|
|
"4 wba6b2c57ed129731 ca 0.875000 0.002640 gray black \n",
|
|
"\n",
|
|
" seller mmr sellingprice \\\n",
|
|
"0 kia motors america inc 0.112515 0.093474 \n",
|
|
"1 kia motors america inc 0.114164 0.093474 \n",
|
|
"2 financial services remarketing (lease) 0.175161 0.130431 \n",
|
|
"3 volvo na rep/world omni 0.150982 0.120648 \n",
|
|
"4 financial services remarketing (lease) 0.362550 0.291301 \n",
|
|
"\n",
|
|
" saledate \n",
|
|
"0 Tue Dec 16 2014 12:30:00 GMT-0800 (PST) \n",
|
|
"1 Tue Dec 16 2014 12:30:00 GMT-0800 (PST) \n",
|
|
"2 Thu Jan 15 2015 04:30:00 GMT-0800 (PST) \n",
|
|
"3 Thu Jan 29 2015 04:30:00 GMT-0800 (PST) \n",
|
|
"4 Thu Dec 18 2014 12:30:00 GMT-0800 (PST) "
|
|
]
|
|
},
|
|
"execution_count": 88,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df_scaled.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3df15219",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Podział danych na podzbiory"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 89,
|
|
"id": "8589727c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"car_train, car_dev_test = train_test_split(df, random_state = 0, train_size = 0.8)\n",
|
|
"car_dev, car_test = train_test_split(car_dev_test, random_state = 0, train_size = 0.5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 90,
|
|
"id": "728bfc11",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(377860, 16)\n",
|
|
"(47232, 16)\n",
|
|
"(47233, 16)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(car_train.shape)\n",
|
|
"print(car_dev.shape)\n",
|
|
"print(car_test.shape)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "632d1ab8",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Statystyki zbioru"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 91,
|
|
"id": "edca0016",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Ford 81013\n",
|
|
"Chevrolet 54150\n",
|
|
"Nissan 44043\n",
|
|
"Toyota 35313\n",
|
|
"Dodge 27181\n",
|
|
"Honda 24781\n",
|
|
"Hyundai 18659\n",
|
|
"BMW 17509\n",
|
|
"Kia 15828\n",
|
|
"Chrysler 15133\n",
|
|
"Name: make, dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 91,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df['make'].value_counts().head(10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 92,
|
|
"id": "429a052f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Sedan 211298\n",
|
|
"SUV 120968\n",
|
|
"Hatchback 19351\n",
|
|
"Minivan 18305\n",
|
|
"Coupe 13121\n",
|
|
"Wagon 12023\n",
|
|
"Crew Cab 11508\n",
|
|
"Convertible 7725\n",
|
|
"SuperCrew 6195\n",
|
|
"G Sedan 5644\n",
|
|
"Name: body, dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 92,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df['body'].value_counts().head(10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 93,
|
|
"id": "4c247aca",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"automatic 455963\n",
|
|
"manual 16362\n",
|
|
"Name: transmission, dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 93,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df['transmission'].value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ce3e9252",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|