{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "history_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SXcGWK6GBeDz", "outputId": "ff6683a6-819f-4a8e-d2cc-b5b1871719f8" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'\n", "Downloading car-prices-poland.zip to /content\n", " 0% 0.00/1.64M [00:00=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas) (2022.7.1)\n", "Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.9/dist-packages (from pandas) (1.22.4)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas) (1.15.0)\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: seaborn in /usr/local/lib/python3.9/dist-packages (0.12.2)\n", "Requirement already satisfied: pandas>=0.25 in /usr/local/lib/python3.9/dist-packages (from seaborn) (1.4.4)\n", "Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in /usr/local/lib/python3.9/dist-packages (from seaborn) (3.7.1)\n", "Requirement already satisfied: numpy!=1.24.0,>=1.17 in /usr/local/lib/python3.9/dist-packages (from seaborn) (1.22.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (23.0)\n", "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.4)\n", "Requirement already satisfied: importlib-resources>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (5.12.0)\n", "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0)\n", "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.39.0)\n", "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.0.7)\n", "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (8.4.0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=0.25->seaborn) (2022.7.1)\n", "Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.9/dist-packages (from importlib-resources>=3.2.0->matplotlib!=3.6.1,>=3.1->seaborn) (3.15.0)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.15.0)\n" ] } ] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "cars = pd.read_csv('Car_Prices_Poland_Kaggle.csv')" ], "metadata": { "id": "YWOwBUSMFLkI" }, "execution_count": 8, "outputs": [] }, { "cell_type": "code", "source": [ "cars.describe(include='all')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "juZ7gGxSFkyn", "outputId": "4ab59d9c-a016-45af-aef5-1cb76a8543ab" }, "execution_count": 9, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Unnamed: 0 mark model generation_name year \\\n", "count 117927.000000 117927 117927 87842 117927.000000 \n", "unique NaN 23 328 364 NaN \n", "top NaN audi astra gen-8p-2003-2012 NaN \n", "freq NaN 12031 3331 1567 NaN \n", "mean 58963.000000 NaN NaN NaN 2012.925259 \n", "std 34042.736935 NaN NaN NaN 5.690135 \n", "min 0.000000 NaN NaN NaN 1945.000000 \n", "25% 29481.500000 NaN NaN NaN 2009.000000 \n", "50% 58963.000000 NaN NaN NaN 2013.000000 \n", "75% 88444.500000 NaN NaN NaN 2018.000000 \n", "max 117926.000000 NaN NaN NaN 2022.000000 \n", "\n", " mileage vol_engine fuel city province \\\n", "count 1.179270e+05 117927.000000 117927 117927 117927 \n", "unique NaN NaN 6 4427 23 \n", "top NaN NaN Gasoline Warszawa Mazowieckie \n", "freq NaN NaN 61597 7972 22219 \n", "mean 1.409768e+05 1812.057782 NaN NaN NaN \n", "std 9.236936e+04 643.613438 NaN NaN NaN \n", "min 0.000000e+00 0.000000 NaN NaN NaN \n", "25% 6.700000e+04 1461.000000 NaN NaN NaN \n", "50% 1.462690e+05 1796.000000 NaN NaN NaN \n", "75% 2.030000e+05 1995.000000 NaN NaN NaN \n", "max 2.800000e+06 7600.000000 NaN NaN NaN \n", "\n", " price \n", "count 1.179270e+05 \n", "unique NaN \n", "top NaN \n", "freq NaN \n", "mean 7.029988e+04 \n", "std 8.482458e+04 \n", "min 5.000000e+02 \n", "25% 2.100000e+04 \n", "50% 4.190000e+04 \n", "75% 8.360000e+04 \n", "max 2.399900e+06 " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0markmodelgeneration_nameyearmileagevol_enginefuelcityprovinceprice
count117927.00000011792711792787842117927.0000001.179270e+05117927.0000001179271179271179271.179270e+05
uniqueNaN23328364NaNNaNNaN6442723NaN
topNaNaudiastragen-8p-2003-2012NaNNaNNaNGasolineWarszawaMazowieckieNaN
freqNaN1203133311567NaNNaNNaN61597797222219NaN
mean58963.000000NaNNaNNaN2012.9252591.409768e+051812.057782NaNNaNNaN7.029988e+04
std34042.736935NaNNaNNaN5.6901359.236936e+04643.613438NaNNaNNaN8.482458e+04
min0.000000NaNNaNNaN1945.0000000.000000e+000.000000NaNNaNNaN5.000000e+02
25%29481.500000NaNNaNNaN2009.0000006.700000e+041461.000000NaNNaNNaN2.100000e+04
50%58963.000000NaNNaNNaN2013.0000001.462690e+051796.000000NaNNaNNaN4.190000e+04
75%88444.500000NaNNaNNaN2018.0000002.030000e+051995.000000NaNNaNNaN8.360000e+04
max117926.000000NaNNaNNaN2022.0000002.800000e+067600.000000NaNNaNNaN2.399900e+06
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "code", "source": [ "cars = cars.drop(73436) #wiersz z błednymi danymi" ], "metadata": { "id": "L1nKE2QCKDrw" }, "execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "def normalize(df,feature_name):\n", " result = df.copy()\n", " max_value = df[feature_name].max()\n", " min_value = df[feature_name].min()\n", " result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)\n", " return result" ], "metadata": { "id": "_G1CH_2QQPAF" }, "execution_count": 11, "outputs": [] }, { "cell_type": "markdown", "source": [], "metadata": { "id": "apINI6GKPeda" } }, { "cell_type": "code", "source": [ "cars_normalized = normalize(cars,'vol_engine')\n", "print(cars_normalized)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YPLfex7yPH8v", "outputId": "9b44de1d-7776-40f0-b1a9-eac0d08b52b1" }, "execution_count": 12, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Unnamed: 0 mark model generation_name year mileage \\\n", "0 0 opel combo gen-d-2011 2015 139568 \n", "1 1 opel combo gen-d-2011 2018 31991 \n", "2 2 opel combo gen-d-2011 2015 278437 \n", "3 3 opel combo gen-d-2011 2016 47600 \n", "4 4 opel combo gen-d-2011 2014 103000 \n", "... ... ... ... ... ... ... \n", "117922 117922 volvo xc-90 gen-ii-2014-xc-90 2020 40000 \n", "117923 117923 volvo xc-90 gen-ii-2014-xc-90 2017 51000 \n", "117924 117924 volvo xc-90 gen-ii-2014-xc-90 2016 83500 \n", "117925 117925 volvo xc-90 gen-ii-2014-xc-90 2017 174000 \n", "117926 117926 volvo xc-90 gen-ii-2014-xc-90 2016 189020 \n", "\n", " vol_engine fuel city province price \n", "0 0.164211 Diesel Janki Mazowieckie 35900 \n", "1 0.197237 Diesel Katowice Śląskie 78501 \n", "2 0.210263 Diesel Brzeg Opolskie 27000 \n", "3 0.164211 Diesel Korfantów Opolskie 30800 \n", "4 0.184211 CNG Tarnowskie Góry Śląskie 35900 \n", "... ... ... ... ... ... \n", "117922 0.259079 Hybrid Katowice Śląskie 222790 \n", "117923 0.259079 Diesel Chechło Pierwsze Łódzkie 229900 \n", "117924 0.259079 Gasoline Pruszcz Gdański Pomorskie 135000 \n", "117925 0.259079 Diesel Kalisz Wielkopolskie 154500 \n", "117926 0.259079 Gasoline Sionna Mazowieckie 130000 \n", "\n", "[117926 rows x 11 columns]\n" ] } ] }, { "cell_type": "code", "source": [ "import sklearn\n", "import sklearn.model_selection\n", "cars_train, cars_test = sklearn.model_selection.train_test_split(cars_normalized, test_size=23586, random_state=1)\n", "cars_train[\"province\"].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PZwsQwgeSoHb", "outputId": "8972c3e2-344b-482a-addf-23a799fbb3fb" }, "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Mazowieckie 17750\n", "Śląskie 13441\n", "Wielkopolskie 11162\n", "Małopolskie 7796\n", "Dolnośląskie 7092\n", "Łódzkie 6303\n", "Pomorskie 6094\n", "Kujawsko-pomorskie 4256\n", "Lubelskie 3775\n", "Zachodniopomorskie 3165\n", "Podkarpackie 2826\n", "Świętokrzyskie 2657\n", "Warmińsko-mazurskie 2375\n", "Lubuskie 2220\n", "Podlaskie 1716\n", "Opolskie 1679\n", "Moravian-Silesian Region 27\n", "Wiedeń 2\n", "Berlin 2\n", "Trenczyn 1\n", "Niedersachsen 1\n", "Name: province, dtype: int64" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "cars_dev, cars_test = sklearn.model_selection.train_test_split(cars_test, test_size=11793, random_state=1)\n", "cars_dev[\"province\"].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-ec5RLaXTgWK", "outputId": "227a54eb-6c8f-4faf-c38b-cd3147202e92" }, "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Mazowieckie 2261\n", "Śląskie 1666\n", "Wielkopolskie 1418\n", "Małopolskie 948\n", "Dolnośląskie 867\n", "Łódzkie 775\n", "Pomorskie 766\n", "Kujawsko-pomorskie 532\n", "Lubelskie 504\n", "Zachodniopomorskie 396\n", "Podkarpackie 365\n", "Świętokrzyskie 353\n", "Warmińsko-mazurskie 282\n", "Lubuskie 263\n", "Opolskie 199\n", "Podlaskie 192\n", "Moravian-Silesian Region 4\n", "Nordrhein-Westfalen 1\n", "Berlin 1\n", "Name: province, dtype: int64" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "\n", "cars_test[\"province\"].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2VwezzlzUvZd", "outputId": "5dece8a2-2d6b-4a25-fda5-be7c85b4765d" }, "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Mazowieckie 2208\n", "Śląskie 1599\n", "Wielkopolskie 1436\n", "Małopolskie 1012\n", "Dolnośląskie 879\n", "Łódzkie 806\n", "Pomorskie 745\n", "Kujawsko-pomorskie 583\n", "Lubelskie 461\n", "Zachodniopomorskie 402\n", "Podkarpackie 362\n", "Świętokrzyskie 327\n", "Warmińsko-mazurskie 299\n", "Lubuskie 260\n", "Podlaskie 215\n", "Opolskie 195\n", "Moravian-Silesian Region 4\n", "Name: province, dtype: int64" ] }, "metadata": {}, "execution_count": 16 } ] }, { "cell_type": "code", "source": [ "#Ilość wartości w zbiorach\n", "print(cars_normalized.size)\n", "print(cars_train.size)\n", "print(cars_dev.size)\n", "print(cars_test.size)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sprjCCXTV8W0", "outputId": "3b12b8c4-279f-4751-f801-e97d2c81c01b" }, "execution_count": 17, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "1297186\n", "1037740\n", "129723\n", "129723\n" ] } ] }, { "cell_type": "code", "source": [ "#Średnie wartości parametrów\n", "print(cars_normalized['price'].mean())\n", "print(cars_train['price'].mean())\n", "print(cars_dev['price'].mean())\n", "print(cars_test['price'].mean())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TjvBBTAsXbUK", "outputId": "644543bb-acb6-4bda-de01-ab92514b7de8" }, "execution_count": 18, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "70299.94754337466\n", "70432.62519609921\n", "69244.09963537692\n", "70294.41923174764\n" ] } ] }, { "cell_type": "code", "source": [ "#Najmniejsze ceny pojazdów\n", "print(cars_normalized['price'].min())\n", "print(cars_train['price'].min())\n", "print(cars_dev['price'].min())\n", "print(cars_test['price'].min())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GJI2qf-1YLbp", "outputId": "20aec129-96c9-4adb-f3cb-25db4b2dc207" }, "execution_count": 19, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "500\n", "500\n", "1250\n", "900\n" ] } ] }, { "cell_type": "code", "source": [ "#Największe ceny pojazdów\n", "print(cars_normalized['price'].max())\n", "print(cars_train['price'].max())\n", "print(cars_dev['price'].max())\n", "print(cars_test['price'].max())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Ve8Cvu7IYx-E", "outputId": "ec0b0167-74ad-4118-b1c8-734c80cd9d79" }, "execution_count": 20, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "2399900\n", "2399900\n", "1368341\n", "1000000\n" ] } ] }, { "cell_type": "code", "source": [ "#Odchylenie standardowe\n", "print(cars_normalized['price'].std())\n", "print(cars_train['price'].std())\n", "print(cars_dev['price'].std())\n", "print(cars_test['price'].std())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tGDytphgY7oB", "outputId": "caf5152a-5c5d-42ca-95d5-8aa1afd8d46f" }, "execution_count": 21, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "84824.93470827927\n", "85120.16823252657\n", "82128.74927832028\n", "85111.52408658911\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "9JafBXorXIXy" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#Mediany cen pojazdów\n", "print(cars_normalized['price'].median())\n", "print(cars_train['price'].median())\n", "print(cars_dev['price'].median())\n", "print(cars_test['price'].median())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pdmR9mKpU78C", "outputId": "e0fbd8a5-39b4-441f-8b64-1aaa210ba36c" }, "execution_count": 22, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "41900.0\n", "41900.0\n", "41901.0\n", "40900.0\n" ] } ] }, { "cell_type": "code", "source": [ "#Podział według regionów\n", "cars_normalized[\"province\"].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MXSSLTdR-7xP", "outputId": "7facce01-e2e8-415b-9384-74253f1717d1" }, "execution_count": 26, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Mazowieckie 22219\n", "Śląskie 16706\n", "Wielkopolskie 14016\n", "Małopolskie 9756\n", "Dolnośląskie 8838\n", "Łódzkie 7884\n", "Pomorskie 7605\n", "Kujawsko-pomorskie 5371\n", "Lubelskie 4740\n", "Zachodniopomorskie 3963\n", "Podkarpackie 3553\n", "Świętokrzyskie 3337\n", "Warmińsko-mazurskie 2956\n", "Lubuskie 2743\n", "Podlaskie 2123\n", "Opolskie 2073\n", "Moravian-Silesian Region 35\n", "Berlin 3\n", "Wiedeń 2\n", "Niedersachsen 1\n", "Trenczyn 1\n", "Nordrhein-Westfalen 1\n", "Name: province, dtype: int64" ] }, "metadata": {}, "execution_count": 26 } ] }, { "cell_type": "code", "source": [ "#Podział według marki\n", "cars_normalized[\"mark\"].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XGlwLMbE_Mnf", "outputId": "fd743df6-2043-45ff-bea1-b19f03869eb8" }, "execution_count": 27, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "audi 12031\n", "opel 11914\n", "bmw 11070\n", "volkswagen 10848\n", "ford 9664\n", "mercedes-benz 7136\n", "renault 6976\n", "skoda 5888\n", "toyota 5119\n", "peugeot 5056\n", "volvo 4384\n", "hyundai 4032\n", "kia 3744\n", "nissan 3072\n", "fiat 2880\n", "mazda 2848\n", "seat 2848\n", "citroen 2720\n", "honda 2176\n", "mitsubishi 1120\n", "mini 1088\n", "alfa-romeo 704\n", "chevrolet 608\n", "Name: mark, dtype: int64" ] }, "metadata": {}, "execution_count": 27 } ] }, { "cell_type": "markdown", "source": [], "metadata": { "id": "2a30BavmDAzQ" } } ] }