From b680bf88e15a65c896ffb06ef295df5520ff7778 Mon Sep 17 00:00:00 2001 From: Marek Moryl Date: Thu, 20 Apr 2023 21:01:03 +0200 Subject: [PATCH] Add script --- .../IUM_project-checkpoint.ipynb | 6 - IUM_project.ipynb | 471 ------------------ prepare-dataset.py | 44 ++ test_file | 1 - 4 files changed, 44 insertions(+), 478 deletions(-) delete mode 100644 .ipynb_checkpoints/IUM_project-checkpoint.ipynb delete mode 100644 IUM_project.ipynb create mode 100644 prepare-dataset.py delete mode 100644 test_file diff --git a/.ipynb_checkpoints/IUM_project-checkpoint.ipynb b/.ipynb_checkpoints/IUM_project-checkpoint.ipynb deleted file mode 100644 index 363fcab..0000000 --- a/.ipynb_checkpoints/IUM_project-checkpoint.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/IUM_project.ipynb b/IUM_project.ipynb deleted file mode 100644 index e003d35..0000000 --- a/IUM_project.ipynb +++ /dev/null @@ -1,471 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 4, - "id": "b14199d0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip3 install --upgrade pip\u001b[0m\n", - "Downloading property-salesmelbourne-city.zip to /Users/mmoryl/Projects/UAM/ium_s487183\n", - " 0%| | 0.00/589k [00:00=2020.1 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from pandas) (2023.2)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from pandas) (2.8.2)\n", - "Requirement already satisfied: numpy>=1.21.0 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from pandas) (1.24.2)\n", - "Requirement already satisfied: six>=1.5 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip3 install --upgrade pip\u001b[0m\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0SuburbAddressRoomsTypePriceMethodSellerGDateDistance...BathroomCarLandsizeBuildingAreaYearBuiltCouncilAreaLattitudeLongtitudeRegionnamePropertycount
01Abbotsford85 Turner St2h1480000SBiggin3/12/20162.5...1.01.0202.0NaNNaNYarra-37.79960144.99840Northern Metropolitan4019.0
12Abbotsford25 Bloomburg St2h1035000SBiggin4/02/20162.5...1.00.0156.079.01900.0Yarra-37.80790144.99340Northern Metropolitan4019.0
24Abbotsford5 Charles St3h1465000SPBiggin4/03/20172.5...2.00.0134.0150.01900.0Yarra-37.80930144.99440Northern Metropolitan4019.0
35Abbotsford40 Federation La3h850000PIBiggin4/03/20172.5...2.01.094.0NaNNaNYarra-37.79690144.99690Northern Metropolitan4019.0
46Abbotsford55a Park St4h1600000VBNelson4/06/20162.5...1.02.0120.0142.02014.0Yarra-37.80720144.99410Northern Metropolitan4019.0
..................................................................
1839123540Williamstown8/2 Thompson St2t622500SPGreg26/08/20176.8...2.01.0NaN89.02010.0NaN-37.86393144.90484Western Metropolitan6380.0
1839223541Williamstown96 Verdon St4h2500000PISweeney26/08/20176.8...1.05.0866.0157.01920.0NaN-37.85908144.89299Western Metropolitan6380.0
1839323544Yallambie17 Amaroo Wy4h1100000SBuckingham26/08/201712.7...3.02.0NaNNaNNaNNaN-37.72006145.10547Northern Metropolitan1369.0
1839423545Yarraville6 Agnes St4h1285000SPVillage26/08/20176.3...1.01.0362.0112.01920.0NaN-37.81188144.88449Western Metropolitan6543.0
1839523546Yarraville33 Freeman St4h1050000VBVillage26/08/20176.3...2.02.0NaN139.01950.0NaN-37.81829144.87404Western Metropolitan6543.0
\n", - "

18396 rows × 22 columns

\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 Suburb Address Rooms Type Price Method \\\n", - "0 1 Abbotsford 85 Turner St 2 h 1480000 S \n", - "1 2 Abbotsford 25 Bloomburg St 2 h 1035000 S \n", - "2 4 Abbotsford 5 Charles St 3 h 1465000 SP \n", - "3 5 Abbotsford 40 Federation La 3 h 850000 PI \n", - "4 6 Abbotsford 55a Park St 4 h 1600000 VB \n", - "... ... ... ... ... ... ... ... \n", - "18391 23540 Williamstown 8/2 Thompson St 2 t 622500 SP \n", - "18392 23541 Williamstown 96 Verdon St 4 h 2500000 PI \n", - "18393 23544 Yallambie 17 Amaroo Wy 4 h 1100000 S \n", - "18394 23545 Yarraville 6 Agnes St 4 h 1285000 SP \n", - "18395 23546 Yarraville 33 Freeman St 4 h 1050000 VB \n", - "\n", - " SellerG Date Distance ... Bathroom Car Landsize \\\n", - "0 Biggin 3/12/2016 2.5 ... 1.0 1.0 202.0 \n", - "1 Biggin 4/02/2016 2.5 ... 1.0 0.0 156.0 \n", - "2 Biggin 4/03/2017 2.5 ... 2.0 0.0 134.0 \n", - "3 Biggin 4/03/2017 2.5 ... 2.0 1.0 94.0 \n", - "4 Nelson 4/06/2016 2.5 ... 1.0 2.0 120.0 \n", - "... ... ... ... ... ... ... ... \n", - "18391 Greg 26/08/2017 6.8 ... 2.0 1.0 NaN \n", - "18392 Sweeney 26/08/2017 6.8 ... 1.0 5.0 866.0 \n", - "18393 Buckingham 26/08/2017 12.7 ... 3.0 2.0 NaN \n", - "18394 Village 26/08/2017 6.3 ... 1.0 1.0 362.0 \n", - "18395 Village 26/08/2017 6.3 ... 2.0 2.0 NaN \n", - "\n", - " BuildingArea YearBuilt CouncilArea Lattitude Longtitude \\\n", - "0 NaN NaN Yarra -37.79960 144.99840 \n", - "1 79.0 1900.0 Yarra -37.80790 144.99340 \n", - "2 150.0 1900.0 Yarra -37.80930 144.99440 \n", - "3 NaN NaN Yarra -37.79690 144.99690 \n", - "4 142.0 2014.0 Yarra -37.80720 144.99410 \n", - "... ... ... ... ... ... \n", - "18391 89.0 2010.0 NaN -37.86393 144.90484 \n", - "18392 157.0 1920.0 NaN -37.85908 144.89299 \n", - "18393 NaN NaN NaN -37.72006 145.10547 \n", - "18394 112.0 1920.0 NaN -37.81188 144.88449 \n", - "18395 139.0 1950.0 NaN -37.81829 144.87404 \n", - "\n", - " Regionname Propertycount \n", - "0 Northern Metropolitan 4019.0 \n", - "1 Northern Metropolitan 4019.0 \n", - "2 Northern Metropolitan 4019.0 \n", - "3 Northern Metropolitan 4019.0 \n", - "4 Northern Metropolitan 4019.0 \n", - "... ... ... \n", - "18391 Western Metropolitan 6380.0 \n", - "18392 Western Metropolitan 6380.0 \n", - "18393 Northern Metropolitan 1369.0 \n", - "18394 Western Metropolitan 6543.0 \n", - "18395 Western Metropolitan 6543.0 \n", - "\n", - "[18396 rows x 22 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "!pip3 install pandas\n", - "import pandas as pd\n", - "sells = pd.read_csv('data/Property Sales of Melbourne City.csv')\n", - "sells\n", - "# sells[\"Car Model\"].value_counts()\n", - "# len(sells.index)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0768cc2e", - "metadata": {}, - "outputs": [], - "source": [ - "!pip3 install scikit-learn\n", - "from sklearn.model_selection import train_test_split" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/prepare-dataset.py b/prepare-dataset.py new file mode 100644 index 0000000..3d5e41e --- /dev/null +++ b/prepare-dataset.py @@ -0,0 +1,44 @@ +import pandas as pd +from sklearn.model_selection import train_test_split + +# get data +sells = pd.read_csv('data/Property Sales of Melbourne City.csv') + +# delete unnecessary columns and drop rows with NaN values +columns_to_drop = [ + 'Lattitude', + 'Longtitude', + 'CouncilArea', + 'Propertycount', + 'Method', + 'SellerG', + 'Date', + 'Postcode', + 'Bedroom2', + 'Bathroom', + 'Car', + 'BuildingArea', + 'Address' + ] +sells = sells.drop(columns_to_drop, axis=1).dropna() + +# normalize values +sells["Price"] = sells["Price"] / sells["Price"].max() +sells["Landsize"] = sells["Landsize"] / sells["Landsize"].max() +sells["Distance"] = sells["Distance"] / sells["Distance"].max() + +# split to train/dev/test subsets +X = sells +Y = sells.pop('Price') + +X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=1) +X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1) + +# save subsets to files +X_train.to_csv('X_train.csv', index=False) +X_val.to_csv('X_val.csv', index=False) +X_test.to_csv('X_test.csv', index=False) + +Y_train.to_csv('Y_train.csv', index=False) +Y_val.to_csv('Y_val.csv', index=False) +Y_test.to_csv('Y_test.csv', index=False) diff --git a/test_file b/test_file deleted file mode 100644 index 3598c30..0000000 --- a/test_file +++ /dev/null @@ -1 +0,0 @@ -tests \ No newline at end of file