Add script

This commit is contained in:
Marek Moryl 2023-04-20 21:01:03 +02:00
parent d087d791a5
commit b680bf88e1
4 changed files with 44 additions and 478 deletions

View File

@ -1,6 +0,0 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,471 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "b14199d0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip3 install --upgrade pip\u001b[0m\n",
"Downloading property-salesmelbourne-city.zip to /Users/mmoryl/Projects/UAM/ium_s487183\n",
" 0%| | 0.00/589k [00:00<?, ?B/s]\n",
"100%|████████████████████████████████████████| 589k/589k [00:00<00:00, 7.61MB/s]\n",
"Archive: property-salesmelbourne-city.zip\n",
" inflating: data/Property Sales of Melbourne City.csv \n",
"Property Sales of Melbourne City.csv\n"
]
}
],
"source": [
"!pip3 install -q kaggle\n",
"!kaggle datasets download amalab182/property-salesmelbourne-city\n",
"!mkdir -p data\n",
"!unzip -o property-salesmelbourne-city.zip -d data\n",
"!rm property-salesmelbourne-city.zip\n",
"!ls data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "10a21817",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (1.5.3)\n",
"Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from pandas) (2023.2)\n",
"Requirement already satisfied: python-dateutil>=2.8.1 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: numpy>=1.21.0 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from pandas) (1.24.2)\n",
"Requirement already satisfied: six>=1.5 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip3 install --upgrade pip\u001b[0m\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Suburb</th>\n",
" <th>Address</th>\n",
" <th>Rooms</th>\n",
" <th>Type</th>\n",
" <th>Price</th>\n",
" <th>Method</th>\n",
" <th>SellerG</th>\n",
" <th>Date</th>\n",
" <th>Distance</th>\n",
" <th>...</th>\n",
" <th>Bathroom</th>\n",
" <th>Car</th>\n",
" <th>Landsize</th>\n",
" <th>BuildingArea</th>\n",
" <th>YearBuilt</th>\n",
" <th>CouncilArea</th>\n",
" <th>Lattitude</th>\n",
" <th>Longtitude</th>\n",
" <th>Regionname</th>\n",
" <th>Propertycount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Abbotsford</td>\n",
" <td>85 Turner St</td>\n",
" <td>2</td>\n",
" <td>h</td>\n",
" <td>1480000</td>\n",
" <td>S</td>\n",
" <td>Biggin</td>\n",
" <td>3/12/2016</td>\n",
" <td>2.5</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>202.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Yarra</td>\n",
" <td>-37.79960</td>\n",
" <td>144.99840</td>\n",
" <td>Northern Metropolitan</td>\n",
" <td>4019.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Abbotsford</td>\n",
" <td>25 Bloomburg St</td>\n",
" <td>2</td>\n",
" <td>h</td>\n",
" <td>1035000</td>\n",
" <td>S</td>\n",
" <td>Biggin</td>\n",
" <td>4/02/2016</td>\n",
" <td>2.5</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>156.0</td>\n",
" <td>79.0</td>\n",
" <td>1900.0</td>\n",
" <td>Yarra</td>\n",
" <td>-37.80790</td>\n",
" <td>144.99340</td>\n",
" <td>Northern Metropolitan</td>\n",
" <td>4019.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4</td>\n",
" <td>Abbotsford</td>\n",
" <td>5 Charles St</td>\n",
" <td>3</td>\n",
" <td>h</td>\n",
" <td>1465000</td>\n",
" <td>SP</td>\n",
" <td>Biggin</td>\n",
" <td>4/03/2017</td>\n",
" <td>2.5</td>\n",
" <td>...</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>134.0</td>\n",
" <td>150.0</td>\n",
" <td>1900.0</td>\n",
" <td>Yarra</td>\n",
" <td>-37.80930</td>\n",
" <td>144.99440</td>\n",
" <td>Northern Metropolitan</td>\n",
" <td>4019.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5</td>\n",
" <td>Abbotsford</td>\n",
" <td>40 Federation La</td>\n",
" <td>3</td>\n",
" <td>h</td>\n",
" <td>850000</td>\n",
" <td>PI</td>\n",
" <td>Biggin</td>\n",
" <td>4/03/2017</td>\n",
" <td>2.5</td>\n",
" <td>...</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>94.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Yarra</td>\n",
" <td>-37.79690</td>\n",
" <td>144.99690</td>\n",
" <td>Northern Metropolitan</td>\n",
" <td>4019.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>Abbotsford</td>\n",
" <td>55a Park St</td>\n",
" <td>4</td>\n",
" <td>h</td>\n",
" <td>1600000</td>\n",
" <td>VB</td>\n",
" <td>Nelson</td>\n",
" <td>4/06/2016</td>\n",
" <td>2.5</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>120.0</td>\n",
" <td>142.0</td>\n",
" <td>2014.0</td>\n",
" <td>Yarra</td>\n",
" <td>-37.80720</td>\n",
" <td>144.99410</td>\n",
" <td>Northern Metropolitan</td>\n",
" <td>4019.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18391</th>\n",
" <td>23540</td>\n",
" <td>Williamstown</td>\n",
" <td>8/2 Thompson St</td>\n",
" <td>2</td>\n",
" <td>t</td>\n",
" <td>622500</td>\n",
" <td>SP</td>\n",
" <td>Greg</td>\n",
" <td>26/08/2017</td>\n",
" <td>6.8</td>\n",
" <td>...</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>89.0</td>\n",
" <td>2010.0</td>\n",
" <td>NaN</td>\n",
" <td>-37.86393</td>\n",
" <td>144.90484</td>\n",
" <td>Western Metropolitan</td>\n",
" <td>6380.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18392</th>\n",
" <td>23541</td>\n",
" <td>Williamstown</td>\n",
" <td>96 Verdon St</td>\n",
" <td>4</td>\n",
" <td>h</td>\n",
" <td>2500000</td>\n",
" <td>PI</td>\n",
" <td>Sweeney</td>\n",
" <td>26/08/2017</td>\n",
" <td>6.8</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>866.0</td>\n",
" <td>157.0</td>\n",
" <td>1920.0</td>\n",
" <td>NaN</td>\n",
" <td>-37.85908</td>\n",
" <td>144.89299</td>\n",
" <td>Western Metropolitan</td>\n",
" <td>6380.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18393</th>\n",
" <td>23544</td>\n",
" <td>Yallambie</td>\n",
" <td>17 Amaroo Wy</td>\n",
" <td>4</td>\n",
" <td>h</td>\n",
" <td>1100000</td>\n",
" <td>S</td>\n",
" <td>Buckingham</td>\n",
" <td>26/08/2017</td>\n",
" <td>12.7</td>\n",
" <td>...</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-37.72006</td>\n",
" <td>145.10547</td>\n",
" <td>Northern Metropolitan</td>\n",
" <td>1369.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18394</th>\n",
" <td>23545</td>\n",
" <td>Yarraville</td>\n",
" <td>6 Agnes St</td>\n",
" <td>4</td>\n",
" <td>h</td>\n",
" <td>1285000</td>\n",
" <td>SP</td>\n",
" <td>Village</td>\n",
" <td>26/08/2017</td>\n",
" <td>6.3</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>362.0</td>\n",
" <td>112.0</td>\n",
" <td>1920.0</td>\n",
" <td>NaN</td>\n",
" <td>-37.81188</td>\n",
" <td>144.88449</td>\n",
" <td>Western Metropolitan</td>\n",
" <td>6543.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18395</th>\n",
" <td>23546</td>\n",
" <td>Yarraville</td>\n",
" <td>33 Freeman St</td>\n",
" <td>4</td>\n",
" <td>h</td>\n",
" <td>1050000</td>\n",
" <td>VB</td>\n",
" <td>Village</td>\n",
" <td>26/08/2017</td>\n",
" <td>6.3</td>\n",
" <td>...</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>139.0</td>\n",
" <td>1950.0</td>\n",
" <td>NaN</td>\n",
" <td>-37.81829</td>\n",
" <td>144.87404</td>\n",
" <td>Western Metropolitan</td>\n",
" <td>6543.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>18396 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Suburb Address Rooms Type Price Method \\\n",
"0 1 Abbotsford 85 Turner St 2 h 1480000 S \n",
"1 2 Abbotsford 25 Bloomburg St 2 h 1035000 S \n",
"2 4 Abbotsford 5 Charles St 3 h 1465000 SP \n",
"3 5 Abbotsford 40 Federation La 3 h 850000 PI \n",
"4 6 Abbotsford 55a Park St 4 h 1600000 VB \n",
"... ... ... ... ... ... ... ... \n",
"18391 23540 Williamstown 8/2 Thompson St 2 t 622500 SP \n",
"18392 23541 Williamstown 96 Verdon St 4 h 2500000 PI \n",
"18393 23544 Yallambie 17 Amaroo Wy 4 h 1100000 S \n",
"18394 23545 Yarraville 6 Agnes St 4 h 1285000 SP \n",
"18395 23546 Yarraville 33 Freeman St 4 h 1050000 VB \n",
"\n",
" SellerG Date Distance ... Bathroom Car Landsize \\\n",
"0 Biggin 3/12/2016 2.5 ... 1.0 1.0 202.0 \n",
"1 Biggin 4/02/2016 2.5 ... 1.0 0.0 156.0 \n",
"2 Biggin 4/03/2017 2.5 ... 2.0 0.0 134.0 \n",
"3 Biggin 4/03/2017 2.5 ... 2.0 1.0 94.0 \n",
"4 Nelson 4/06/2016 2.5 ... 1.0 2.0 120.0 \n",
"... ... ... ... ... ... ... ... \n",
"18391 Greg 26/08/2017 6.8 ... 2.0 1.0 NaN \n",
"18392 Sweeney 26/08/2017 6.8 ... 1.0 5.0 866.0 \n",
"18393 Buckingham 26/08/2017 12.7 ... 3.0 2.0 NaN \n",
"18394 Village 26/08/2017 6.3 ... 1.0 1.0 362.0 \n",
"18395 Village 26/08/2017 6.3 ... 2.0 2.0 NaN \n",
"\n",
" BuildingArea YearBuilt CouncilArea Lattitude Longtitude \\\n",
"0 NaN NaN Yarra -37.79960 144.99840 \n",
"1 79.0 1900.0 Yarra -37.80790 144.99340 \n",
"2 150.0 1900.0 Yarra -37.80930 144.99440 \n",
"3 NaN NaN Yarra -37.79690 144.99690 \n",
"4 142.0 2014.0 Yarra -37.80720 144.99410 \n",
"... ... ... ... ... ... \n",
"18391 89.0 2010.0 NaN -37.86393 144.90484 \n",
"18392 157.0 1920.0 NaN -37.85908 144.89299 \n",
"18393 NaN NaN NaN -37.72006 145.10547 \n",
"18394 112.0 1920.0 NaN -37.81188 144.88449 \n",
"18395 139.0 1950.0 NaN -37.81829 144.87404 \n",
"\n",
" Regionname Propertycount \n",
"0 Northern Metropolitan 4019.0 \n",
"1 Northern Metropolitan 4019.0 \n",
"2 Northern Metropolitan 4019.0 \n",
"3 Northern Metropolitan 4019.0 \n",
"4 Northern Metropolitan 4019.0 \n",
"... ... ... \n",
"18391 Western Metropolitan 6380.0 \n",
"18392 Western Metropolitan 6380.0 \n",
"18393 Northern Metropolitan 1369.0 \n",
"18394 Western Metropolitan 6543.0 \n",
"18395 Western Metropolitan 6543.0 \n",
"\n",
"[18396 rows x 22 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"!pip3 install pandas\n",
"import pandas as pd\n",
"sells = pd.read_csv('data/Property Sales of Melbourne City.csv')\n",
"sells\n",
"# sells[\"Car Model\"].value_counts()\n",
"# len(sells.index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0768cc2e",
"metadata": {},
"outputs": [],
"source": [
"!pip3 install scikit-learn\n",
"from sklearn.model_selection import train_test_split"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

44
prepare-dataset.py Normal file
View File

@ -0,0 +1,44 @@
import pandas as pd
from sklearn.model_selection import train_test_split
# path of the raw sales data read by this script
INPUT_CSV = 'data/Property Sales of Melbourne City.csv'

# columns that are not used as model features
COLUMNS_TO_DROP = [
    'Lattitude',
    'Longtitude',
    'CouncilArea',
    'Propertycount',
    'Method',
    'SellerG',
    'Date',
    'Postcode',
    'Bedroom2',
    'Bathroom',
    'Car',
    'BuildingArea',
    'Address'
]

# pandas gives a header-less CSV column the name 'Unnamed: 0'; when present it
# is a row identifier stored in the file, not a property attribute, so it must
# not reach the feature matrix
INDEX_ARTIFACT_COLUMN = 'Unnamed: 0'

# column the model is expected to predict
TARGET_COLUMN = 'Price'

# numeric columns rescaled by dividing by their maximum
COLUMNS_TO_NORMALIZE = ('Price', 'Landsize', 'Distance')


def clean_sales(raw):
    """Return a copy of ``raw`` without unused columns and incomplete rows.

    Args:
        raw: DataFrame as loaded from the sales CSV. It must contain every
            column listed in ``COLUMNS_TO_DROP``.

    Returns:
        A new DataFrame; ``raw`` itself is not modified.

    Raises:
        KeyError: if one of ``COLUMNS_TO_DROP`` is missing from ``raw``.
    """
    kept = raw.drop(columns=COLUMNS_TO_DROP)
    # the index artifact is optional: it only exists when the CSV carries it
    kept = kept.drop(columns=[INDEX_ARTIFACT_COLUMN], errors='ignore')
    # rows are filtered only after the unused columns are gone, so a NaN in a
    # dropped column never costs a row
    return kept.dropna()


def normalize_columns(frame, columns=COLUMNS_TO_NORMALIZE):
    """Return a copy of ``frame`` with each of ``columns`` divided by its max.

    NOTE(review): the maxima are taken over whatever frame is passed in. The
    script calls this before splitting, so validation/test rows influence the
    scaling of the training rows. Kept that way to preserve the values the
    script has always emitted.
    """
    scaled = {name: frame[name] / frame[name].max() for name in columns}
    return frame.assign(**scaled)


def prepare_dataset(raw):
    """Clean and normalize ``raw`` and separate the target from the features.

    Args:
        raw: DataFrame as loaded from the sales CSV.

    Returns:
        Tuple ``(features, target)`` where ``features`` is a DataFrame without
        ``TARGET_COLUMN`` and ``target`` is the matching Series.
    """
    prepared = normalize_columns(clean_sales(raw))
    target = prepared[TARGET_COLUMN]
    features = prepared.drop(columns=[TARGET_COLUMN])
    return features, target


def split_subsets(features, target, holdout_fraction=0.3, seed=1):
    """Split into train / validation / test subsets.

    ``holdout_fraction`` of the rows is set aside and then halved into the
    validation and test subsets (70/15/15 with the defaults).

    Returns:
        Dict mapping output file stem (``'X_train'``, ``'X_val'``, ...) to the
        corresponding DataFrame or Series.
    """
    X_train, X_temp, Y_train, Y_temp = train_test_split(
        features, target, test_size=holdout_fraction, random_state=seed)
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_temp, Y_temp, test_size=0.5, random_state=seed)
    return {
        'X_train': X_train,
        'X_val': X_val,
        'X_test': X_test,
        'Y_train': Y_train,
        'Y_val': Y_val,
        'Y_test': Y_test,
    }


def main(input_path=INPUT_CSV):
    """Read the raw CSV and write the six subset files to the working directory."""
    raw = pd.read_csv(input_path)
    features, target = prepare_dataset(raw)
    # save subsets to files
    for stem, subset in split_subsets(features, target).items():
        subset.to_csv(f'{stem}.csv', index=False)


if __name__ == '__main__':
    main()

View File

@ -1 +0,0 @@
tests