diff --git a/README.md b/README.md index 315d5fd..ce48fec 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,2 @@ -##Projekt na przedmiot inżynieria oprogramowania +## Projekt na przedmiot inżynieria oprogramowania -#### Starting script: - -1. pip install --user kaggle #API Kaggle, do pobrania zbioru -2. pip install --user pandas -3. kaggle datasets download -d akash14/house-price-dataset (U have to have Kaggle token, more info here: https://www.kaggle.com/docs/api) -4. unzip -o house-price-dataset.zip / tar -xf .\house-price-dataset.zip (for windows) - -data from https://www.kaggle.com/datasets/akash14/house-price-dataset diff --git a/src/preparation.py b/src/preparation.py deleted file mode 100644 index 1474cb9..0000000 --- a/src/preparation.py +++ /dev/null @@ -1,36 +0,0 @@ -import pandas as pd -import os - -# rename files -# os.rename('../Participants_Data_HPP/Train.csv', '../Participants_Data_HPP/Test1.csv') -# os.rename('../Participants_Data_HPP/Test.csv', '../Participants_Data_HPP/Train1.csv') - -# paths -filePathTest = "../Participants_Data_HPP/Train.csv" -filePathTrain = "../Participants_Data_HPP/Test.csv" - -dataTest = pd.read_csv(filePathTest) -dataTrain = pd.read_csv(filePathTrain) - -number_lines = len(dataTest.index) -row_size = number_lines // 2 - -# start looping through data writing it to a new file for each set -# no of csv files with row size -k = 2 -size = row_size - -# split test data to test and dev -for i in range(k): - df = dataTest[size * i:size * (i + 1)] - name = "" - if i == 0: - name = "Dev" - else: - name = "Test" - df.to_csv(f'../Participants_Data_HPP/' + name + '.csv', index=False) - -#df_1 = pd.read_csv("../Participants_Data_HPP/Dev.csv") - -#df_2 = pd.read_csv("../Participants_Data_HPP/Test.csv") - diff --git a/src/statistics.py b/src/statistics.py deleted file mode 100644 index cf36d15..0000000 --- a/src/statistics.py +++ /dev/null @@ -1,31 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt - -#https://www.kaggle.com/code/aadhavvignesh/regression-with-scikit-learn-practical-ml-1 - -dataPath = '../Participants_Data_HPP/Train.csv' -data = pd.read_csv(dataPath) -info = data.info() -description = data.describe(include="all") -corr = data.corr() - -#select the most significant -data = data[['TARGET(PRICE_IN_LACS)', 'SQUARE_FT', 'BHK_NO.', 'RESALE']] -#print(data.head()) -data["BHK_NO."].value_counts().plot(kind="bar") -#plt.show() - -#normalize price column and flat area using min max technique -columnName1 = 'TARGET(PRICE_IN_LACS)' -columnName2 = 'SQUARE_FT' - -column1Min = data[columnName1].min() -column1Max = data[columnName1].max() -column2Min = data[columnName2].min() -column2Max = data[columnName2].max() - -data[columnName1] = (data[columnName1] - column1Min) / (column1Max - column1Min) -data[columnName2] = (data[columnName2] - column2Min) / (column2Max - column2Min) - -print(data.head()) -print(data.describe(include="all")) diff --git a/src/task1.ipynb b/src/task1.ipynb new file mode 100644 index 0000000..fdc3fed --- /dev/null +++ b/src/task1.ipynb @@ -0,0 +1,414 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scripts for first task" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Download data:\n", + "\n", + "run commands:\n", + "1. pip install --user kaggle #API Kaggle, do pobrania zbioru\n", + "2. pip install --user pandas\n", + "3. kaggle datasets download -d akash14/house-price-dataset (U have to have Kaggle token, more info here: https://www.kaggle.com/docs/api)\n", + "4. unzip -o house-price-dataset.zip / tar -xf .\\house-price-dataset.zip (for windows)\n", + "\n", + "data from https://www.kaggle.com/datasets/akash14/house-price-dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# paths\n", + "filePathTest = \"../Participants_Data_HPP/Train.csv\"\n", + "filePathTrain = \"../Participants_Data_HPP/Test.csv\"\n", + "\n", + "dataTest = pd.read_csv(filePathTest)\n", + "dataTrain = pd.read_csv(filePathTrain)\n", + "\n", + "number_lines = len(dataTest.index)\n", + "row_size = number_lines // 2\n", + "\n", + "# start looping through data writing it to a new file for each set\n", + "# no of csv files with row size\n", + "k = 2\n", + "size = row_size\n", + "\n", + "# split test data to test and dev\n", + "for i in range(k):\n", + " df = dataTest[size * i:size * (i + 1)]\n", + " name = \"\"\n", + " if i == 0:\n", + " name = \"Dev\"\n", + " else:\n", + " name = \"Test\"\n", + " df.to_csv(f'../Participants_Data_HPP/' + name + '.csv', index=False)\n", + "\n", + "#df_1 = pd.read_csv(\"../Participants_Data_HPP/Dev.csv\")\n", + "\n", + "#df_2 = pd.read_csv(\"../Participants_Data_HPP/Test.csv\")\n", + "\n", + "#df_2 = pd.read_csv(\"../Participants_Data_HPP/Train.csv\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing data" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "#https://www.kaggle.com/code/aadhavvignesh/regression-with-scikit-learn-practical-ml-1\n", + "\n", + "dataPath = '../Participants_Data_HPP/Train.csv'\n", + "\n", + "#data informations\n", + "data = pd.read_csv(dataPath)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 29451 entries, 0 to 29450\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 TARGET(PRICE_IN_LACS) 29451 non-null float64\n", + " 1 SQUARE_FT 29451 non-null float64\n", + " 2 BHK_NO. 29451 non-null int64 \n", + " 3 RESALE 29451 non-null int64 \n", + "dtypes: float64(2), int64(2)\n", + "memory usage: 920.5 KB\n" + ] + } + ], + "source": [ + "info = data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " POSTED_BY UNDER_CONSTRUCTION RERA BHK_NO. BHK_OR_RK \\\n", + "count 29451 29451.000000 29451.000000 29451.000000 29451 \n", + "unique 3 NaN NaN NaN 2 \n", + "top Dealer NaN NaN NaN BHK \n", + "freq 18291 NaN NaN NaN 29427 \n", + "mean NaN 0.179756 0.317918 2.392279 NaN \n", + "std NaN 0.383991 0.465675 0.879091 NaN \n", + "min NaN 0.000000 0.000000 1.000000 NaN \n", + "25% NaN 0.000000 0.000000 2.000000 NaN \n", + "50% NaN 0.000000 0.000000 2.000000 NaN \n", + "75% NaN 0.000000 1.000000 3.000000 NaN \n", + "max NaN 1.000000 1.000000 20.000000 NaN \n", + "\n", + " SQUARE_FT READY_TO_MOVE RESALE ADDRESS \\\n", + "count 2.945100e+04 29451.000000 29451.000000 29451 \n", + "unique NaN NaN NaN 6899 \n", + "top NaN NaN NaN Zirakpur,Chandigarh \n", + "freq NaN NaN NaN 509 \n", + "mean 1.980217e+04 0.820244 0.929578 NaN \n", + "std 1.901335e+06 0.383991 0.255861 NaN \n", + "min 3.000000e+00 0.000000 0.000000 NaN \n", + "25% 9.000211e+02 1.000000 1.000000 NaN \n", + "50% 1.175057e+03 1.000000 1.000000 NaN \n", + "75% 1.550688e+03 1.000000 1.000000 NaN \n", + "max 2.545455e+08 1.000000 1.000000 NaN \n", + "\n", + " LONGITUDE LATITUDE TARGET(PRICE_IN_LACS) \n", + "count 29451.000000 29451.000000 29451.000000 \n", + "unique NaN NaN NaN \n", + "top NaN NaN NaN \n", + "freq NaN NaN NaN \n", + "mean 21.300255 76.837695 142.898746 \n", + "std 6.205306 10.557747 656.880713 \n", + "min -37.713008 -121.761248 0.250000 \n", + "25% 18.452663 73.798100 38.000000 \n", + "50% 20.750000 77.324137 62.000000 \n", + "75% 26.900926 77.828740 100.000000 \n", + "max 59.912884 152.962676 30000.000000 \n" + ] + } + ], + "source": [ + "description = data.describe(include=\"all\")\n", + "print(description)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " UNDER_CONSTRUCTION RERA BHK_NO. SQUARE_FT \\\n", + "UNDER_CONSTRUCTION 1.000000 0.363826 -0.040712 -0.004204 \n", + "RERA 0.363826 1.000000 0.009547 -0.006229 \n", + "BHK_NO. -0.040712 0.009547 1.000000 0.005303 \n", + "SQUARE_FT -0.004204 -0.006229 0.005303 1.000000 \n", + "READY_TO_MOVE -1.000000 -0.363826 0.040712 0.004204 \n", + "RESALE -0.347405 -0.270351 0.014581 0.001732 \n", + "LONGITUDE 0.006440 0.104976 0.068730 -0.012591 \n", + "LATITUDE -0.000381 -0.065106 0.046930 0.000803 \n", + "TARGET(PRICE_IN_LACS) 0.055399 0.067636 0.112283 0.402685 \n", + "\n", + " READY_TO_MOVE RESALE LONGITUDE LATITUDE \\\n", + "UNDER_CONSTRUCTION -1.000000 -0.347405 0.006440 -0.000381 \n", + "RERA -0.363826 -0.270351 0.104976 -0.065106 \n", + "BHK_NO. 0.040712 0.014581 0.068730 0.046930 \n", + "SQUARE_FT 0.004204 0.001732 -0.012591 0.000803 \n", + "READY_TO_MOVE 1.000000 0.347405 -0.006440 0.000381 \n", + "RESALE 0.347405 1.000000 0.024038 0.014844 \n", + "LONGITUDE -0.006440 0.024038 1.000000 -0.155062 \n", + "LATITUDE 0.000381 0.014844 -0.155062 1.000000 \n", + "TARGET(PRICE_IN_LACS) -0.055399 -0.207378 -0.031112 -0.017254 \n", + "\n", + " TARGET(PRICE_IN_LACS) \n", + "UNDER_CONSTRUCTION 0.055399 \n", + "RERA 0.067636 \n", + "BHK_NO. 0.112283 \n", + "SQUARE_FT 0.402685 \n", + "READY_TO_MOVE -0.055399 \n", + "RESALE -0.207378 \n", + "LONGITUDE -0.031112 \n", + "LATITUDE -0.017254 \n", + "TARGET(PRICE_IN_LACS) 1.000000 \n" + ] + } + ], + "source": [ + "corr = data.corr()\n", + "print(corr)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD7CAYAAACIYvgKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUa0lEQVR4nO3dfZBdd33f8ffHUuxijC3Z3vhBUli1KDCCBDCq7AxpQ3FqyzZFbsdhRDKx4qrRdCIKhXRALp1RCziV0wTXlNipggQ2JRjHkFqJDUYxNrRp/SA/xI+4Xvwo1Q8bJJuCCUHm2z/uT8ll2UXac3dXK+v9mrmz53zP73zv7+7V7ueec89dpaqQJB3aDjvQE5AkHXiGgSTJMJAkGQaSJAwDSRKGgSQJmHugJ9DV8ccfX8PDwwd6GpJ0ULnjjjv+sqqGxtYP2jAYHh5m+/btB3oaknRQSfL4eHVPE0mSDANJkmEgScIwkCRhGEiSMAwkSRgGkiQMA0kSB/GHzsYzvP66/Rr32MZzpnkmknRw8chAkmQYSJIMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJLYjzBIsiXJs0nu66v9pyRfT3JPkj9OMq9v24VJRpI8lOTMvvqKVhtJsr6vvjjJra3+uSSHT+HjkyTth/05MvgUsGJMbRvwuqr6WeD/ABcCJFkKrAJe2/a5LMmcJHOA3wPOApYC72xjAS4GLqmqVwG7gTUDPSJJ0qTtMwyq6mvArjG1L1fVnrZ6C7CwLa8Erqqq71XVo8AIsLzdRqrqkar6a+AqYGWSAG8Frmn7XwGcO9hDkiRN1lS8Z/DPgS+25QXAk33bdrTaRPXjgOf6gmVvXZI0gwYKgyQfBPYAn5ma6ezz/tYm2Z5k++jo6EzcpSQdEjqHQZJfA94G/EpVVSvvBBb1DVvYahPVvwnMSzJ3TH1cVbWpqpZV1bKhoaGuU5ckjdEpDJKsAN4PvL2qXujbtBVYleSIJIuBJcBtwO3Aknbl0OH03mTe2kLkJuC8tv9q4NpuD0WS1NX+XFr6WeB/A69OsiPJGuDjwCuAbUnuTvL7AFV1P3A18ADwJWBdVb3Y3hN4F3AD8CBwdRsL8AHgfUlG6L2HsHlKH6EkaZ/m7mtAVb1znPKEv7Cr6iLgonHq1wPXj1N/hN7VRpKkA8RPIEuSDANJkmEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJIn9+J/ODnXD66/br3GPbTxnmmciSdPHIwNJkmEgSTIMJEkYBpIk9iMMkmxJ8myS+/pqxybZluTh9nV+qyfJx5KMJLknySl9+6xu4x9Osrqv/qYk97Z9PpYkU/0gJUk/3v4cGXwKWDGmth64saqWADe2dYCzgCXttha4HHrhAWwATgWWAxv2Bkgb8+t9+429L0nSNNtnGFTV14BdY8orgSva8hXAuX31K6vnFmBekpOAM4FtVbWrqnYD24AVbdvRVXVLVRVwZV8vSdIM6fqewQlV9VRbfho4oS0vAJ7sG7ej1X5cfcc4dUnSDBr4DeT2ir6mYC77lGRtku1Jto+Ojs7EXUrSIaFrGDzTTvHQvj7b6juBRX3jFrbaj6svHKc+rqraVFXLqmrZ0NBQx6lLksbqGgZbgb1XBK0Gru2rn9+uKjoNeL6dTroBOCPJ/PbG8RnADW3bt5Kc1q4iOr+vlyRphuzzbxMl+SzwFuD4JDvoXRW0Ebg6yRrgceAdbfj1wNnACPACcAFAVe1K8mHg9jbuQ1W1903p36B3xdLLgC+2myRpBu0zDKrqnRNsOn2csQWsm6DPFmDLOPXtwOv2NQ9J0vTxE8iSJMNAkmQYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEgOGQZL3Jrk/yX1JPpvk7yRZnOTWJCNJPpfk8Db2iLY+0rYP9/W5sNUfSnLmgI9JkjRJncMgyQLg3cCyqnodMAdYBVwMXFJVrwJ2A2vaLmuA3a1+SRtHkqVtv9cCK4DLkszpOi9J0uQNeppoLvCyJHOBI4GngLcC17TtVwDntuWVbZ22/fQkafWrqup7VfUoMAIsH3BekqRJ6BwGVbUT+B3gCXoh8DxwB/BcVe1pw3YAC9ryAuDJtu+eNv64/vo4+0iSZsAgp4nm03tVvxg4GXg5vdM80ybJ2iTbk2wfHR2dzruSpEPKIKeJfhF4tKpGq+r7wBeANwPz2mkjgIXAzra8E1gE0LYfA3yzvz7OPj+kqjZV1bKqWjY0NDTA1CVJ/QYJgyeA05Ic2c79nw48ANwEnNfGrAaubctb2zpt+1eqqlp9VbvaaDGwBLhtgHlJkiZp7r6HjK+qbk1yDXAnsAe4C9gEXAdcleQjrba57bIZ+HSSEWAXvSuIqKr7k1xNL0j2AOuq6sWu85IkTV7nMACoqg3AhjHlRxjnaqCq+ivglybocxFw0SBzkSR15yeQJUmGgSTJMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQGDIMk85Jck+TrSR5M8nNJjk2yLcnD7ev8NjZJPpZkJMk9SU7p67O6jX84yepBH5QkaXIGPTK4FPhSVb0GeD3wILAeuLGqlgA3tnWAs4Al7bYWuBwgybHABuBUYDmwYW+ASJJmRucwSHIM8A+BzQBV9ddV9RywEriiDbsCOLctrwSurJ5bgHlJTgLOBLZV1a6q2g1sA1Z0nZckafIGOTJYDIwCn0xyV5JPJHk5cEJVPdXGPA2c0JYXAE/27b+j1SaqS5JmyCBhMBc4Bbi8qt4IfIe/PSUEQFUVUAPcxw9JsjbJ9iTbR0dHp6qtJB3yBgmDHcCOqrq1rV9DLxyeaad/aF+fbdt3Aov69l/YahPVf0RVbaqqZVW1bGhoaICpS5L6dQ6DqnoaeDLJq1vpdOABYCuw94qg1cC1bXkrcH67qug04Pl2OukG4Iwk89sbx2e0miRphswdcP9/BXwmyeHAI8AF9ALm6iRrgMeBd7Sx1wNnAyPAC20sVbUryYeB29u4D1XVrgHnJUmahIHCoKruBpaNs+n0ccYWsG6CPluALYPMRZLUnZ9AliQZBpIkw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJKYgjBIMifJXUn+tK0vTnJrkpEkn0tyeKsf0dZH2vbhvh4XtvpDSc4cdE6SpMmZiiOD9wAP9q1fDFxSVa8CdgNrWn0NsLvVL2njSLIUWAW8FlgBXJZkzhTMS5K0nwYKgyQLgXOAT7T1AG8FrmlDrgDObcsr2zpt++lt/Ergqqr6XlU9CowAyweZlyRpcgY9MvjPwPuBH7T144DnqmpPW98BLGjLC4AnAdr259v4v6mPs48kaQZ0DoMkbwOerao7pnA++7rPtUm2J9k+Ojo6U3crSS95gxwZvBl4e5LHgKvonR66FJiXZG4bsxDY2ZZ3AosA2vZjgG/218fZ54dU1aaqWlZVy4aGhgaYuiSpX+cwqKoLq2phVQ3TewP4K1X1K8BNwHlt2Grg2ra8ta3Ttn+lqqrVV7WrjRYDS4Dbus5LkjR5c/c9ZNI+AFyV5CPAXcDmVt8MfDrJCLCLXoBQVfcnuRp4ANgDrKuqF6dhXpKkCUxJGFTVzcDNbfkRxrkaqKr+CvilCfa/CLhoKuYiSZo8P4EsSTIMJEnT856B9mF4/XX7Ne6xjedM80wkqccjA0mSYSBJMgwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAYIgySLktyU5IEk9yd5T6sfm2Rbkofb1/mtniQfSzKS5J4kp/T1Wt3GP5xk9eAPS5I0GYMcGewBfrOqlgKnAeuSLAXWAzdW1RLgxrYOcBawpN3WApdDLzyADcCpwHJgw94AkSTNjM5hUFVPVdWdbfn/AQ8CC4CVwBVt2BXAuW15JXBl9dwCzEtyEnAmsK2qdlXVbmAbsKLrvCRJkzcl7xkkGQbeCNwKnFBVT7VNTwMntOUFwJN9u+1otYnq493P2iTbk2wfHR2diqlLkpiCMEhyFPB54F9X1bf6t1VVATXoffT121RVy6pq2dDQ0FS1laRD3kBhkOQn6AXBZ6rqC638TDv9Q/v6bKvvBBb17b6w1SaqS5JmyCBXEwXYDDxYVR/t27QV2HtF0Grg2r76+e2qotOA59vppBuAM5LMb28cn9FqkqQZMneAfd8M/Cpwb5K7W+3fAhuBq5OsAR4H3tG2XQ+cDYwALwAXAFTVriQfBm5v4z5UVbsGmNchaXj9dfs17rGN50zzTCQdjDqHQVX9TyATbD59nPEFrJug1xZgS9e5SJIG4yeQJUmGgSTJMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiRg7oGewF5JVgCXAnOAT1TVxgM8pUPe8Prr9mvcYxvPmeaZSJpus+LIIMkc4PeAs4ClwDuTLD2ws5KkQ8dsOTJYDoxU1SMASa4CVgIPHNBZacp5tCHNTqmqAz0HkpwHrKiqf9HWfxU4tareNWbcWmBtW3018NB+tD8e+MspnK497Tmbex4Mc7Tnge35yqoaGlucLUcG+6WqNgGbJrNPku1VtWwq52FPe87WngfDHO05O3vOivcMgJ3Aor71ha0mSZoBsyUMbgeWJFmc5HBgFbD1AM9Jkg4Zs+I0UVXtSfIu4AZ6l5Zuqar7p6j9pE4r2dOeB3nPg2GO9pyFPWfFG8iSpANrtpwmkiQdQIaBJMkwkCS9BMMgyWuSnJ7kqDH1FQP0XJ7k77flpUnel+TsQec6nZJcOQ09f7499jMG6HFqkqPb8suS/Ickf5Lk4iTHdOz57iSL9j1yv/sdnuT8JL/Y1n85yceTrEvyE1N1PxpMkr+b5N8kuTTJR5P8y73/tjR5L6k3kJO8G1gHPAi8AXhPVV3btt1ZVad06LmB3t9MmgtsA04FbgL+MXBDVV00NbP/m/u7oKo+Ocl9xl6GG+AfAV8BqKq3d5zLbVW1vC3/Or3v7R8DZwB/0uWPCSa5H3h9u4JsE/ACcA1weqv/sw49nwe+A3wD+CzwR1U1Otk+ff0+Q+/5PhJ4DjgK+EKbY6pqdYeexwAXAucCPwkU8CxwLbCxqp7rOt9DUftZfxvwNeBs4C56z9U/BX6jqm4+YJM7WFXVS+YG3Asc1ZaHge30AgHgrgF6zqH3i+FbwNGt/jLgnml4DE902OdO4L8BbwF+oX19qi3/wgBzuatv+XZgqC2/HLi3Y88H++c9ZtvdXedJ7yj3DGAzMAp8CVgNvKJDv3va17nAM8Cctp6uzzm9y6Y/AJzYVzux1b48wHN0DLAR+DqwC/gmvRdDG4F5HXuuGNN/M3AP8IfACR36HQ38R+DTwC+P2XZZxzne2/e8HAnc3JZ/aoCf9ROBy+n90czjgH/f7udq4KSOPe8E/h3w97o+x5O8vy923feldprosKr6NkBVPUbvl+JZST5K7we5iz1V9WJVvQB8o6q+1fp/F/hBl4ZJ7pngdi9wQoeWy4A7gA8Cz1fvVdF3q+qrVfXVLnNsDksyP8lx9F4RjwJU1XeAPR173pfkgrb8F0mWAST5aeD7HXtWVf2gqr5cVWuAk4HLgBXAIx36HdY+/PgKer9o9p6+OgLoeppouKourqqn+yb9dFVdDLyyY0/o/aLaDbylqo6tquPoHRXubtu6+K2+5d+l98Lin9B7QfBfO/T7JL2fv88Dq5J8PskRbdtpHecIf/s5qSPoHb1RVU/Q/Tn6FL0/jvkkvaP/79I76vgfwO937DkfmAfclOS2JO9NcnLHXgAkOWWC25vonRHpZibSaqZu9E6LvGFMbS5wJfBix563Ake25cP66scw5pXtJHo+0560V465DQP/d4DHvxD4I+DjdDjCGKffY/R+mT7avp7U6kfR/VX8MfR+6L7Rvrffb72/Su80UZeed/2YbUd26PfeNqfHgXcDNwJ/QO9V4oaOc/wy8H76XlnTC/4PAH82wHP0UJdt++h5Z9/y3WO2Tfp5H6fHB4E/p/fqu+vP0HvoHa38Ab2jogtafQj42qD/jsb+/Azw773/e/kP6L1IeZpe2Kzt2PPF9rvupnFu3+38b6nrjrPx1n4ZnjjBtjd37HnEBPXjgZ/p2HMz8PMTbPvDKfg+nAP81jR+n48EFg/Y42jg9cCb6HDqYUyvn56Gx3gycHJbngecBywfoN984OL2i2s3vVM6D7basQP0nfKQAXYA7wN+s4Vi+rZN+jRZe5yHjan9GnA/8PgAj/217Xl5zRQ953/Rt/yRMdu6nhb9kbCjd9p5BfDJjj3vA5ZMsO3Jro//JfUGsjSbJXkNvRcst1Q7ndnqK6rqSx17zgfW0/v/P36ylZ+h97e9NlbV7g49N4wpXVZVo0lOBH67qs6fZL/fpve+yJ+Nqa8A/ktVLZnsHKdDkg/Re3zfHlN/Fb3v5Xkdel5VVaumao6t53n0wulH/oR/knOr6r936msYSNNvOq5024/7nPSVaTPdczrmOB0Ohu/loD0NA2kGtIsDfq6qvp1kmN7ltJ+uqkuT3FVVb5yG+3yiqn5qNvecjjlOh4Phezloz1nxV0ulQ8APXemW5C3ANUleSfcr3Uhyz0Sb6HZl2pT3nI45ToeD4Xs5XT3BMJBmyjNJ3lBVdwO0I4S3AVuAnxmg7wnAmfTelO4X4H/Nkp7TMcfpcDB8L6erp2EgzZDzGfPZjKraA5yfpMu1+3v9Kb0PWt49dkOSm2dJz+mY43Q4GL6X09XT9wwkSS/BP1QnSZo8w0CSZBhIkgwDSRKGgSQJ+P/uPiGXBAZ4vAAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#print(data.head())\n", + "data[\"BHK_NO.\"].value_counts().plot(kind=\"bar\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TARGET(PRICE_IN_LACS)SQUARE_FTBHK_NO.RESALE
count29451.00000029451.00000029451.00000029451.000000
mean0.0047550.0000782.3922790.929578
std0.0218960.0074700.8790910.255861
min0.0000000.0000001.0000000.000000
25%0.0012580.0000042.0000001.000000
50%0.0020580.0000052.0000001.000000
75%0.0033250.0000063.0000001.000000
max1.0000001.00000020.0000001.000000
\n", + "
" + ], + "text/plain": [ + " TARGET(PRICE_IN_LACS) SQUARE_FT BHK_NO. RESALE\n", + "count 29451.000000 29451.000000 29451.000000 29451.000000\n", + "mean 0.004755 0.000078 2.392279 0.929578\n", + "std 0.021896 0.007470 0.879091 0.255861\n", + "min 0.000000 0.000000 1.000000 0.000000\n", + "25% 0.001258 0.000004 2.000000 1.000000\n", + "50% 0.002058 0.000005 2.000000 1.000000\n", + "75% 0.003325 0.000006 3.000000 1.000000\n", + "max 1.000000 1.000000 20.000000 1.000000" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#select the most significant\n", + "data = data[['TARGET(PRICE_IN_LACS)', 'SQUARE_FT', 'BHK_NO.', 'RESALE']]\n", + "#normalize price column and flat area using min max technique\n", + "columnName1 = 'TARGET(PRICE_IN_LACS)'\n", + "columnName2 = 'SQUARE_FT'\n", + "\n", + "column1Min = data[columnName1].min()\n", + "column1Max = data[columnName1].max()\n", + "column2Min = data[columnName2].min()\n", + "column2Max = data[columnName2].max()\n", + "\n", + "data[columnName1] = (data[columnName1] - column1Min) / (column1Max - column1Min)\n", + "data[columnName2] = (data[columnName2] - column2Min) / (column2Max - column2Min)\n", + "\n", + "data.describe(include=\"all\")" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "af5b2cd8b73544b3367a1819c4d38f2ddf8e465f90aac518592405ee31677a58" + }, + "kernelspec": { + "display_name": "Python 3.8.3 32-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}