Python2018/labs05/pandas_wprowadzenie.ipynb

2310 lines
72 KiB
Plaintext
Raw Normal View History

2018-07-17 16:45:17 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Analiza danych w Pythonie\n",
"\n",
"### Tomasz Dwojak\n",
"\n",
"### 3 czerwca 2018"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Analiza danych:\n",
"\n",
" * R\n",
" * Python"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Ekosystem Pythona\n",
"\n",
" * pandas: ramka danych\n",
" * scikit-learn (`sklearn`): modele ML\n",
" * numpy: obliczenia\n",
" * matplotlib: wykresy"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Typy danych\n",
"\n",
" * Szereg (`pd.Series`)\n",
" * Ramka danych (`pd.DataFrame`)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Wczytanie danych"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"data = pd.read_csv(\"./data/iowa.csv.gz\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>Street</th>\n",
" <th>Alley</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>...</th>\n",
" <th>PoolArea</th>\n",
" <th>PoolQC</th>\n",
" <th>Fence</th>\n",
" <th>MiscFeature</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" <th>SalePrice</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>65.0</td>\n",
" <td>8450</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>208500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>80.0</td>\n",
" <td>9600</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2007</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>181500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>68.0</td>\n",
" <td>11250</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>223500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>RL</td>\n",
" <td>60.0</td>\n",
" <td>9550</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Abnorml</td>\n",
" <td>140000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>84.0</td>\n",
" <td>14260</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>250000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 81 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
"0 1 60 RL 65.0 8450 Pave NaN Reg \n",
"1 2 20 RL 80.0 9600 Pave NaN Reg \n",
"2 3 60 RL 68.0 11250 Pave NaN IR1 \n",
"3 4 70 RL 60.0 9550 Pave NaN IR1 \n",
"4 5 60 RL 84.0 14260 Pave NaN IR1 \n",
"\n",
" LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal \\\n",
"0 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"1 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"2 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"3 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"4 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"\n",
" MoSold YrSold SaleType SaleCondition SalePrice \n",
"0 2 2008 WD Normal 208500 \n",
"1 5 2007 WD Normal 181500 \n",
"2 9 2008 WD Normal 223500 \n",
"3 2 2006 WD Abnorml 140000 \n",
"4 12 2008 WD Normal 250000 \n",
"\n",
"[5 rows x 81 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1460, 81)\n"
]
}
],
"source": [
"shape = data.shape\n",
"rows = shape[0]\n",
"cols = shape[1]\n",
"\n",
"print(rows, cols)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1460 entries, 0 to 1459\n",
"Data columns (total 81 columns):\n",
"Id 1460 non-null int64\n",
"MSSubClass 1460 non-null int64\n",
"MSZoning 1460 non-null object\n",
"LotFrontage 1201 non-null float64\n",
"LotArea 1460 non-null int64\n",
"Street 1460 non-null object\n",
"Alley 91 non-null object\n",
"LotShape 1460 non-null object\n",
"LandContour 1460 non-null object\n",
"Utilities 1460 non-null object\n",
"LotConfig 1460 non-null object\n",
"LandSlope 1460 non-null object\n",
"Neighborhood 1460 non-null object\n",
"Condition1 1460 non-null object\n",
"Condition2 1460 non-null object\n",
"BldgType 1460 non-null object\n",
"HouseStyle 1460 non-null object\n",
"OverallQual 1460 non-null int64\n",
"OverallCond 1460 non-null int64\n",
"YearBuilt 1460 non-null int64\n",
"YearRemodAdd 1460 non-null int64\n",
"RoofStyle 1460 non-null object\n",
"RoofMatl 1460 non-null object\n",
"Exterior1st 1460 non-null object\n",
"Exterior2nd 1460 non-null object\n",
"MasVnrType 1452 non-null object\n",
"MasVnrArea 1452 non-null float64\n",
"ExterQual 1460 non-null object\n",
"ExterCond 1460 non-null object\n",
"Foundation 1460 non-null object\n",
"BsmtQual 1423 non-null object\n",
"BsmtCond 1423 non-null object\n",
"BsmtExposure 1422 non-null object\n",
"BsmtFinType1 1423 non-null object\n",
"BsmtFinSF1 1460 non-null int64\n",
"BsmtFinType2 1422 non-null object\n",
"BsmtFinSF2 1460 non-null int64\n",
"BsmtUnfSF 1460 non-null int64\n",
"TotalBsmtSF 1460 non-null int64\n",
"Heating 1460 non-null object\n",
"HeatingQC 1460 non-null object\n",
"CentralAir 1460 non-null object\n",
"Electrical 1459 non-null object\n",
"1stFlrSF 1460 non-null int64\n",
"2ndFlrSF 1460 non-null int64\n",
"LowQualFinSF 1460 non-null int64\n",
"GrLivArea 1460 non-null int64\n",
"BsmtFullBath 1460 non-null int64\n",
"BsmtHalfBath 1460 non-null int64\n",
"FullBath 1460 non-null int64\n",
"HalfBath 1460 non-null int64\n",
"BedroomAbvGr 1460 non-null int64\n",
"KitchenAbvGr 1460 non-null int64\n",
"KitchenQual 1460 non-null object\n",
"TotRmsAbvGrd 1460 non-null int64\n",
"Functional 1460 non-null object\n",
"Fireplaces 1460 non-null int64\n",
"FireplaceQu 770 non-null object\n",
"GarageType 1379 non-null object\n",
"GarageYrBlt 1379 non-null float64\n",
"GarageFinish 1379 non-null object\n",
"GarageCars 1460 non-null int64\n",
"GarageArea 1460 non-null int64\n",
"GarageQual 1379 non-null object\n",
"GarageCond 1379 non-null object\n",
"PavedDrive 1460 non-null object\n",
"WoodDeckSF 1460 non-null int64\n",
"OpenPorchSF 1460 non-null int64\n",
"EnclosedPorch 1460 non-null int64\n",
"3SsnPorch 1460 non-null int64\n",
"ScreenPorch 1460 non-null int64\n",
"PoolArea 1460 non-null int64\n",
"PoolQC 7 non-null object\n",
"Fence 281 non-null object\n",
"MiscFeature 54 non-null object\n",
"MiscVal 1460 non-null int64\n",
"MoSold 1460 non-null int64\n",
"YrSold 1460 non-null int64\n",
"SaleType 1460 non-null object\n",
"SaleCondition 1460 non-null object\n",
"SalePrice 1460 non-null int64\n",
"dtypes: float64(3), int64(35), object(43)\n",
"memory usage: 924.0+ KB\n"
]
}
],
"source": [
"data.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>OverallQual</th>\n",
" <th>OverallCond</th>\n",
" <th>YearBuilt</th>\n",
" <th>YearRemodAdd</th>\n",
" <th>MasVnrArea</th>\n",
" <th>BsmtFinSF1</th>\n",
" <th>...</th>\n",
" <th>WoodDeckSF</th>\n",
" <th>OpenPorchSF</th>\n",
" <th>EnclosedPorch</th>\n",
" <th>3SsnPorch</th>\n",
" <th>ScreenPorch</th>\n",
" <th>PoolArea</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SalePrice</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1201.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1452.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>...</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" <td>1460.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>730.500000</td>\n",
" <td>56.897260</td>\n",
" <td>70.049958</td>\n",
" <td>10516.828082</td>\n",
" <td>6.099315</td>\n",
" <td>5.575342</td>\n",
" <td>1971.267808</td>\n",
" <td>1984.865753</td>\n",
" <td>103.685262</td>\n",
" <td>443.639726</td>\n",
" <td>...</td>\n",
" <td>94.244521</td>\n",
" <td>46.660274</td>\n",
" <td>21.954110</td>\n",
" <td>3.409589</td>\n",
" <td>15.060959</td>\n",
" <td>2.758904</td>\n",
" <td>43.489041</td>\n",
" <td>6.321918</td>\n",
" <td>2007.815753</td>\n",
" <td>180921.195890</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>421.610009</td>\n",
" <td>42.300571</td>\n",
" <td>24.284752</td>\n",
" <td>9981.264932</td>\n",
" <td>1.382997</td>\n",
" <td>1.112799</td>\n",
" <td>30.202904</td>\n",
" <td>20.645407</td>\n",
" <td>181.066207</td>\n",
" <td>456.098091</td>\n",
" <td>...</td>\n",
" <td>125.338794</td>\n",
" <td>66.256028</td>\n",
" <td>61.119149</td>\n",
" <td>29.317331</td>\n",
" <td>55.757415</td>\n",
" <td>40.177307</td>\n",
" <td>496.123024</td>\n",
" <td>2.703626</td>\n",
" <td>1.328095</td>\n",
" <td>79442.502883</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>20.000000</td>\n",
" <td>21.000000</td>\n",
" <td>1300.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1872.000000</td>\n",
" <td>1950.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>2006.000000</td>\n",
" <td>34900.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>365.750000</td>\n",
" <td>20.000000</td>\n",
" <td>59.000000</td>\n",
" <td>7553.500000</td>\n",
" <td>5.000000</td>\n",
" <td>5.000000</td>\n",
" <td>1954.000000</td>\n",
" <td>1967.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.000000</td>\n",
" <td>2007.000000</td>\n",
" <td>129975.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>730.500000</td>\n",
" <td>50.000000</td>\n",
" <td>69.000000</td>\n",
" <td>9478.500000</td>\n",
" <td>6.000000</td>\n",
" <td>5.000000</td>\n",
" <td>1973.000000</td>\n",
" <td>1994.000000</td>\n",
" <td>0.000000</td>\n",
" <td>383.500000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>25.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6.000000</td>\n",
" <td>2008.000000</td>\n",
" <td>163000.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1095.250000</td>\n",
" <td>70.000000</td>\n",
" <td>80.000000</td>\n",
" <td>11601.500000</td>\n",
" <td>7.000000</td>\n",
" <td>6.000000</td>\n",
" <td>2000.000000</td>\n",
" <td>2004.000000</td>\n",
" <td>166.000000</td>\n",
" <td>712.250000</td>\n",
" <td>...</td>\n",
" <td>168.000000</td>\n",
" <td>68.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>8.000000</td>\n",
" <td>2009.000000</td>\n",
" <td>214000.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1460.000000</td>\n",
" <td>190.000000</td>\n",
" <td>313.000000</td>\n",
" <td>215245.000000</td>\n",
" <td>10.000000</td>\n",
" <td>9.000000</td>\n",
" <td>2010.000000</td>\n",
" <td>2010.000000</td>\n",
" <td>1600.000000</td>\n",
" <td>5644.000000</td>\n",
" <td>...</td>\n",
" <td>857.000000</td>\n",
" <td>547.000000</td>\n",
" <td>552.000000</td>\n",
" <td>508.000000</td>\n",
" <td>480.000000</td>\n",
" <td>738.000000</td>\n",
" <td>15500.000000</td>\n",
" <td>12.000000</td>\n",
" <td>2010.000000</td>\n",
" <td>755000.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 38 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass LotFrontage LotArea OverallQual \\\n",
"count 1460.000000 1460.000000 1201.000000 1460.000000 1460.000000 \n",
"mean 730.500000 56.897260 70.049958 10516.828082 6.099315 \n",
"std 421.610009 42.300571 24.284752 9981.264932 1.382997 \n",
"min 1.000000 20.000000 21.000000 1300.000000 1.000000 \n",
"25% 365.750000 20.000000 59.000000 7553.500000 5.000000 \n",
"50% 730.500000 50.000000 69.000000 9478.500000 6.000000 \n",
"75% 1095.250000 70.000000 80.000000 11601.500000 7.000000 \n",
"max 1460.000000 190.000000 313.000000 215245.000000 10.000000 \n",
"\n",
" OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 \\\n",
"count 1460.000000 1460.000000 1460.000000 1452.000000 1460.000000 \n",
"mean 5.575342 1971.267808 1984.865753 103.685262 443.639726 \n",
"std 1.112799 30.202904 20.645407 181.066207 456.098091 \n",
"min 1.000000 1872.000000 1950.000000 0.000000 0.000000 \n",
"25% 5.000000 1954.000000 1967.000000 0.000000 0.000000 \n",
"50% 5.000000 1973.000000 1994.000000 0.000000 383.500000 \n",
"75% 6.000000 2000.000000 2004.000000 166.000000 712.250000 \n",
"max 9.000000 2010.000000 2010.000000 1600.000000 5644.000000 \n",
"\n",
" ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n",
"count ... 1460.000000 1460.000000 1460.000000 1460.000000 \n",
"mean ... 94.244521 46.660274 21.954110 3.409589 \n",
"std ... 125.338794 66.256028 61.119149 29.317331 \n",
"min ... 0.000000 0.000000 0.000000 0.000000 \n",
"25% ... 0.000000 0.000000 0.000000 0.000000 \n",
"50% ... 0.000000 25.000000 0.000000 0.000000 \n",
"75% ... 168.000000 68.000000 0.000000 0.000000 \n",
"max ... 857.000000 547.000000 552.000000 508.000000 \n",
"\n",
" ScreenPorch PoolArea MiscVal MoSold YrSold \\\n",
"count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 \n",
"mean 15.060959 2.758904 43.489041 6.321918 2007.815753 \n",
"std 55.757415 40.177307 496.123024 2.703626 1.328095 \n",
"min 0.000000 0.000000 0.000000 1.000000 2006.000000 \n",
"25% 0.000000 0.000000 0.000000 5.000000 2007.000000 \n",
"50% 0.000000 0.000000 0.000000 6.000000 2008.000000 \n",
"75% 0.000000 0.000000 0.000000 8.000000 2009.000000 \n",
"max 480.000000 738.000000 15500.000000 12.000000 2010.000000 \n",
"\n",
" SalePrice \n",
"count 1460.000000 \n",
"mean 180921.195890 \n",
"std 79442.502883 \n",
"min 34900.000000 \n",
"25% 129975.000000 \n",
"50% 163000.000000 \n",
"75% 214000.000000 \n",
"max 755000.000000 \n",
"\n",
"[8 rows x 38 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Dostęp do danych"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index([u'Id', u'MSSubClass', u'MSZoning', u'LotFrontage', u'LotArea',\n",
" u'Street', u'Alley', u'LotShape', u'LandContour', u'Utilities',\n",
" u'LotConfig', u'LandSlope', u'Neighborhood', u'Condition1',\n",
" u'Condition2', u'BldgType', u'HouseStyle', u'OverallQual',\n",
" u'OverallCond', u'YearBuilt', u'YearRemodAdd', u'RoofStyle',\n",
" u'RoofMatl', u'Exterior1st', u'Exterior2nd', u'MasVnrType',\n",
" u'MasVnrArea', u'ExterQual', u'ExterCond', u'Foundation', u'BsmtQual',\n",
" u'BsmtCond', u'BsmtExposure', u'BsmtFinType1', u'BsmtFinSF1',\n",
" u'BsmtFinType2', u'BsmtFinSF2', u'BsmtUnfSF', u'TotalBsmtSF',\n",
" u'Heating', u'HeatingQC', u'CentralAir', u'Electrical', u'1stFlrSF',\n",
" u'2ndFlrSF', u'LowQualFinSF', u'GrLivArea', u'BsmtFullBath',\n",
" u'BsmtHalfBath', u'FullBath', u'HalfBath', u'BedroomAbvGr',\n",
" u'KitchenAbvGr', u'KitchenQual', u'TotRmsAbvGrd', u'Functional',\n",
" u'Fireplaces', u'FireplaceQu', u'GarageType', u'GarageYrBlt',\n",
" u'GarageFinish', u'GarageCars', u'GarageArea', u'GarageQual',\n",
" u'GarageCond', u'PavedDrive', u'WoodDeckSF', u'OpenPorchSF',\n",
" u'EnclosedPorch', u'3SsnPorch', u'ScreenPorch', u'PoolArea', u'PoolQC',\n",
" u'Fence', u'MiscFeature', u'MiscVal', u'MoSold', u'YrSold', u'SaleType',\n",
" u'SaleCondition', u'SalePrice'],\n",
" dtype='object')\n"
]
}
],
"source": [
"print(data.columns)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 60\n",
"1 20\n",
"2 60\n",
"3 70\n",
"4 60\n",
"Name: MSSubClass, dtype: int64\n"
]
}
],
"source": [
"print(data['MSSubClass'].head())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" MSSubClass SalePrice\n",
"0 60 208500\n",
"1 20 181500\n",
"2 60 223500\n",
"3 70 140000\n",
"4 60 250000\n"
]
}
],
"source": [
"print(data[['MSSubClass', 'SalePrice']].head())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>Street</th>\n",
" <th>Alley</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>...</th>\n",
" <th>PoolArea</th>\n",
" <th>PoolQC</th>\n",
" <th>Fence</th>\n",
" <th>MiscFeature</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" <th>SalePrice</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>65.0</td>\n",
" <td>8450</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>208500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>RL</td>\n",
" <td>60.0</td>\n",
" <td>9550</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Abnorml</td>\n",
" <td>140000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 81 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
"0 1 60 RL 65.0 8450 Pave NaN Reg \n",
"3 4 70 RL 60.0 9550 Pave NaN IR1 \n",
"\n",
" LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal \\\n",
"0 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"3 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"\n",
" MoSold YrSold SaleType SaleCondition SalePrice \n",
"0 2 2008 WD Normal 208500 \n",
"3 2 2006 WD Abnorml 140000 \n",
"\n",
"[2 rows x 81 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.loc[[0,3]]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>Street</th>\n",
" <th>Alley</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>...</th>\n",
" <th>PoolArea</th>\n",
" <th>PoolQC</th>\n",
" <th>Fence</th>\n",
" <th>MiscFeature</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" <th>SalePrice</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>65.0</td>\n",
" <td>8450</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>208500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>80.0</td>\n",
" <td>9600</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2007</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>181500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>68.0</td>\n",
" <td>11250</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>223500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>RL</td>\n",
" <td>60.0</td>\n",
" <td>9550</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Abnorml</td>\n",
" <td>140000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>84.0</td>\n",
" <td>14260</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>250000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>50</td>\n",
" <td>RL</td>\n",
" <td>85.0</td>\n",
" <td>14115</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>MnPrv</td>\n",
" <td>Shed</td>\n",
" <td>700</td>\n",
" <td>10</td>\n",
" <td>2009</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>143000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6 rows × 81 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
"0 1 60 RL 65.0 8450 Pave NaN Reg \n",
"1 2 20 RL 80.0 9600 Pave NaN Reg \n",
"2 3 60 RL 68.0 11250 Pave NaN IR1 \n",
"3 4 70 RL 60.0 9550 Pave NaN IR1 \n",
"4 5 60 RL 84.0 14260 Pave NaN IR1 \n",
"5 6 50 RL 85.0 14115 Pave NaN IR1 \n",
"\n",
" LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal \\\n",
"0 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"1 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"2 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"3 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"4 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"5 Lvl AllPub ... 0 NaN MnPrv Shed 700 \n",
"\n",
" MoSold YrSold SaleType SaleCondition SalePrice \n",
"0 2 2008 WD Normal 208500 \n",
"1 5 2007 WD Normal 181500 \n",
"2 9 2008 WD Normal 223500 \n",
"3 2 2006 WD Abnorml 140000 \n",
"4 12 2008 WD Normal 250000 \n",
"5 10 2009 WD Normal 143000 \n",
"\n",
"[6 rows x 81 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.loc[0:5]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>Street</th>\n",
" <th>Alley</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>...</th>\n",
" <th>PoolArea</th>\n",
" <th>PoolQC</th>\n",
" <th>Fence</th>\n",
" <th>MiscFeature</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" <th>SalePrice</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>65.0</td>\n",
" <td>8450</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>208500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>80.0</td>\n",
" <td>9600</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2007</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>181500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>68.0</td>\n",
" <td>11250</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>223500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>RL</td>\n",
" <td>60.0</td>\n",
" <td>9550</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Abnorml</td>\n",
" <td>140000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>84.0</td>\n",
" <td>14260</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>250000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 81 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
"0 1 60 RL 65.0 8450 Pave NaN Reg \n",
"1 2 20 RL 80.0 9600 Pave NaN Reg \n",
"2 3 60 RL 68.0 11250 Pave NaN IR1 \n",
"3 4 70 RL 60.0 9550 Pave NaN IR1 \n",
"4 5 60 RL 84.0 14260 Pave NaN IR1 \n",
"\n",
" LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal \\\n",
"0 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"1 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"2 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"3 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"4 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"\n",
" MoSold YrSold SaleType SaleCondition SalePrice \n",
"0 2 2008 WD Normal 208500 \n",
"1 5 2007 WD Normal 181500 \n",
"2 9 2008 WD Normal 223500 \n",
"3 2 2006 WD Abnorml 140000 \n",
"4 12 2008 WD Normal 250000 \n",
"\n",
"[5 rows x 81 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[data['MSZoning'] == 'RL'].head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>Street</th>\n",
" <th>Alley</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>...</th>\n",
" <th>PoolArea</th>\n",
" <th>PoolQC</th>\n",
" <th>Fence</th>\n",
" <th>MiscFeature</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" <th>SalePrice</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>65.0</td>\n",
" <td>8450</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>208500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>80.0</td>\n",
" <td>9600</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2007</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>181500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>75.0</td>\n",
" <td>10084</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>2007</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>307000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>190</td>\n",
" <td>RL</td>\n",
" <td>50.0</td>\n",
" <td>7420</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>118000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>11</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>70.0</td>\n",
" <td>11200</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>129500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 81 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
"0 1 60 RL 65.0 8450 Pave NaN Reg \n",
"1 2 20 RL 80.0 9600 Pave NaN Reg \n",
"6 7 20 RL 75.0 10084 Pave NaN Reg \n",
"9 10 190 RL 50.0 7420 Pave NaN Reg \n",
"10 11 20 RL 70.0 11200 Pave NaN Reg \n",
"\n",
" LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal \\\n",
"0 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"1 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"6 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"9 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"10 Lvl AllPub ... 0 NaN NaN NaN 0 \n",
"\n",
" MoSold YrSold SaleType SaleCondition SalePrice \n",
"0 2 2008 WD Normal 208500 \n",
"1 5 2007 WD Normal 181500 \n",
"6 8 2007 WD Normal 307000 \n",
"9 1 2008 WD Normal 118000 \n",
"10 2 2008 WD Normal 129500 \n",
"\n",
"[5 rows x 81 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[(data['MSZoning'] == 'RL') & (data['LotShape'] == 'Reg')].head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"ceny = data['SalePrice']"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"180921.19589041095"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ceny.mean()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"755000"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ceny.max()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'SalePrice'"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ceny.name"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('Plus vat:', 0 256455.00\n",
"1 223245.00\n",
"2 274905.00\n",
"3 172200.00\n",
"4 307500.00\n",
"5 175890.00\n",
"6 377610.00\n",
"7 246000.00\n",
"8 159777.00\n",
"9 145140.00\n",
"10 159285.00\n",
"11 424350.00\n",
"12 177120.00\n",
"13 343785.00\n",
"14 193110.00\n",
"15 162360.00\n",
"16 183270.00\n",
"17 110700.00\n",
"18 195570.00\n",
"19 170970.00\n",
"20 400119.00\n",
"21 171462.00\n",
"22 282900.00\n",
"23 159777.00\n",
"24 189420.00\n",
"25 315249.00\n",
"26 165804.00\n",
"27 376380.00\n",
"28 255225.00\n",
"29 84255.00\n",
" ... \n",
"1430 236332.20\n",
"1431 176812.50\n",
"1432 79335.00\n",
"1433 229395.00\n",
"1434 196800.00\n",
"1435 214020.00\n",
"1436 148215.00\n",
"1437 485378.91\n",
"1438 184131.00\n",
"1439 242310.00\n",
"1440 234930.00\n",
"1441 183639.00\n",
"1442 381300.00\n",
"1443 148830.00\n",
"1444 220908.00\n",
"1445 158670.00\n",
"1446 194217.00\n",
"1447 295200.00\n",
"1448 137760.00\n",
"1449 113160.00\n",
"1450 167280.00\n",
"1451 353120.70\n",
"1452 178350.00\n",
"1453 103935.00\n",
"1454 227550.00\n",
"1455 215250.00\n",
"1456 258300.00\n",
"1457 327795.00\n",
"1458 174813.75\n",
"1459 181425.00\n",
"Name: SalePrice, Length: 1460, dtype: float64)\n"
]
}
],
"source": [
"print(\"Plus vat:\", ceny * 1.23)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['RL', 'RM', 'C (all)', 'FV', 'RH'], dtype=object)"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.MSZoning.unique()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RL 1151\n",
"RM 218\n",
"FV 65\n",
"RH 16\n",
"C (all) 10\n",
"Name: MSZoning, dtype: int64"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.MSZoning.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"data['nowa'] = ceny * 1.23"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>Street</th>\n",
" <th>Alley</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>LotConfig</th>\n",
" <th>LandSlope</th>\n",
" <th>...</th>\n",
" <th>PoolQC</th>\n",
" <th>Fence</th>\n",
" <th>MiscFeature</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" <th>SalePrice</th>\n",
" <th>nowa</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>65.0</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>Gtl</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>208500</td>\n",
" <td>256455.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>80.0</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>FR2</td>\n",
" <td>Gtl</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2007</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>181500</td>\n",
" <td>223245.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>68.0</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>Gtl</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>223500</td>\n",
" <td>274905.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>70</td>\n",
" <td>RL</td>\n",
" <td>60.0</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Corner</td>\n",
" <td>Gtl</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Abnorml</td>\n",
" <td>140000</td>\n",
" <td>172200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>84.0</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>FR2</td>\n",
" <td>Gtl</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>250000</td>\n",
" <td>307500.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 80 columns</p>\n",
"</div>"
],
"text/plain": [
" MSSubClass MSZoning LotFrontage Street Alley LotShape LandContour \\\n",
"0 60 RL 65.0 Pave NaN Reg Lvl \n",
"1 20 RL 80.0 Pave NaN Reg Lvl \n",
"2 60 RL 68.0 Pave NaN IR1 Lvl \n",
"3 70 RL 60.0 Pave NaN IR1 Lvl \n",
"4 60 RL 84.0 Pave NaN IR1 Lvl \n",
"\n",
" Utilities LotConfig LandSlope ... PoolQC Fence MiscFeature MiscVal \\\n",
"0 AllPub Inside Gtl ... NaN NaN NaN 0 \n",
"1 AllPub FR2 Gtl ... NaN NaN NaN 0 \n",
"2 AllPub Inside Gtl ... NaN NaN NaN 0 \n",
"3 AllPub Corner Gtl ... NaN NaN NaN 0 \n",
"4 AllPub FR2 Gtl ... NaN NaN NaN 0 \n",
"\n",
" MoSold YrSold SaleType SaleCondition SalePrice nowa \n",
"0 2 2008 WD Normal 208500 256455.0 \n",
"1 5 2007 WD Normal 181500 223245.0 \n",
"2 9 2008 WD Normal 223500 274905.0 \n",
"3 2 2006 WD Abnorml 140000 172200.0 \n",
"4 12 2008 WD Normal 250000 307500.0 \n",
"\n",
"[5 rows x 80 columns]"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.drop('LotArea', axis=1)\n",
"data.drop(['Id', 'LotArea'], axis=1).head()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>Street</th>\n",
" <th>Alley</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>...</th>\n",
" <th>PoolQC</th>\n",
" <th>Fence</th>\n",
" <th>MiscFeature</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" <th>SalePrice</th>\n",
" <th>nowa</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>80.0</td>\n",
" <td>9600</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2007</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>181500</td>\n",
" <td>223245.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>68.0</td>\n",
" <td>11250</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>223500</td>\n",
" <td>274905.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>RL</td>\n",
" <td>60.0</td>\n",
" <td>9550</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Abnorml</td>\n",
" <td>140000</td>\n",
" <td>172200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>84.0</td>\n",
" <td>14260</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>250000</td>\n",
" <td>307500.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>50</td>\n",
" <td>RL</td>\n",
" <td>85.0</td>\n",
" <td>14115</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>MnPrv</td>\n",
" <td>Shed</td>\n",
" <td>700</td>\n",
" <td>10</td>\n",
" <td>2009</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>143000</td>\n",
" <td>175890.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 82 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
"1 2 20 RL 80.0 9600 Pave NaN Reg \n",
"2 3 60 RL 68.0 11250 Pave NaN IR1 \n",
"3 4 70 RL 60.0 9550 Pave NaN IR1 \n",
"4 5 60 RL 84.0 14260 Pave NaN IR1 \n",
"5 6 50 RL 85.0 14115 Pave NaN IR1 \n",
"\n",
" LandContour Utilities ... PoolQC Fence MiscFeature MiscVal MoSold \\\n",
"1 Lvl AllPub ... NaN NaN NaN 0 5 \n",
"2 Lvl AllPub ... NaN NaN NaN 0 9 \n",
"3 Lvl AllPub ... NaN NaN NaN 0 2 \n",
"4 Lvl AllPub ... NaN NaN NaN 0 12 \n",
"5 Lvl AllPub ... NaN MnPrv Shed 700 10 \n",
"\n",
" YrSold SaleType SaleCondition SalePrice nowa \n",
"1 2007 WD Normal 181500 223245.0 \n",
"2 2008 WD Normal 223500 274905.0 \n",
"3 2006 WD Abnorml 140000 172200.0 \n",
"4 2008 WD Normal 250000 307500.0 \n",
"5 2009 WD Normal 143000 175890.0 \n",
"\n",
"[5 rows x 82 columns]"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.drop(0).head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"celltoolbar": "Slideshow",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}