uczenie_maszynowe_zadania/cw_7/.ipynb_checkpoints/main-checkpoint.ipynb

704 lines
22 KiB
Plaintext
Raw Permalink Normal View History

2023-07-04 20:42:14 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 39,
"id": "ffd08cc9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state</th>\n",
" <th>county</th>\n",
" <th>community</th>\n",
" <th>communityname</th>\n",
" <th>fold</th>\n",
" <th>population</th>\n",
" <th>householdsize</th>\n",
" <th>racepctblack</th>\n",
" <th>racePctWhite</th>\n",
" <th>racePctAsian</th>\n",
" <th>...</th>\n",
" <th>LandArea</th>\n",
" <th>PopDens</th>\n",
" <th>PctUsePubTrans</th>\n",
" <th>PolicCars</th>\n",
" <th>PolicOperBudg</th>\n",
" <th>LemasPctPolicOnPatr</th>\n",
" <th>LemasGangUnitDeploy</th>\n",
" <th>LemasPctOfficDrugUn</th>\n",
" <th>PolicBudgPerPop</th>\n",
" <th>ViolentCrimesPerPop</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>36</td>\n",
" <td>1</td>\n",
" <td>1000</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.15</td>\n",
" <td>0.31</td>\n",
" <td>0.40</td>\n",
" <td>0.63</td>\n",
" <td>0.14</td>\n",
" <td>...</td>\n",
" <td>0.06</td>\n",
" <td>0.39</td>\n",
" <td>0.84</td>\n",
" <td>0.06</td>\n",
" <td>0.06</td>\n",
" <td>0.91</td>\n",
" <td>0.5</td>\n",
" <td>0.88</td>\n",
" <td>0.26</td>\n",
" <td>0.49</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>19</td>\n",
" <td>193</td>\n",
" <td>93926</td>\n",
" <td>94</td>\n",
" <td>1</td>\n",
" <td>0.11</td>\n",
" <td>0.43</td>\n",
" <td>0.04</td>\n",
" <td>0.89</td>\n",
" <td>0.09</td>\n",
" <td>...</td>\n",
" <td>0.16</td>\n",
" <td>0.12</td>\n",
" <td>0.07</td>\n",
" <td>0.04</td>\n",
" <td>0.01</td>\n",
" <td>0.81</td>\n",
" <td>1</td>\n",
" <td>0.56</td>\n",
" <td>0.09</td>\n",
" <td>0.63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>51</td>\n",
" <td>680</td>\n",
" <td>47672</td>\n",
" <td>52</td>\n",
" <td>1</td>\n",
" <td>0.09</td>\n",
" <td>0.43</td>\n",
" <td>0.51</td>\n",
" <td>0.58</td>\n",
" <td>0.04</td>\n",
" <td>...</td>\n",
" <td>0.14</td>\n",
" <td>0.11</td>\n",
" <td>0.19</td>\n",
" <td>0.05</td>\n",
" <td>0.01</td>\n",
" <td>0.75</td>\n",
" <td>0</td>\n",
" <td>0.60</td>\n",
" <td>0.1</td>\n",
" <td>0.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68</th>\n",
" <td>34</td>\n",
" <td>23</td>\n",
" <td>58200</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>0.05</td>\n",
" <td>0.59</td>\n",
" <td>0.23</td>\n",
" <td>0.39</td>\n",
" <td>0.09</td>\n",
" <td>...</td>\n",
" <td>0.01</td>\n",
" <td>0.73</td>\n",
" <td>0.28</td>\n",
" <td>0</td>\n",
" <td>0.02</td>\n",
" <td>0.64</td>\n",
" <td>0</td>\n",
" <td>1.00</td>\n",
" <td>0.23</td>\n",
" <td>0.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>9</td>\n",
" <td>9</td>\n",
" <td>46520</td>\n",
" <td>58</td>\n",
" <td>1</td>\n",
" <td>0.08</td>\n",
" <td>0.39</td>\n",
" <td>0.08</td>\n",
" <td>0.85</td>\n",
" <td>0.04</td>\n",
" <td>...</td>\n",
" <td>0.07</td>\n",
" <td>0.21</td>\n",
" <td>0.04</td>\n",
" <td>0.02</td>\n",
" <td>0.01</td>\n",
" <td>0.7</td>\n",
" <td>1</td>\n",
" <td>0.44</td>\n",
" <td>0.11</td>\n",
" <td>0.14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1880</th>\n",
" <td>34</td>\n",
" <td>39</td>\n",
" <td>40350</td>\n",
" <td>50</td>\n",
" <td>10</td>\n",
" <td>0.04</td>\n",
" <td>0.39</td>\n",
" <td>0.39</td>\n",
" <td>0.65</td>\n",
" <td>0.09</td>\n",
" <td>...</td>\n",
" <td>0.03</td>\n",
" <td>0.28</td>\n",
" <td>0.32</td>\n",
" <td>0.02</td>\n",
" <td>0.01</td>\n",
" <td>0.85</td>\n",
" <td>0</td>\n",
" <td>0.99</td>\n",
" <td>0.19</td>\n",
" <td>0.22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1963</th>\n",
" <td>36</td>\n",
" <td>27</td>\n",
" <td>59641</td>\n",
" <td>85</td>\n",
" <td>10</td>\n",
" <td>0.03</td>\n",
" <td>0.32</td>\n",
" <td>0.61</td>\n",
" <td>0.47</td>\n",
" <td>0.09</td>\n",
" <td>...</td>\n",
" <td>0.01</td>\n",
" <td>0.47</td>\n",
" <td>0.42</td>\n",
" <td>0.07</td>\n",
" <td>0.08</td>\n",
" <td>0.49</td>\n",
" <td>0</td>\n",
" <td>0.37</td>\n",
" <td>1</td>\n",
" <td>0.45</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1981</th>\n",
" <td>9</td>\n",
" <td>9</td>\n",
" <td>35650</td>\n",
" <td>36</td>\n",
" <td>10</td>\n",
" <td>0.07</td>\n",
" <td>0.38</td>\n",
" <td>0.17</td>\n",
" <td>0.84</td>\n",
" <td>0.11</td>\n",
" <td>...</td>\n",
" <td>0.09</td>\n",
" <td>0.13</td>\n",
" <td>0.17</td>\n",
" <td>0.02</td>\n",
" <td>0.01</td>\n",
" <td>0.72</td>\n",
" <td>0</td>\n",
" <td>0.62</td>\n",
" <td>0.15</td>\n",
" <td>0.07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1991</th>\n",
" <td>9</td>\n",
" <td>9</td>\n",
" <td>80070</td>\n",
" <td>110</td>\n",
" <td>10</td>\n",
" <td>0.16</td>\n",
" <td>0.37</td>\n",
" <td>0.25</td>\n",
" <td>0.69</td>\n",
" <td>0.04</td>\n",
" <td>...</td>\n",
" <td>0.08</td>\n",
" <td>0.32</td>\n",
" <td>0.18</td>\n",
" <td>0.08</td>\n",
" <td>0.06</td>\n",
" <td>0.78</td>\n",
" <td>0</td>\n",
" <td>0.91</td>\n",
" <td>0.28</td>\n",
" <td>0.23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1992</th>\n",
" <td>25</td>\n",
" <td>17</td>\n",
" <td>72600</td>\n",
" <td>107</td>\n",
" <td>10</td>\n",
" <td>0.08</td>\n",
" <td>0.51</td>\n",
" <td>0.06</td>\n",
" <td>0.87</td>\n",
" <td>0.22</td>\n",
" <td>...</td>\n",
" <td>0.03</td>\n",
" <td>0.38</td>\n",
" <td>0.33</td>\n",
" <td>0.02</td>\n",
" <td>0.02</td>\n",
" <td>0.79</td>\n",
" <td>0</td>\n",
" <td>0.22</td>\n",
" <td>0.18</td>\n",
" <td>0.19</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>123 rows × 128 columns</p>\n",
"</div>"
],
"text/plain": [
" state county community communityname fold population householdsize \\\n",
"16 36 1 1000 0 1 0.15 0.31 \n",
"23 19 193 93926 94 1 0.11 0.43 \n",
"33 51 680 47672 52 1 0.09 0.43 \n",
"68 34 23 58200 79 1 0.05 0.59 \n",
"74 9 9 46520 58 1 0.08 0.39 \n",
"... ... ... ... ... ... ... ... \n",
"1880 34 39 40350 50 10 0.04 0.39 \n",
"1963 36 27 59641 85 10 0.03 0.32 \n",
"1981 9 9 35650 36 10 0.07 0.38 \n",
"1991 9 9 80070 110 10 0.16 0.37 \n",
"1992 25 17 72600 107 10 0.08 0.51 \n",
"\n",
" racepctblack racePctWhite racePctAsian ... LandArea PopDens \\\n",
"16 0.40 0.63 0.14 ... 0.06 0.39 \n",
"23 0.04 0.89 0.09 ... 0.16 0.12 \n",
"33 0.51 0.58 0.04 ... 0.14 0.11 \n",
"68 0.23 0.39 0.09 ... 0.01 0.73 \n",
"74 0.08 0.85 0.04 ... 0.07 0.21 \n",
"... ... ... ... ... ... ... \n",
"1880 0.39 0.65 0.09 ... 0.03 0.28 \n",
"1963 0.61 0.47 0.09 ... 0.01 0.47 \n",
"1981 0.17 0.84 0.11 ... 0.09 0.13 \n",
"1991 0.25 0.69 0.04 ... 0.08 0.32 \n",
"1992 0.06 0.87 0.22 ... 0.03 0.38 \n",
"\n",
" PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr \\\n",
"16 0.84 0.06 0.06 0.91 \n",
"23 0.07 0.04 0.01 0.81 \n",
"33 0.19 0.05 0.01 0.75 \n",
"68 0.28 0 0.02 0.64 \n",
"74 0.04 0.02 0.01 0.7 \n",
"... ... ... ... ... \n",
"1880 0.32 0.02 0.01 0.85 \n",
"1963 0.42 0.07 0.08 0.49 \n",
"1981 0.17 0.02 0.01 0.72 \n",
"1991 0.18 0.08 0.06 0.78 \n",
"1992 0.33 0.02 0.02 0.79 \n",
"\n",
" LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop \\\n",
"16 0.5 0.88 0.26 \n",
"23 1 0.56 0.09 \n",
"33 0 0.60 0.1 \n",
"68 0 1.00 0.23 \n",
"74 1 0.44 0.11 \n",
"... ... ... ... \n",
"1880 0 0.99 0.19 \n",
"1963 0 0.37 1 \n",
"1981 0 0.62 0.15 \n",
"1991 0 0.91 0.28 \n",
"1992 0 0.22 0.18 \n",
"\n",
" ViolentCrimesPerPop \n",
"16 0.49 \n",
"23 0.63 \n",
"33 0.31 \n",
"68 0.50 \n",
"74 0.14 \n",
"... ... \n",
"1880 0.22 \n",
"1963 0.45 \n",
"1981 0.07 \n",
"1991 0.23 \n",
"1992 0.19 \n",
"\n",
"[123 rows x 128 columns]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import sklearn\n",
"from sklearn.preprocessing import PolynomialFeatures, LabelEncoder\n",
"from sklearn.linear_model import LinearRegression, Ridge, RidgeCV\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"col_names = [\n",
"\"state\",\n",
"\"county\",\n",
"\"community\",\n",
"\"communityname\",\n",
"\"fold\",\n",
"\"population\",\n",
"\"householdsize\",\n",
"\"racepctblack\",\n",
"\"racePctWhite\",\n",
"\"racePctAsian\",\n",
"\"racePctHisp\",\n",
"\"agePct12t21\",\n",
"\"agePct12t29\",\n",
"\"agePct16t24\",\n",
"\"agePct65up\",\n",
"\"numbUrban\",\n",
"\"pctUrban\",\n",
"\"medIncome\",\n",
"\"pctWWage\",\n",
"\"pctWFarmSelf\",\n",
"\"pctWInvInc\",\n",
"\"pctWSocSec\",\n",
"\"pctWPubAsst\",\n",
"\"pctWRetire\",\n",
"\"medFamInc\",\n",
"\"perCapInc\",\n",
"\"whitePerCap\",\n",
"\"blackPerCap\",\n",
"\"indianPerCap\",\n",
"\"AsianPerCap\",\n",
"\"OtherPerCap\",\n",
"\"HispPerCap\",\n",
"\"NumUnderPov\",\n",
"\"PctPopUnderPov\",\n",
"\"PctLess9thGrade\",\n",
"\"PctNotHSGrad\",\n",
"\"PctBSorMore\",\n",
"\"PctUnemployed\",\n",
"\"PctEmploy\",\n",
"\"PctEmplManu\",\n",
"\"PctEmplProfServ\",\n",
"\"PctOccupManu\",\n",
"\"PctOccupMgmtProf\",\n",
"\"MalePctDivorce\",\n",
"\"MalePctNevMarr\",\n",
"\"FemalePctDiv\",\n",
"\"TotalPctDiv\",\n",
"\"PersPerFam\",\n",
"\"PctFam2Par\",\n",
"\"PctKids2Par\",\n",
"\"PctYoungKids2Par\",\n",
"\"PctTeen2Par\",\n",
"\"PctWorkMomYoungKids\",\n",
"\"PctWorkMom\",\n",
"\"NumIlleg\",\n",
"\"PctIlleg\",\n",
"\"NumImmig\",\n",
"\"PctImmigRecent\",\n",
"\"PctImmigRec5\",\n",
"\"PctImmigRec8\",\n",
"\"PctImmigRec10\",\n",
"\"PctRecentImmig\",\n",
"\"PctRecImmig5\",\n",
"\"PctRecImmig8\",\n",
"\"PctRecImmig10\",\n",
"\"PctSpeakEnglOnly\",\n",
"\"PctNotSpeakEnglWell\",\n",
"\"PctLargHouseFam\",\n",
"\"PctLargHouseOccup\",\n",
"\"PersPerOccupHous\",\n",
"\"PersPerOwnOccHous\",\n",
"\"PersPerRentOccHous\",\n",
"\"PctPersOwnOccup\",\n",
"\"PctPersDenseHous\",\n",
"\"PctHousLess3BR\",\n",
"\"MedNumBR\",\n",
"\"HousVacant\",\n",
"\"PctHousOccup\",\n",
"\"PctHousOwnOcc\",\n",
"\"PctVacantBoarded\",\n",
"\"PctVacMore6Mos\",\n",
"\"MedYrHousBuilt\",\n",
"\"PctHousNoPhone\",\n",
"\"PctWOFullPlumb\",\n",
"\"OwnOccLowQuart\",\n",
"\"OwnOccMedVal\",\n",
"\"OwnOccHiQuart\",\n",
"\"RentLowQ\",\n",
"\"RentMedian\",\n",
"\"RentHighQ\",\n",
"\"MedRent\",\n",
"\"MedRentPctHousInc\",\n",
"\"MedOwnCostPctInc\",\n",
"\"MedOwnCostPctIncNoMtg\",\n",
"\"NumInShelters\",\n",
"\"NumStreet\",\n",
"\"PctForeignBorn\",\n",
"\"PctBornSameState\",\n",
"\"PctSameHouse85\",\n",
"\"PctSameCity85\",\n",
"\"PctSameState85\",\n",
"\"LemasSwornFT\",\n",
"\"LemasSwFTPerPop\",\n",
"\"LemasSwFTFieldOps\",\n",
"\"LemasSwFTFieldPerPop\",\n",
"\"LemasTotalReq\",\n",
"\"LemasTotReqPerPop\",\n",
"\"PolicReqPerOffic\",\n",
"\"PolicPerPop\",\n",
"\"RacialMatchCommPol\",\n",
"\"PctPolicWhite\",\n",
"\"PctPolicBlack\",\n",
"\"PctPolicHisp\",\n",
"\"PctPolicAsian\",\n",
"\"PctPolicMinor\",\n",
"\"OfficAssgnDrugUnits\",\n",
"\"NumKindsDrugsSeiz\",\n",
"\"PolicAveOTWorked\",\n",
"\"LandArea\",\n",
"\"PopDens\",\n",
"\"PctUsePubTrans\",\n",
"\"PolicCars\",\n",
"\"PolicOperBudg\",\n",
"\"LemasPctPolicOnPatr\",\n",
"\"LemasGangUnitDeploy\",\n",
"\"LemasPctOfficDrugUn\",\n",
"\"PolicBudgPerPop\",\n",
"\"ViolentCrimesPerPop\"]\n",
"\n",
"df = pd.read_csv('communities.data',names=col_names)\n",
"df = df.replace('?',None)\n",
"df = df.dropna(axis='rows')\n",
"\n",
"le = LabelEncoder()\n",
"le.fit(df['communityname'].unique())\n",
"df['communityname'] = le.transform(df['communityname'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "9c0f6f6d",
"metadata": {},
"outputs": [],
"source": [
"X = df.loc[:,df.columns != 'ViolentCrimesPerPop']\n",
"y = df['ViolentCrimesPerPop']"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "2bdc2e77",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(poly_features, y, test_size=0.3)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "2bbd2ddb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/tonywesoly/.local/lib/python3.8/site-packages/sklearn/linear_model/_ridge.py:251: UserWarning: Singular matrix in solving dual problem. Using least-squares solution instead.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"1.1533542718655332"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ridgecv = RidgeCV(alphas=np.arange(1, 100, 5), scoring='r2', cv=10)\n",
"ridgecv.fit(X, y)\n",
"ridge = Ridge(alpha=ridgecv.alpha_)\n",
"ridge.fit(X_train, y_train)\n",
"ridge_y_predicted = ridge.predict(X_test)\n",
"ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_y_predicted))\n",
"ridge_rmse"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "dbfe728b",
"metadata": {},
"outputs": [],
"source": [
"#poly = PolynomialFeatures(degree=11, include_bias=False)\n",
"#poly_features = poly.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "3be15622",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.6511181528162753"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"poly_reg_model = LinearRegression()\n",
"poly_reg_model.fit(X_train,y_train)\n",
"poly_reg_y_predicted = poly_reg_model.predict(X_test)\n",
"poly_reg_rmse = np.sqrt(mean_squared_error(y_test, poly_reg_y_predicted))\n",
"poly_reg_rmse"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "4ab0949a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.3085495600528652"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
"lin_reg_model = LinearRegression()\n",
"lin_reg_model.fit(X_train, y_train)\n",
"lin_reg_y_predicted = lin_reg_model.predict(X_test)\n",
"lin_reg_rmse = np.sqrt(mean_squared_error(y_test, lin_reg_y_predicted))\n",
"lin_reg_rmse"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99365180",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e0b2f8e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}