{ "cells": [ { "cell_type": "code", "execution_count": 39, "id": "ffd08cc9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statecountycommunitycommunitynamefoldpopulationhouseholdsizeracepctblackracePctWhiteracePctAsian...LandAreaPopDensPctUsePubTransPolicCarsPolicOperBudgLemasPctPolicOnPatrLemasGangUnitDeployLemasPctOfficDrugUnPolicBudgPerPopViolentCrimesPerPop
163611000010.150.310.400.630.14...0.060.390.840.060.060.910.50.880.260.49
2319193939269410.110.430.040.890.09...0.160.120.070.040.010.8110.560.090.63
3351680476725210.090.430.510.580.04...0.140.110.190.050.010.7500.600.10.31
683423582007910.050.590.230.390.09...0.010.730.2800.020.6401.000.230.50
7499465205810.080.390.080.850.04...0.070.210.040.020.010.710.440.110.14
..................................................................
188034394035050100.040.390.390.650.09...0.030.280.320.020.010.8500.990.190.22
196336275964185100.030.320.610.470.09...0.010.470.420.070.080.4900.3710.45
1981993565036100.070.380.170.840.11...0.090.130.170.020.010.7200.620.150.07
19919980070110100.160.370.250.690.04...0.080.320.180.080.060.7800.910.280.23
1992251772600107100.080.510.060.870.22...0.030.380.330.020.020.7900.220.180.19
\n", "

123 rows × 128 columns

\n", "
" ], "text/plain": [ " state county community communityname fold population householdsize \\\n", "16 36 1 1000 0 1 0.15 0.31 \n", "23 19 193 93926 94 1 0.11 0.43 \n", "33 51 680 47672 52 1 0.09 0.43 \n", "68 34 23 58200 79 1 0.05 0.59 \n", "74 9 9 46520 58 1 0.08 0.39 \n", "... ... ... ... ... ... ... ... \n", "1880 34 39 40350 50 10 0.04 0.39 \n", "1963 36 27 59641 85 10 0.03 0.32 \n", "1981 9 9 35650 36 10 0.07 0.38 \n", "1991 9 9 80070 110 10 0.16 0.37 \n", "1992 25 17 72600 107 10 0.08 0.51 \n", "\n", " racepctblack racePctWhite racePctAsian ... LandArea PopDens \\\n", "16 0.40 0.63 0.14 ... 0.06 0.39 \n", "23 0.04 0.89 0.09 ... 0.16 0.12 \n", "33 0.51 0.58 0.04 ... 0.14 0.11 \n", "68 0.23 0.39 0.09 ... 0.01 0.73 \n", "74 0.08 0.85 0.04 ... 0.07 0.21 \n", "... ... ... ... ... ... ... \n", "1880 0.39 0.65 0.09 ... 0.03 0.28 \n", "1963 0.61 0.47 0.09 ... 0.01 0.47 \n", "1981 0.17 0.84 0.11 ... 0.09 0.13 \n", "1991 0.25 0.69 0.04 ... 0.08 0.32 \n", "1992 0.06 0.87 0.22 ... 0.03 0.38 \n", "\n", " PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr \\\n", "16 0.84 0.06 0.06 0.91 \n", "23 0.07 0.04 0.01 0.81 \n", "33 0.19 0.05 0.01 0.75 \n", "68 0.28 0 0.02 0.64 \n", "74 0.04 0.02 0.01 0.7 \n", "... ... ... ... ... \n", "1880 0.32 0.02 0.01 0.85 \n", "1963 0.42 0.07 0.08 0.49 \n", "1981 0.17 0.02 0.01 0.72 \n", "1991 0.18 0.08 0.06 0.78 \n", "1992 0.33 0.02 0.02 0.79 \n", "\n", " LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop \\\n", "16 0.5 0.88 0.26 \n", "23 1 0.56 0.09 \n", "33 0 0.60 0.1 \n", "68 0 1.00 0.23 \n", "74 1 0.44 0.11 \n", "... ... ... ... \n", "1880 0 0.99 0.19 \n", "1963 0 0.37 1 \n", "1981 0 0.62 0.15 \n", "1991 0 0.91 0.28 \n", "1992 0 0.22 0.18 \n", "\n", " ViolentCrimesPerPop \n", "16 0.49 \n", "23 0.63 \n", "33 0.31 \n", "68 0.50 \n", "74 0.14 \n", "... ... 
\n", "1880 0.22 \n", "1963 0.45 \n", "1981 0.07 \n", "1991 0.23 \n", "1992 0.19 \n", "\n", "[123 rows x 128 columns]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import sklearn\n",
    "from sklearn.preprocessing import PolynomialFeatures, LabelEncoder\n",
    "from sklearn.linear_model import LinearRegression, Ridge, RidgeCV\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "# Column names for communities.data, which is read without a header row.\n",
    "# The last column, ViolentCrimesPerPop, is the regression target used below.\n",
    "col_names = [\n",
    "\"state\",\n",
    "\"county\",\n",
    "\"community\",\n",
    "\"communityname\",\n",
    "\"fold\",\n",
    "\"population\",\n",
    "\"householdsize\",\n",
    "\"racepctblack\",\n",
    "\"racePctWhite\",\n",
    "\"racePctAsian\",\n",
    "\"racePctHisp\",\n",
    "\"agePct12t21\",\n",
    "\"agePct12t29\",\n",
    "\"agePct16t24\",\n",
    "\"agePct65up\",\n",
    "\"numbUrban\",\n",
    "\"pctUrban\",\n",
    "\"medIncome\",\n",
    "\"pctWWage\",\n",
    "\"pctWFarmSelf\",\n",
    "\"pctWInvInc\",\n",
    "\"pctWSocSec\",\n",
    "\"pctWPubAsst\",\n",
    "\"pctWRetire\",\n",
    "\"medFamInc\",\n",
    "\"perCapInc\",\n",
    "\"whitePerCap\",\n",
    "\"blackPerCap\",\n",
    "\"indianPerCap\",\n",
    "\"AsianPerCap\",\n",
    "\"OtherPerCap\",\n",
    "\"HispPerCap\",\n",
    "\"NumUnderPov\",\n",
    "\"PctPopUnderPov\",\n",
    "\"PctLess9thGrade\",\n",
    "\"PctNotHSGrad\",\n",
    "\"PctBSorMore\",\n",
    "\"PctUnemployed\",\n",
    "\"PctEmploy\",\n",
    "\"PctEmplManu\",\n",
    "\"PctEmplProfServ\",\n",
    "\"PctOccupManu\",\n",
    "\"PctOccupMgmtProf\",\n",
    "\"MalePctDivorce\",\n",
    "\"MalePctNevMarr\",\n",
    "\"FemalePctDiv\",\n",
    "\"TotalPctDiv\",\n",
    "\"PersPerFam\",\n",
    "\"PctFam2Par\",\n",
    "\"PctKids2Par\",\n",
    "\"PctYoungKids2Par\",\n",
    "\"PctTeen2Par\",\n",
    "\"PctWorkMomYoungKids\",\n",
    "\"PctWorkMom\",\n",
    "\"NumIlleg\",\n",
    "\"PctIlleg\",\n",
    "\"NumImmig\",\n",
    "\"PctImmigRecent\",\n",
    "\"PctImmigRec5\",\n",
    "\"PctImmigRec8\",\n",
    "\"PctImmigRec10\",\n",
    "\"PctRecentImmig\",\n",
    "\"PctRecImmig5\",\n",
    "\"PctRecImmig8\",\n",
    "\"PctRecImmig10\",\n",
    "\"PctSpeakEnglOnly\",\n",
    "\"PctNotSpeakEnglWell\",\n",
    "\"PctLargHouseFam\",\n",
    "\"PctLargHouseOccup\",\n",
    "\"PersPerOccupHous\",\n",
    "\"PersPerOwnOccHous\",\n",
    "\"PersPerRentOccHous\",\n",
    "\"PctPersOwnOccup\",\n",
    "\"PctPersDenseHous\",\n",
    "\"PctHousLess3BR\",\n",
    "\"MedNumBR\",\n",
    "\"HousVacant\",\n",
    "\"PctHousOccup\",\n",
    "\"PctHousOwnOcc\",\n",
    "\"PctVacantBoarded\",\n",
    "\"PctVacMore6Mos\",\n",
    "\"MedYrHousBuilt\",\n",
    "\"PctHousNoPhone\",\n",
    "\"PctWOFullPlumb\",\n",
    "\"OwnOccLowQuart\",\n",
    "\"OwnOccMedVal\",\n",
    "\"OwnOccHiQuart\",\n",
    "\"RentLowQ\",\n",
    "\"RentMedian\",\n",
    "\"RentHighQ\",\n",
    "\"MedRent\",\n",
    "\"MedRentPctHousInc\",\n",
    "\"MedOwnCostPctInc\",\n",
    "\"MedOwnCostPctIncNoMtg\",\n",
    "\"NumInShelters\",\n",
    "\"NumStreet\",\n",
    "\"PctForeignBorn\",\n",
    "\"PctBornSameState\",\n",
    "\"PctSameHouse85\",\n",
    "\"PctSameCity85\",\n",
    "\"PctSameState85\",\n",
    "\"LemasSwornFT\",\n",
    "\"LemasSwFTPerPop\",\n",
    "\"LemasSwFTFieldOps\",\n",
    "\"LemasSwFTFieldPerPop\",\n",
    "\"LemasTotalReq\",\n",
    "\"LemasTotReqPerPop\",\n",
    "\"PolicReqPerOffic\",\n",
    "\"PolicPerPop\",\n",
    "\"RacialMatchCommPol\",\n",
    "\"PctPolicWhite\",\n",
    "\"PctPolicBlack\",\n",
    "\"PctPolicHisp\",\n",
    "\"PctPolicAsian\",\n",
    "\"PctPolicMinor\",\n",
    "\"OfficAssgnDrugUnits\",\n",
    "\"NumKindsDrugsSeiz\",\n",
    "\"PolicAveOTWorked\",\n",
    "\"LandArea\",\n",
    "\"PopDens\",\n",
    "\"PctUsePubTrans\",\n",
    "\"PolicCars\",\n",
    "\"PolicOperBudg\",\n",
    "\"LemasPctPolicOnPatr\",\n",
    "\"LemasGangUnitDeploy\",\n",
    "\"LemasPctOfficDrugUn\",\n",
    "\"PolicBudgPerPop\",\n",
    "\"ViolentCrimesPerPop\"]\n",
    "\n",
    "# The file writes missing values as '?'. Parsing them as NaN at read time keeps\n",
    "# the affected columns numeric and avoids DataFrame.replace('?', None), whose\n",
    "# handling of value=None has differed between pandas releases.\n",
    "df = pd.read_csv('communities.data', names=col_names, na_values='?')\n",
    "\n",
    "# Keep only rows with no missing value in any column.\n",
    "df = df.dropna(axis='rows')\n",
    "\n",
    "# Replace the community name strings with integer labels so every column is numeric.\n",
    "le = LabelEncoder()\n",
    "df['communityname'] = le.fit_transform(df['communityname'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "9c0f6f6d",
   "metadata": {},
   "outputs":
[],
   "source": [
    "X = df.loc[:,df.columns != 'ViolentCrimesPerPop']\n",
    "y = df['ViolentCrimesPerPop']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbfe728b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Polynomial expansion of the predictors. This has to run before the split\n",
    "# below, which consumes `poly_features`.\n",
    "# The number of generated columns grows combinatorially with the degree, so\n",
    "# raise POLY_DEGREE with care on a frame this wide.\n",
    "POLY_DEGREE = 2\n",
    "poly = PolynomialFeatures(degree=POLY_DEGREE, include_bias=False)\n",
    "poly_features = poly.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bdc2e77",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed seed so the polynomial-model scores are reproducible across runs.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    poly_features, y, test_size=0.3, random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bbd2ddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select alpha by cross-validation on the training split only, in the same\n",
    "# feature space the final model is fitted on, so the held-out rows never\n",
    "# influence model selection.\n",
    "ridgecv = RidgeCV(alphas=np.arange(1, 100, 5), scoring='r2', cv=10)\n",
    "ridgecv.fit(X_train, y_train)\n",
    "\n",
    "# Refit a plain Ridge with the selected alpha and score it on the held-out rows.\n",
    "ridge = Ridge(alpha=ridgecv.alpha_)\n",
    "ridge.fit(X_train, y_train)\n",
    "ridge_y_predicted = ridge.predict(X_test)\n",
    "ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_y_predicted))\n",
    "ridge_rmse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3be15622",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unregularised least squares on the polynomial features, for comparison with ridge.\n",
    "poly_reg_model = LinearRegression()\n",
    "poly_reg_model.fit(X_train,y_train)\n",
    "poly_reg_y_predicted = poly_reg_model.predict(X_test)\n",
    "poly_reg_rmse = np.sqrt(mean_squared_error(y_test, poly_reg_y_predicted))\n",
    "poly_reg_rmse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "4ab0949a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.3085495600528652"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Baseline: least squares on the raw (non-polynomial) features.\n",
    "# Note that this re-split rebinds X_train, X_test, y_train and y_test.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
    "lin_reg_model = LinearRegression()\n",
    "lin_reg_model.fit(X_train, y_train)\n",
    "lin_reg_y_predicted = lin_reg_model.predict(X_test)\n",
    "lin_reg_rmse = np.sqrt(mean_squared_error(y_test, lin_reg_y_predicted))\n",
    "lin_reg_rmse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99365180",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e0b2f8e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}