22 KiB
22 KiB
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Column labels for the header-less data file, in file order. They are handed
# to pd.read_csv(names=...) below; the final entry, 'ViolentCrimesPerPop', is
# the column later used as the regression target.
col_names = [
    'state', 'county', 'community', 'communityname',
    'fold', 'population', 'householdsize', 'racepctblack',
    'racePctWhite', 'racePctAsian', 'racePctHisp', 'agePct12t21',
    'agePct12t29', 'agePct16t24', 'agePct65up', 'numbUrban',
    'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf',
    'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire',
    'medFamInc', 'perCapInc', 'whitePerCap', 'blackPerCap',
    'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'HispPerCap',
    'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad',
    'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu',
    'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce',
    'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv', 'PersPerFam',
    'PctFam2Par', 'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par',
    'PctWorkMomYoungKids', 'PctWorkMom', 'NumIlleg', 'PctIlleg',
    'NumImmig', 'PctImmigRecent', 'PctImmigRec5', 'PctImmigRec8',
    'PctImmigRec10', 'PctRecentImmig', 'PctRecImmig5', 'PctRecImmig8',
    'PctRecImmig10', 'PctSpeakEnglOnly', 'PctNotSpeakEnglWell', 'PctLargHouseFam',
    'PctLargHouseOccup', 'PersPerOccupHous', 'PersPerOwnOccHous', 'PersPerRentOccHous',
    'PctPersOwnOccup', 'PctPersDenseHous', 'PctHousLess3BR', 'MedNumBR',
    'HousVacant', 'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded',
    'PctVacMore6Mos', 'MedYrHousBuilt', 'PctHousNoPhone', 'PctWOFullPlumb',
    'OwnOccLowQuart', 'OwnOccMedVal', 'OwnOccHiQuart', 'RentLowQ',
    'RentMedian', 'RentHighQ', 'MedRent', 'MedRentPctHousInc',
    'MedOwnCostPctInc', 'MedOwnCostPctIncNoMtg', 'NumInShelters', 'NumStreet',
    'PctForeignBorn', 'PctBornSameState', 'PctSameHouse85', 'PctSameCity85',
    'PctSameState85', 'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps',
    'LemasSwFTFieldPerPop', 'LemasTotalReq', 'LemasTotReqPerPop', 'PolicReqPerOffic',
    'PolicPerPop', 'RacialMatchCommPol', 'PctPolicWhite', 'PctPolicBlack',
    'PctPolicHisp', 'PctPolicAsian', 'PctPolicMinor', 'OfficAssgnDrugUnits',
    'NumKindsDrugsSeiz', 'PolicAveOTWorked', 'LandArea', 'PopDens',
    'PctUsePubTrans', 'PolicCars', 'PolicOperBudg', 'LemasPctPolicOnPatr',
    'LemasGangUnitDeploy', 'LemasPctOfficDrugUn', 'PolicBudgPerPop', 'ViolentCrimesPerPop',
]
# Load the raw table; names= supplies the column labels from col_names.
# Missing values are encoded in the file as the literal '?'. Declaring that
# at parse time yields real NaN cells and lets pandas infer numeric dtypes,
# instead of relying on DataFrame.replace('?', None), whose handling of a
# None replacement value differs between pandas versions (older releases
# forward-fill rather than insert NaN) and which leaves columns as strings.
df = pd.read_csv('communities.data', names=col_names, na_values='?')
# Keep only rows in which every column is observed.
df = df.dropna(axis='index')
# Map each community name to an integer code so the column is numeric.
# `le` is kept so the codes can be mapped back with le.inverse_transform.
le = LabelEncoder()
df['communityname'] = le.fit_transform(df['communityname'])
df
state | county | community | communityname | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | ... | LandArea | PopDens | PctUsePubTrans | PolicCars | PolicOperBudg | LemasPctPolicOnPatr | LemasGangUnitDeploy | LemasPctOfficDrugUn | PolicBudgPerPop | ViolentCrimesPerPop | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16 | 36 | 1 | 1000 | 0 | 1 | 0.15 | 0.31 | 0.40 | 0.63 | 0.14 | ... | 0.06 | 0.39 | 0.84 | 0.06 | 0.06 | 0.91 | 0.5 | 0.88 | 0.26 | 0.49 |
23 | 19 | 193 | 93926 | 94 | 1 | 0.11 | 0.43 | 0.04 | 0.89 | 0.09 | ... | 0.16 | 0.12 | 0.07 | 0.04 | 0.01 | 0.81 | 1 | 0.56 | 0.09 | 0.63 |
33 | 51 | 680 | 47672 | 52 | 1 | 0.09 | 0.43 | 0.51 | 0.58 | 0.04 | ... | 0.14 | 0.11 | 0.19 | 0.05 | 0.01 | 0.75 | 0 | 0.60 | 0.1 | 0.31 |
68 | 34 | 23 | 58200 | 79 | 1 | 0.05 | 0.59 | 0.23 | 0.39 | 0.09 | ... | 0.01 | 0.73 | 0.28 | 0 | 0.02 | 0.64 | 0 | 1.00 | 0.23 | 0.50 |
74 | 9 | 9 | 46520 | 58 | 1 | 0.08 | 0.39 | 0.08 | 0.85 | 0.04 | ... | 0.07 | 0.21 | 0.04 | 0.02 | 0.01 | 0.7 | 1 | 0.44 | 0.11 | 0.14 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1880 | 34 | 39 | 40350 | 50 | 10 | 0.04 | 0.39 | 0.39 | 0.65 | 0.09 | ... | 0.03 | 0.28 | 0.32 | 0.02 | 0.01 | 0.85 | 0 | 0.99 | 0.19 | 0.22 |
1963 | 36 | 27 | 59641 | 85 | 10 | 0.03 | 0.32 | 0.61 | 0.47 | 0.09 | ... | 0.01 | 0.47 | 0.42 | 0.07 | 0.08 | 0.49 | 0 | 0.37 | 1 | 0.45 |
1981 | 9 | 9 | 35650 | 36 | 10 | 0.07 | 0.38 | 0.17 | 0.84 | 0.11 | ... | 0.09 | 0.13 | 0.17 | 0.02 | 0.01 | 0.72 | 0 | 0.62 | 0.15 | 0.07 |
1991 | 9 | 9 | 80070 | 110 | 10 | 0.16 | 0.37 | 0.25 | 0.69 | 0.04 | ... | 0.08 | 0.32 | 0.18 | 0.08 | 0.06 | 0.78 | 0 | 0.91 | 0.28 | 0.23 |
1992 | 25 | 17 | 72600 | 107 | 10 | 0.08 | 0.51 | 0.06 | 0.87 | 0.22 | ... | 0.03 | 0.38 | 0.33 | 0.02 | 0.02 | 0.79 | 0 | 0.22 | 0.18 | 0.19 |
123 rows × 128 columns
# Features are every column except the regression target. Casting to float
# makes the numeric conversion explicit even if the frame still holds
# numeric strings.
X = df.drop(columns='ViolentCrimesPerPop').astype(float)
y = df['ViolentCrimesPerPop'].astype(float)
# Split the raw feature matrix: `poly_features` is never defined (the code
# that would build it is commented out), so using it here raised NameError.
# random_state pins the shuffle so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Choose the regularisation strength by 10-fold CV on the training rows only;
# fitting the search on the full X, y would let the test rows influence alpha.
ridgecv = RidgeCV(alphas=np.arange(1, 100, 5), scoring='r2', cv=10)
ridgecv.fit(X_train, y_train)
# Refit a plain Ridge with the selected alpha and score it on the held-out rows.
ridge = Ridge(alpha=ridgecv.alpha_)
ridge.fit(X_train, y_train)
ridge_y_predicted = ridge.predict(X_test)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_y_predicted))
ridge_rmse
/home/tonywesoly/.local/lib/python3.8/site-packages/sklearn/linear_model/_ridge.py:251: UserWarning: Singular matrix in solving dual problem. Using least-squares solution instead. warnings.warn(
1.1533542718655332
# NOTE: the polynomial feature expansion below is disabled, so despite the
# "poly" naming this cell fits an ordinary least-squares model on whatever
# X_train / X_test split is currently in scope.
# poly = PolynomialFeatures(degree=11, include_bias=False)
# poly_features = poly.fit_transform(X)
poly_reg_model = LinearRegression().fit(X_train, y_train)
poly_reg_y_predicted = poly_reg_model.predict(X_test)
# RMSE on the held-out rows: square root of the mean squared error.
poly_reg_mse = mean_squared_error(y_test, poly_reg_y_predicted)
poly_reg_rmse = np.sqrt(poly_reg_mse)
poly_reg_rmse
1.6511181528162753
# Linear regression on the untransformed features, with a fixed-seed 70/30
# split so the score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
lin_reg_model = LinearRegression().fit(X_train, y_train)
lin_reg_y_predicted = lin_reg_model.predict(X_test)
# RMSE on the held-out rows: square root of the mean squared error.
lin_reg_mse = mean_squared_error(y_test, lin_reg_y_predicted)
lin_reg_rmse = np.sqrt(lin_reg_mse)
lin_reg_rmse
0.3085495600528652