uczenie_maszynowe_zadania/cw_7/main.ipynb
2023-07-04 20:42:14 +02:00

22 KiB
Raw Permalink Blame History

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Column names for 'communities.data', listed in file order. They are passed
# to pd.read_csv(..., names=col_names) below. The final entry,
# "ViolentCrimesPerPop", is the column used as the regression target.
col_names = [
    "state",
    "county",
    "community",
    "communityname",
    "fold",
    "population",
    "householdsize",
    "racepctblack",
    "racePctWhite",
    "racePctAsian",
    "racePctHisp",
    "agePct12t21",
    "agePct12t29",
    "agePct16t24",
    "agePct65up",
    "numbUrban",
    "pctUrban",
    "medIncome",
    "pctWWage",
    "pctWFarmSelf",
    "pctWInvInc",
    "pctWSocSec",
    "pctWPubAsst",
    "pctWRetire",
    "medFamInc",
    "perCapInc",
    "whitePerCap",
    "blackPerCap",
    "indianPerCap",
    "AsianPerCap",
    "OtherPerCap",
    "HispPerCap",
    "NumUnderPov",
    "PctPopUnderPov",
    "PctLess9thGrade",
    "PctNotHSGrad",
    "PctBSorMore",
    "PctUnemployed",
    "PctEmploy",
    "PctEmplManu",
    "PctEmplProfServ",
    "PctOccupManu",
    "PctOccupMgmtProf",
    "MalePctDivorce",
    "MalePctNevMarr",
    "FemalePctDiv",
    "TotalPctDiv",
    "PersPerFam",
    "PctFam2Par",
    "PctKids2Par",
    "PctYoungKids2Par",
    "PctTeen2Par",
    "PctWorkMomYoungKids",
    "PctWorkMom",
    "NumIlleg",
    "PctIlleg",
    "NumImmig",
    "PctImmigRecent",
    "PctImmigRec5",
    "PctImmigRec8",
    "PctImmigRec10",
    "PctRecentImmig",
    "PctRecImmig5",
    "PctRecImmig8",
    "PctRecImmig10",
    "PctSpeakEnglOnly",
    "PctNotSpeakEnglWell",
    "PctLargHouseFam",
    "PctLargHouseOccup",
    "PersPerOccupHous",
    "PersPerOwnOccHous",
    "PersPerRentOccHous",
    "PctPersOwnOccup",
    "PctPersDenseHous",
    "PctHousLess3BR",
    "MedNumBR",
    "HousVacant",
    "PctHousOccup",
    "PctHousOwnOcc",
    "PctVacantBoarded",
    "PctVacMore6Mos",
    "MedYrHousBuilt",
    "PctHousNoPhone",
    "PctWOFullPlumb",
    "OwnOccLowQuart",
    "OwnOccMedVal",
    "OwnOccHiQuart",
    "RentLowQ",
    "RentMedian",
    "RentHighQ",
    "MedRent",
    "MedRentPctHousInc",
    "MedOwnCostPctInc",
    "MedOwnCostPctIncNoMtg",
    "NumInShelters",
    "NumStreet",
    "PctForeignBorn",
    "PctBornSameState",
    "PctSameHouse85",
    "PctSameCity85",
    "PctSameState85",
    "LemasSwornFT",
    "LemasSwFTPerPop",
    "LemasSwFTFieldOps",
    "LemasSwFTFieldPerPop",
    "LemasTotalReq",
    "LemasTotReqPerPop",
    "PolicReqPerOffic",
    "PolicPerPop",
    "RacialMatchCommPol",
    "PctPolicWhite",
    "PctPolicBlack",
    "PctPolicHisp",
    "PctPolicAsian",
    "PctPolicMinor",
    "OfficAssgnDrugUnits",
    "NumKindsDrugsSeiz",
    "PolicAveOTWorked",
    "LandArea",
    "PopDens",
    "PctUsePubTrans",
    "PolicCars",
    "PolicOperBudg",
    "LemasPctPolicOnPatr",
    "LemasGangUnitDeploy",
    "LemasPctOfficDrugUn",
    "PolicBudgPerPop",
    "ViolentCrimesPerPop",
]

# Load the data set, labelling the columns with col_names.
# '?' is the missing-value marker in this file. Declaring it via na_values
# makes the parser turn it into NaN directly, so the affected columns are
# read as numeric instead of staying strings. It also avoids
# DataFrame.replace('?', None), whose meaning depends on the pandas version
# (older releases treat value=None as "pad-fill", not "set to missing").
df = pd.read_csv('communities.data', names=col_names, na_values='?')

# Keep only the rows in which every column is present.
df = df.dropna(axis='rows')

# Replace the textual community name with integer codes so that the column
# can be fed to the regressors together with the numeric features.
le = LabelEncoder()
df['communityname'] = le.fit_transform(df['communityname'])
df
state county community communityname fold population householdsize racepctblack racePctWhite racePctAsian ... LandArea PopDens PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop ViolentCrimesPerPop
16 36 1 1000 0 1 0.15 0.31 0.40 0.63 0.14 ... 0.06 0.39 0.84 0.06 0.06 0.91 0.5 0.88 0.26 0.49
23 19 193 93926 94 1 0.11 0.43 0.04 0.89 0.09 ... 0.16 0.12 0.07 0.04 0.01 0.81 1 0.56 0.09 0.63
33 51 680 47672 52 1 0.09 0.43 0.51 0.58 0.04 ... 0.14 0.11 0.19 0.05 0.01 0.75 0 0.60 0.1 0.31
68 34 23 58200 79 1 0.05 0.59 0.23 0.39 0.09 ... 0.01 0.73 0.28 0 0.02 0.64 0 1.00 0.23 0.50
74 9 9 46520 58 1 0.08 0.39 0.08 0.85 0.04 ... 0.07 0.21 0.04 0.02 0.01 0.7 1 0.44 0.11 0.14
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1880 34 39 40350 50 10 0.04 0.39 0.39 0.65 0.09 ... 0.03 0.28 0.32 0.02 0.01 0.85 0 0.99 0.19 0.22
1963 36 27 59641 85 10 0.03 0.32 0.61 0.47 0.09 ... 0.01 0.47 0.42 0.07 0.08 0.49 0 0.37 1 0.45
1981 9 9 35650 36 10 0.07 0.38 0.17 0.84 0.11 ... 0.09 0.13 0.17 0.02 0.01 0.72 0 0.62 0.15 0.07
1991 9 9 80070 110 10 0.16 0.37 0.25 0.69 0.04 ... 0.08 0.32 0.18 0.08 0.06 0.78 0 0.91 0.28 0.23
1992 25 17 72600 107 10 0.08 0.51 0.06 0.87 0.22 ... 0.03 0.38 0.33 0.02 0.02 0.79 0 0.22 0.18 0.19

123 rows × 128 columns

# Ridge regression with a cross-validated regularisation strength.

# Features are every column except the target; cast to float so that columns
# which were read as strings (numeric text) are handled uniformly.
X = df.loc[:, df.columns != 'ViolentCrimesPerPop'].astype(float)
y = df['ViolentCrimesPerPop'].astype(float)

# Split the actual feature matrix. The previous version split `poly_features`,
# a name that is never defined (its only definition is commented out), so the
# cell failed with NameError on a fresh run. The fixed seed makes the reported
# score reproducible and matches the split used by the linear-regression cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Choose alpha using the training rows only; fitting RidgeCV on all of X would
# let the held-out rows influence the hyper-parameter.
ridgecv = RidgeCV(alphas=np.arange(1, 100, 5), scoring='r2', cv=10)
ridgecv.fit(X_train, y_train)

# Refit a plain Ridge model with the selected alpha and score it on the test split.
ridge = Ridge(alpha=ridgecv.alpha_)
ridge.fit(X_train, y_train)
ridge_y_predicted = ridge.predict(X_test)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_y_predicted))
ridge_rmse
/home/tonywesoly/.local/lib/python3.8/site-packages/sklearn/linear_model/_ridge.py:251: UserWarning: Singular matrix in solving dual problem. Using least-squares solution instead.
  warnings.warn(
1.1533542718655332
# Polynomial regression: linear regression on polynomially expanded features.
#
# The expansion used to be commented out, so this cell silently fitted a plain
# linear model on whatever X_train an earlier cell had left behind. The cell
# now builds its own inputs. Degree 2 is used because the originally
# commented-out degree=11 is combinatorially infeasible for this many feature
# columns.
# NOTE(review): degree=2 is a choice made during cleanup - confirm the intended degree.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Same seeded split as the linear-regression cell, so the RMSE values are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the expansion on the training rows only, then apply it to the test rows.
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

poly_reg_model = LinearRegression()
poly_reg_model.fit(X_train_poly, y_train)
poly_reg_y_predicted = poly_reg_model.predict(X_test_poly)
poly_reg_rmse = np.sqrt(mean_squared_error(y_test, poly_reg_y_predicted))
poly_reg_rmse
1.6511181528162753
# Baseline: ordinary least squares on the raw features.
# A 70/30 train/test split with a fixed seed keeps the score reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
lin_reg_model = LinearRegression().fit(X_train, y_train)
lin_reg_y_predicted = lin_reg_model.predict(X_test)

# Root-mean-squared error on the held-out rows.
lin_reg_mse = mean_squared_error(y_true=y_test, y_pred=lin_reg_y_predicted)
lin_reg_rmse = np.sqrt(lin_reg_mse)
lin_reg_rmse
0.3085495600528652