uczenie_maszynowe_zadania/main.ipynb at master

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Column labels, in file order, that are passed to pd.read_csv(names=...)
# when loading 'communities.data'. The last entry, "ViolentCrimesPerPop",
# is the column later used as the regression target.
col_names = [
    "state",
    "county",
    "community",
    "communityname",
    "fold",
    "population",
    "householdsize",
    "racepctblack",
    "racePctWhite",
    "racePctAsian",
    "racePctHisp",
    "agePct12t21",
    "agePct12t29",
    "agePct16t24",
    "agePct65up",
    "numbUrban",
    "pctUrban",
    "medIncome",
    "pctWWage",
    "pctWFarmSelf",
    "pctWInvInc",
    "pctWSocSec",
    "pctWPubAsst",
    "pctWRetire",
    "medFamInc",
    "perCapInc",
    "whitePerCap",
    "blackPerCap",
    "indianPerCap",
    "AsianPerCap",
    "OtherPerCap",
    "HispPerCap",
    "NumUnderPov",
    "PctPopUnderPov",
    "PctLess9thGrade",
    "PctNotHSGrad",
    "PctBSorMore",
    "PctUnemployed",
    "PctEmploy",
    "PctEmplManu",
    "PctEmplProfServ",
    "PctOccupManu",
    "PctOccupMgmtProf",
    "MalePctDivorce",
    "MalePctNevMarr",
    "FemalePctDiv",
    "TotalPctDiv",
    "PersPerFam",
    "PctFam2Par",
    "PctKids2Par",
    "PctYoungKids2Par",
    "PctTeen2Par",
    "PctWorkMomYoungKids",
    "PctWorkMom",
    "NumIlleg",
    "PctIlleg",
    "NumImmig",
    "PctImmigRecent",
    "PctImmigRec5",
    "PctImmigRec8",
    "PctImmigRec10",
    "PctRecentImmig",
    "PctRecImmig5",
    "PctRecImmig8",
    "PctRecImmig10",
    "PctSpeakEnglOnly",
    "PctNotSpeakEnglWell",
    "PctLargHouseFam",
    "PctLargHouseOccup",
    "PersPerOccupHous",
    "PersPerOwnOccHous",
    "PersPerRentOccHous",
    "PctPersOwnOccup",
    "PctPersDenseHous",
    "PctHousLess3BR",
    "MedNumBR",
    "HousVacant",
    "PctHousOccup",
    "PctHousOwnOcc",
    "PctVacantBoarded",
    "PctVacMore6Mos",
    "MedYrHousBuilt",
    "PctHousNoPhone",
    "PctWOFullPlumb",
    "OwnOccLowQuart",
    "OwnOccMedVal",
    "OwnOccHiQuart",
    "RentLowQ",
    "RentMedian",
    "RentHighQ",
    "MedRent",
    "MedRentPctHousInc",
    "MedOwnCostPctInc",
    "MedOwnCostPctIncNoMtg",
    "NumInShelters",
    "NumStreet",
    "PctForeignBorn",
    "PctBornSameState",
    "PctSameHouse85",
    "PctSameCity85",
    "PctSameState85",
    "LemasSwornFT",
    "LemasSwFTPerPop",
    "LemasSwFTFieldOps",
    "LemasSwFTFieldPerPop",
    "LemasTotalReq",
    "LemasTotReqPerPop",
    "PolicReqPerOffic",
    "PolicPerPop",
    "RacialMatchCommPol",
    "PctPolicWhite",
    "PctPolicBlack",
    "PctPolicHisp",
    "PctPolicAsian",
    "PctPolicMinor",
    "OfficAssgnDrugUnits",
    "NumKindsDrugsSeiz",
    "PolicAveOTWorked",
    "LandArea",
    "PopDens",
    "PctUsePubTrans",
    "PolicCars",
    "PolicOperBudg",
    "LemasPctPolicOnPatr",
    "LemasGangUnitDeploy",
    "LemasPctOfficDrugUn",
    "PolicBudgPerPop",
    "ViolentCrimesPerPop",
]

# Load the data file, labelling its columns with col_names.
# Missing entries are written as '?' in the file. Declaring that marker via
# na_values makes pandas parse them as NaN while reading, so the affected
# columns come out numeric instead of object/strings. It also avoids
# DataFrame.replace('?', None), whose handling of a None replacement value
# has differed between pandas releases.
df = pd.read_csv('communities.data', names=col_names, na_values='?')

# Keep only the rows that have a value in every column.
df = df.dropna(axis='rows')

# Replace the community name strings with integer codes so the column can be
# fed to the regressors below.
# NOTE(review): these codes impose an arbitrary ordering on the names;
# consider dropping the column instead -- confirm the intent.
le = LabelEncoder()
df['communityname'] = le.fit_transform(df['communityname'])
df

	state	county	community	communityname	fold	population	householdsize	racepctblack	racePctWhite	racePctAsian	...	LandArea	PopDens	PctUsePubTrans	PolicCars	PolicOperBudg	LemasPctPolicOnPatr	LemasGangUnitDeploy	LemasPctOfficDrugUn	PolicBudgPerPop	ViolentCrimesPerPop
16	36	1	1000	0	1	0.15	0.31	0.40	0.63	0.14	...	0.06	0.39	0.84	0.06	0.06	0.91	0.5	0.88	0.26	0.49
23	19	193	93926	94	1	0.11	0.43	0.04	0.89	0.09	...	0.16	0.12	0.07	0.04	0.01	0.81	1	0.56	0.09	0.63
33	51	680	47672	52	1	0.09	0.43	0.51	0.58	0.04	...	0.14	0.11	0.19	0.05	0.01	0.75	0	0.60	0.1	0.31
68	34	23	58200	79	1	0.05	0.59	0.23	0.39	0.09	...	0.01	0.73	0.28	0	0.02	0.64	0	1.00	0.23	0.50
74	9	9	46520	58	1	0.08	0.39	0.08	0.85	0.04	...	0.07	0.21	0.04	0.02	0.01	0.7	1	0.44	0.11	0.14
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1880	34	39	40350	50	10	0.04	0.39	0.39	0.65	0.09	...	0.03	0.28	0.32	0.02	0.01	0.85	0	0.99	0.19	0.22
1963	36	27	59641	85	10	0.03	0.32	0.61	0.47	0.09	...	0.01	0.47	0.42	0.07	0.08	0.49	0	0.37	1	0.45
1981	9	9	35650	36	10	0.07	0.38	0.17	0.84	0.11	...	0.09	0.13	0.17	0.02	0.01	0.72	0	0.62	0.15	0.07
1991	9	9	80070	110	10	0.16	0.37	0.25	0.69	0.04	...	0.08	0.32	0.18	0.08	0.06	0.78	0	0.91	0.28	0.23
1992	25	17	72600	107	10	0.08	0.51	0.06	0.87	0.22	...	0.03	0.38	0.33	0.02	0.02	0.79	0	0.22	0.18	0.19

123 rows × 128 columns

# Features are every column except 'ViolentCrimesPerPop'; that column is the
# regression target.
X = df.loc[:, df.columns != 'ViolentCrimesPerPop']
y = df['ViolentCrimesPerPop']

# Build the polynomial features here, before they are used. Previously
# `poly_features` was referenced without being defined anywhere above this
# point, so a fresh top-to-bottom run failed with a NameError.
# Degree 2 keeps the expansion tractable for this many input columns.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Fixed seed so the 70/30 split (and the reported RMSE) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, test_size=0.3, random_state=42
)

# Select alpha by 10-fold cross-validation on the training split only, in the
# same feature space the final model is trained on. Fitting the search on the
# raw, unsplit X would pick alpha for a different feature space and let the
# held-out rows influence the choice.
ridgecv = RidgeCV(alphas=np.arange(1, 100, 5), scoring='r2', cv=10)
ridgecv.fit(X_train, y_train)

# Refit a plain Ridge with the selected alpha and score it on the held-out rows.
ridge = Ridge(alpha=ridgecv.alpha_)
ridge.fit(X_train, y_train)
ridge_y_predicted = ridge.predict(X_test)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_y_predicted))
ridge_rmse

/home/tonywesoly/.local/lib/python3.8/site-packages/sklearn/linear_model/_ridge.py:251: UserWarning: Singular matrix in solving dual problem. Using least-squares solution instead.
  warnings.warn(

1.1533542718655332

# Polynomial regression: ordinary least squares on polynomial features of X.
# The expansion is built in this cell so the model no longer depends on
# whatever X_train an earlier cell happened to leave behind.
# Degree 2 is used because the number of generated terms grows
# combinatorially with the degree; degree 11 over this many columns is not
# computable in practice.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Same test_size and seed as the plain linear-regression split, so the two
# models are scored on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, test_size=0.3, random_state=42
)

poly_reg_model = LinearRegression()
poly_reg_model.fit(X_train, y_train)
poly_reg_y_predicted = poly_reg_model.predict(X_test)
poly_reg_rmse = np.sqrt(mean_squared_error(y_test, poly_reg_y_predicted))
poly_reg_rmse

1.6511181528162753

# Baseline: ordinary least squares on the untransformed features.
# The split is seeded, so the held-out 30% is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can chain.
lin_reg_model = LinearRegression().fit(X_train, y_train)
lin_reg_y_predicted = lin_reg_model.predict(X_test)

# Root-mean-squared error on the held-out rows; the bare name on the last
# line displays the value as the cell output.
lin_reg_mse = mean_squared_error(y_test, lin_reg_y_predicted)
lin_reg_rmse = np.sqrt(lin_reg_mse)
lin_reg_rmse

0.3085495600528652

22 KiB Raw Permalink Blame History Unescape Escape

22 KiB

Raw Permalink Blame History