kerreg-MPSKICB/KernelRegression.ipynb
2021-05-27 20:46:48 +02:00

1.0 MiB
Raw Blame History

import numpy as np
import plotly.express as px
from statsmodels.nonparametric.kernel_regression import KernelReg
import plotly.graph_objs as go
import pandas as pd 
np.random.seed(1)# xwidth controls the range of x values.
xwidth = 20
x = np.arange(0,xwidth,1)# we want to add some noise to the x values so that dont sit at regular intervals
x_residuals = np.random.normal(scale=0.2, size=[x.shape[0]])# new_x is the range of x values we will be using all the way through
new_x = x + x_residuals# We generate residuals for y values since we want to show some variation in the data
num_points = x.shape[0]
residuals = np.random.normal(scale=2.0, size=[num_points])# We will be using fun_y to generate y values all the way through
fun_y = lambda x: -(x*x) + residuals
new_x
array([ 0.32486907,  0.87764872,  1.89436565,  2.78540628,  4.17308153,
        4.53969226,  6.34896235,  6.84775862,  8.06380782,  8.95012592,
       10.29242159, 10.58797186, 11.93551656, 12.92318913, 14.22675389,
       14.78002175, 15.96551436, 16.82442832, 18.00844275, 19.11656304])
np.random.seed(1)# xwidth controls the range of x values.
xwidth = 20
x = np.arange(0,xwidth,1)# we want to add some noise to the x values so that dont sit at regular intervals
x_residuals = np.random.normal(scale=0.2, size=[x.shape[0]])# new_x is the range of x values we will be using all the way through
new_x = x + x_residuals# We generate residuals for y values since we want to show some variation in the data
num_points = x.shape[0]
residuals = np.random.normal(scale=2.0, size=[num_points])# We will be using fun_y to generate y values all the way through
fun_y = lambda x: -(x*x) + residuals
# Plot the x and y values 
px.scatter(x=new_x,y=fun_y(new_x), title='Figure 1:  Visualizing the generated data')
#ker_log = KernelReg(new_x, fun_y(new_x), 'c')
#fig = px.scatter(x=new_x,y=fun_y(new_x),  title='Figure 2: Statsmodels fit to generated data')
#fig.add_trace(go.Scatter(x=new_x, y=pred_y, name='Statsmodels fit',  mode='lines'))
kernel_x = np.arange(-xwidth,xwidth, 0.1)
bw_manual = 2

def gauss_const(h):
    """
    Returns the normalization constant for a gaussian
    """
    return 1/(h*np.sqrt(np.pi*2))

def gauss_exp(ker_x, xi, h): 
    """
    Returns the gaussian function exponent term
    """
    num =  - 0.5*np.square((xi- ker_x))
    den = h*h
    return num/den

def kernel_function(h, ker_x, xi): 
    """
    Returns the gaussian function value. Combines the gauss_const and
    gauss_exp to get this result
    """
    const = gauss_const(h)
    gauss_val = const*np.exp(gauss_exp(ker_x,xi,h))
    return gauss_val

# We are selecting a single point and calculating the Kernel value
input_x = new_x[0]
col1 = gauss_const(bw_manual)
col2= gauss_exp(kernel_x, input_x, bw_manual)
col3 = kernel_function(bw_manual, kernel_x, input_x)
# Dataframe for a single observation point x_i. In the code x_i comes from new_x
data = {'Input_x': [input_x for x in range(col2.shape[0])],
        'kernel_x': kernel_x,
        'gaussian_const': [col1 for x in range(col2.shape[0])],
        'gaussian_exp': col2,
        'full_gaussian_value': col3,
        'bw':[bw_manual for x in range(col2.shape[0])],
        }
single_pt_KE = pd.DataFrame(data=data)
single_pt_KE
Input_x kernel_x gaussian_const gaussian_exp full_gaussian_value bw
0 0.324869 -20.0 0.199471 -51.637538 7.481390e-24 2
1 0.324869 -19.9 0.199471 -51.130666 1.241978e-23 2
2 0.324869 -19.8 0.199471 -50.626294 2.056647e-23 2
3 0.324869 -19.7 0.199471 -50.124423 3.397190e-23 2
4 0.324869 -19.6 0.199471 -49.625051 5.597502e-23 2
... ... ... ... ... ... ...
395 0.324869 19.5 0.199471 -45.960706 2.184737e-21 2
396 0.324869 19.6 0.199471 -46.441334 1.351030e-21 2
397 0.324869 19.7 0.199471 -46.924462 8.333837e-22 2
398 0.324869 19.8 0.199471 -47.410091 5.127898e-22 2
399 0.324869 19.9 0.199471 -47.898219 3.147371e-22 2

400 rows × 6 columns

# Plotting a scatter plot of Kernel 
px.line(x=kernel_x, y=col3, title='Figure 3: Kernel function for a single input value')
## Plotting gaussian for all input x points 
kernel_fns = {'kernel_x': kernel_x}
for input_x in new_x: 
    input_string= 'x_value_{}'.format(np.round(input_x,2)) 
    kernel_fns[input_string] = kernel_function(bw_manual, kernel_x, input_x)

kernels_df = pd.DataFrame(data=kernel_fns)
y_all = kernels_df.drop(columns='kernel_x')
px.line(kernels_df, x='kernel_x', y=y_all.columns, title='Gaussian for all input points', range_x=[-5,20])
def weights(bw_manual, input_x, all_input_values ): 
    w_row = []
    for x_i in all_input_values: 
        ki = kernel_function(bw_manual, x_i, input_x)
        ki_sum = np.sum(kernel_function(bw_manual, all_input_values, input_x))
        w_row.append(ki/ki_sum)
    return w_row

def single_y_pred(bw_manual, input_x, iks, igrek): 
    w = weights(bw_manual, input_x, iks)
    y_single = np.sum(np.dot(igrek,w))
    return y_single

#ypred_single = single_y_pred(bw_manual, new_x[0], new_x)
Y_pred = []
for input_x in new_x: 
    w = []
    Y_single = single_y_pred(bw_manual, input_x, new_x)
    Y_pred.append(Y_single)
data= {'x': new_x, 'y': fun_y(new_x), 'y_manual': np.array(y_all)}
fig = px.scatter(x=new_x,y=fun_y(x))
#fig.add_trace(go.Scatter(x=new_x, y=pred_y, name='Statsmodel KR',  mode='lines'))
fig.add_trace(go.Scatter(x=new_x, y=np.array(Y_pred), name='Manual KR',  mode='lines'))
fires_thefts = pd.read_csv('fires_thefts.csv', names=['x','y'])
single_pt_KE
Input_x kernel_x gaussian_const gaussian_exp full_gaussian_value bw
0 39.7 -21 0.398942 -264.5 5.370560e-116 1
1 39.7 -20 0.398942 -242.0 3.174282e-106 1
2 39.7 -19 0.398942 -220.5 6.902029e-97 1
3 39.7 -18 0.398942 -200.0 5.520948e-88 1
4 39.7 -17 0.398942 -180.5 1.624636e-79 1
5 39.7 -16 0.398942 -162.0 1.758750e-71 1
6 39.7 -15 0.398942 -144.5 7.004182e-64 1
7 39.7 -14 0.398942 -128.0 1.026163e-56 1
8 39.7 -13 0.398942 -112.5 5.530710e-50 1
9 39.7 -12 0.398942 -98.0 1.096607e-43 1
10 39.7 -11 0.398942 -84.5 7.998828e-38 1
11 39.7 -10 0.398942 -72.0 2.146384e-32 1
12 39.7 -9 0.398942 -60.5 2.118819e-27 1
13 39.7 -8 0.398942 -50.0 7.694599e-23 1
14 39.7 -7 0.398942 -40.5 1.027977e-18 1
15 39.7 -6 0.398942 -32.0 5.052271e-15 1
16 39.7 -5 0.398942 -24.5 9.134720e-12 1
17 39.7 -4 0.398942 -18.0 6.075883e-09 1
18 39.7 -3 0.398942 -12.5 1.486720e-06 1
19 39.7 -2 0.398942 -8.0 1.338302e-04 1
20 39.7 -1 0.398942 -4.5 4.431848e-03 1
21 39.7 0 0.398942 -2.0 5.399097e-02 1
22 39.7 1 0.398942 -0.5 2.419707e-01 1
23 39.7 2 0.398942 -0.0 3.989423e-01 1
24 39.7 3 0.398942 -0.5 2.419707e-01 1
25 39.7 4 0.398942 -2.0 5.399097e-02 1
26 39.7 5 0.398942 -4.5 4.431848e-03 1
27 39.7 6 0.398942 -8.0 1.338302e-04 1
28 39.7 7 0.398942 -12.5 1.486720e-06 1
29 39.7 8 0.398942 -18.0 6.075883e-09 1
30 39.7 9 0.398942 -24.5 9.134720e-12 1
31 39.7 10 0.398942 -32.0 5.052271e-15 1
32 39.7 11 0.398942 -40.5 1.027977e-18 1
33 39.7 12 0.398942 -50.0 7.694599e-23 1
34 39.7 13 0.398942 -60.5 2.118819e-27 1
35 39.7 14 0.398942 -72.0 2.146384e-32 1
36 39.7 15 0.398942 -84.5 7.998828e-38 1
37 39.7 16 0.398942 -98.0 1.096607e-43 1
38 39.7 17 0.398942 -112.5 5.530710e-50 1
39 39.7 18 0.398942 -128.0 1.026163e-56 1
40 39.7 19 0.398942 -144.5 7.004182e-64 1
41 39.7 20 0.398942 -162.0 1.758750e-71 1
XXX = np.sort(np.array(fires_thefts.x))
YYY = np.array(fires_thefts.y)


bw_manual =3
kernel_x = np.arange(-20,20, 0.1)

input_x = XXX[0]
col1 = gauss_const(bw_manual)
col2 = gauss_exp(kernel_x, input_x, bw_manual)
col3 = kernel_function(bw_manual, kernel_x, input_x)


## Plotting gaussian for all input x points 
kernel_fns = {'kernel_x': kernel_x}
for input_x in XXX: 
    input_string= 'x_value_{}'.format(np.round(input_x,2)) 
    kernel_fns[input_string] = kernel_function(bw_manual, kernel_x, input_x)

kernels_df = pd.DataFrame(data=kernel_fns)
y_all = kernels_df.drop(columns='kernel_x')
px.line(kernels_df, x='kernel_x', y=y_all.columns, title='Gaussian for all input points', range_x=[-5,100])

ypred_single = single_y_pred(bw_manual, XXX[0], XXX, YYY)

Y_pred = []
for input_x in XXX: 
    w = []
    Y_single = single_y_pred(bw_manual, input_x, XXX, YYY)
    Y_pred.append(Y_single)
    

# Dataframe for a single observation point x_i. In the code x_i comes from new_x
data = {'Input_x': [input_x for x in range(col2.shape[0])],
        'kernel_x': kernel_x,
        'gaussian_const': [col1 for x in range(col2.shape[0])],
        'gaussian_exp': col2,
        'full_gaussian_value': col3,
        'bw':[bw_manual for x in range(col2.shape[0])],
        }
single_pt_KE = pd.DataFrame(data=data)
single_pt_KE


data= {'x': XXX, 'y': YYY, 'y_manual': np.array(y_all)}
fig = px.scatter(x=XXX,y=YYY)
#fig.add_trace(go.Scatter(x=new_x, y=pred_y, name='Statsmodel KR',  mode='lines'))
fig.add_trace(go.Scatter(x=XXX, y=np.array(Y_pred), name='Manual KR',  mode='lines'))