from libc.math cimport isnan

from libcpp.vector cimport vector

from ..utils._typedefs cimport float32_t, float64_t, int32_t, int64_t

import numpy as np
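

# Fused types: each fit function below is specialized for 32-/64-bit integer
# category codes in X_int and for integer or floating-point targets in y.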
ctypedef fused INT_DTYPE:
    int64_t
    int32_t


ctypedef fused Y_DTYPE:
    int64_t
    int32_t
    float64_t
    float32_t


def _fit_encoding_fast(
    INT_DTYPE[:, ::1] X_int,
    const Y_DTYPE[:] y,
    int64_t[::1] n_categories,
    double smooth,
    double y_mean,
):
    """Fit a target encoding on X_int and y.

    This implementation uses Eq 7 from [1] to compute the encoding.
    As stated in the paper, Eq 7 is the same as Eq 3.
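
    Concretely, the encoding for category ``i`` of a feature is::

        (sum of y over samples in category i + smooth * y_mean) / (n_i + smooth)

    where ``n_i`` is the number of samples observed in category ``i``, i.e.
    the per-category target mean shrunk towards the global mean ``y_mean``.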

    [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
         categorical attributes in classification and prediction problems"
    """
    cdef:
        int64_t sample_idx, feat_idx, cat_idx, n_cats
        INT_DTYPE X_int_tmp
        int n_samples = X_int.shape[0]
        int n_features = X_int.shape[1]
        double smooth_sum = smooth * y_mean
        int64_t max_n_cats = np.max(n_categories)
        double[::1] sums = np.empty(max_n_cats, dtype=np.float64)
        double[::1] counts = np.empty(max_n_cats, dtype=np.float64)
        list encodings = []
        double[::1] current_encoding
        # Gives access to encodings without gil
        vector[double*] encoding_vec

    encoding_vec.resize(n_features)
    for feat_idx in range(n_features):
        current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64)
        encoding_vec[feat_idx] = &current_encoding[0]
        encodings.append(np.asarray(current_encoding))

    with nogil:
        for feat_idx in range(n_features):
            n_cats = n_categories[feat_idx]
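
            # Seed the accumulators with the smoothing terms: equivalent to
            # adding `smooth` virtual samples with target value `y_mean` to
            # every category.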
            for cat_idx in range(n_cats):
                sums[cat_idx] = smooth_sum
                counts[cat_idx] = smooth

            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                # -1 are unknown categories, which are not counted
                if X_int_tmp == -1:
                    continue
                sums[X_int_tmp] += y[sample_idx]
                counts[X_int_tmp] += 1.0
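
            # counts[cat_idx] can only be 0 when smooth == 0 and the category
            # was never observed; fall back to the global mean in that case.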
            for cat_idx in range(n_cats):
                if counts[cat_idx] == 0:
                    encoding_vec[feat_idx][cat_idx] = y_mean
                else:
                    encoding_vec[feat_idx][cat_idx] = sums[cat_idx] / counts[cat_idx]

    return encodings


def _fit_encoding_fast_auto_smooth(
    INT_DTYPE[:, ::1] X_int,
    const Y_DTYPE[:] y,
    int64_t[::1] n_categories,
    double y_mean,
    double y_variance,
):
    """Fit a target encoding on X_int and y with auto smoothing.

    This implementation uses Eq 5 and 6 from [1].
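
    Concretely, the encoding for category ``i`` of a feature is::

        lambda_i * mean_i + (1 - lambda_i) * y_mean

    where ``mean_i`` is the target mean over the samples in category ``i`` and
    the shrinkage factor is::

        lambda_i = n_i * y_variance / (n_i * y_variance + sigma_i^2)

    with ``n_i`` the number of samples in category ``i`` and ``sigma_i^2`` the
    within-category variance of ``y``.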

    [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
         categorical attributes in classification and prediction problems"
    """
    cdef:
        int64_t sample_idx, feat_idx, cat_idx, n_cats
        INT_DTYPE X_int_tmp
        double diff
        int n_samples = X_int.shape[0]
        int n_features = X_int.shape[1]
        int64_t max_n_cats = np.max(n_categories)
        double[::1] means = np.empty(max_n_cats, dtype=np.float64)
        int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64)
        double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64)
        double lambda_
        list encodings = []
        double[::1] current_encoding
        # Gives access to encodings without gil
        vector[double*] encoding_vec

    encoding_vec.resize(n_features)
    for feat_idx in range(n_features):
        current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64)
        encoding_vec[feat_idx] = &current_encoding[0]
        encodings.append(np.asarray(current_encoding))

    # TODO: parallelize this with OpenMP prange. When n_features >= n_threads, it's
    # probably good to parallelize the outer loop. When n_features is too small,
    # it would probably be better to parallelize the nested loops on n_samples and
    # n_cats, but the code to handle thread-local temporary variables might be
    # significantly more complex.
    with nogil:
        for feat_idx in range(n_features):
            n_cats = n_categories[feat_idx]

            for cat_idx in range(n_cats):
                means[cat_idx] = 0.0
                counts[cat_idx] = 0
                sum_of_squared_diffs[cat_idx] = 0.0

            # first pass to compute the mean
            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]

                # -1 are unknown categories, which are not counted
                if X_int_tmp == -1:
                    continue
                counts[X_int_tmp] += 1
                means[X_int_tmp] += y[sample_idx]
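
            # For a category with no observed samples this is 0.0 / 0, which
            # yields NaN; the NaN propagates into lambda_ below and is then
            # replaced by the global mean via the isnan check.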
            for cat_idx in range(n_cats):
                means[cat_idx] /= counts[cat_idx]

            # second pass to compute the sum of squared differences
            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                if X_int_tmp == -1:
                    continue
                diff = y[sample_idx] - means[X_int_tmp]
                sum_of_squared_diffs[X_int_tmp] += diff * diff
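
            # Shrinkage factor: grows with the category count and shrinks as
            # the within-category variance (sum_of_squared_diffs / counts)
            # grows relative to the global variance of y.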
            for cat_idx in range(n_cats):
                lambda_ = (
                    y_variance * counts[cat_idx] /
                    (y_variance * counts[cat_idx] +
                     sum_of_squared_diffs[cat_idx] / counts[cat_idx])
                )
                if isnan(lambda_):
                    # A nan can happen when:
                    # 1. counts[cat_idx] == 0
                    # 2. y_variance == 0 and sum_of_squared_diffs[cat_idx] == 0
                    encoding_vec[feat_idx][cat_idx] = y_mean
                else:
                    encoding_vec[feat_idx][cat_idx] = (
                        lambda_ * means[cat_idx] + (1 - lambda_) * y_mean
                    )

    return encodings
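

# Usage sketch (assumes this module is compiled as
# ``sklearn.preprocessing._target_encoder_fast``, inferred from the relative
# imports above; adjust the import to the actual build layout):
#
#     import numpy as np
#     from sklearn.preprocessing._target_encoder_fast import _fit_encoding_fast
#
#     X_int = np.array([[0, 1], [1, 0], [1, 1]], dtype=np.int64)
#     y = np.array([1.0, 0.0, 1.0], dtype=np.float64)
#     n_categories = np.array([2, 2], dtype=np.int64)
#     encodings = _fit_encoding_fast(X_int, y, n_categories, smooth=1.0,
#                                    y_mean=float(y.mean()))
#
# ``encodings`` is a list with one float64 array per feature, holding the
# encoded value for each category of that feature.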