from numbers import Integral, Real

import numpy as np

from ..base import OneToOneFeatureMixin, _fit_context
from ..utils._param_validation import Interval, StrOptions
from ..utils.multiclass import type_of_target
from ..utils.validation import (
    _check_feature_names_in,
    _check_y,
    check_consistent_length,
    check_is_fitted,
)
from ._encoders import _BaseEncoder
from ._target_encoder_fast import _fit_encoding_fast, _fit_encoding_fast_auto_smooth


class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
    """Target Encoder for regression and classification targets.

    Each category is encoded based on a shrunk estimate of the average target
    values for observations belonging to the category. The encoding scheme mixes
    the global target mean with the target mean conditioned on the value of the
    category (see [MIC]_).

    When the target type is "multiclass", encodings are based
    on the conditional probability estimate for each class. The target is first
    binarized using the "one-vs-all" scheme via
    :class:`~sklearn.preprocessing.LabelBinarizer`, then the average target
    value for each class and each category is used for encoding, resulting in
    `n_features` * `n_classes` encoded output features.

    :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`,
    as another category and encodes them like any other category. Categories
    that are not seen during :meth:`fit` are encoded with the target mean, i.e.
    `target_mean_`.

    For a demo on the importance of the `TargetEncoder` internal cross-fitting,
    see
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`.
    For a comparison of different encoders, refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read
    more in the :ref:`User Guide <target_encoder>`.

    .. note::
        `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
        :term:`cross fitting` scheme is used in `fit_transform` for encoding.
        See the :ref:`User Guide <target_encoder>` for details.

    .. versionadded:: 1.3

    Parameters
    ----------
    categories : "auto" or list of shape (n_features,) of array-like, default="auto"
        Categories (unique values) per feature:

        - `"auto"` : Determine categories automatically from the training data.
        - list : `categories[i]` holds the categories expected in the i-th column. The
          passed categories should not mix strings and numeric values within a single
          feature, and should be sorted in case of numeric values.

        The used categories are stored in the `categories_` fitted attribute.

    target_type : {"auto", "continuous", "binary", "multiclass"}, default="auto"
        Type of target.

        - `"auto"` : Type of target is inferred with
          :func:`~sklearn.utils.multiclass.type_of_target`.
        - `"continuous"` : Continuous target
        - `"binary"` : Binary target
        - `"multiclass"` : Multiclass target

        .. note::
            The type of target inferred with `"auto"` may not be the desired target
            type used for modeling. For example, if the target consisted of integers
            between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target`
            will infer the target as `"multiclass"`. In this case, setting
            `target_type="continuous"` will specify the target as a regression
            problem. The `target_type_` attribute gives the target type used by the
            encoder.

        .. versionchanged:: 1.4
           Added the option 'multiclass'.

    smooth : "auto" or float, default="auto"
        The amount of mixing of the target mean conditioned on the value of the
        category with the global target mean. A larger `smooth` value will put
        more weight on the global target mean.
        If `"auto"`, then `smooth` is set to an empirical Bayes estimate.

    cv : int, default=5
        Determines the number of folds in the :term:`cross fitting` strategy used in
        :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used
        and for continuous targets, `KFold` is used.

    shuffle : bool, default=True
        Whether to shuffle the data in :meth:`fit_transform` before splitting into
        folds. Note that the samples within each split will not be shuffled.

    random_state : int, RandomState instance or None, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold. Otherwise, this
        parameter has no effect.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    encodings_ : list of shape (n_features,) or (n_features * n_classes) of \
        ndarray
        Encodings learnt on all of `X`.
        For feature `i`, `encodings_[i]` are the encodings matching the
        categories listed in `categories_[i]`. When `target_type_` is
        "multiclass", the encoding for feature `i` and class `j` is stored in
        `encodings_[j + (i * len(classes_))]`. E.g., for 2 features (f) and
        3 classes (c), encodings are ordered:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2,

    categories_ : list of shape (n_features,) of ndarray
        The categories of each input feature determined during fitting or
        specified in `categories`
        (in order of the features in `X` and corresponding with the output
        of :meth:`transform`).

    target_type_ : str
        Type of target.

    target_mean_ : float
        The overall mean of the target. This value is only used in :meth:`transform`
        to encode categories.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    classes_ : ndarray or None
        If `target_type_` is 'binary' or 'multiclass', holds the label for each class,
        otherwise `None`.

    See Also
    --------
    OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical
        features. Contrary to TargetEncoder, this encoding is not supervised.
        Treating the resulting encoding as numerical features therefore leads to
        arbitrarily ordered values and typically to lower predictive performance
        when used as preprocessing for a classifier or regressor.
    OneHotEncoder : Performs a one-hot encoding of categorical features. This
        unsupervised encoding is better suited for low cardinality categorical
        variables as it generates one new feature per unique category.

    References
    ----------
    .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
       categorical attributes in classification and prediction problems"
       SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. <10.1145/507533.507538>`

    Examples
    --------
    With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate:

    >>> import numpy as np
    >>> from sklearn.preprocessing import TargetEncoder
    >>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T
    >>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30
    >>> enc_auto = TargetEncoder(smooth="auto")
    >>> X_trans = enc_auto.fit_transform(X, y)

    >>> # A high `smooth` parameter puts more weight on global mean on the categorical
    >>> # encodings:
    >>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y)
    >>> enc_high_smooth.target_mean_
    44...
    >>> enc_high_smooth.encodings_
    [array([44..., 44..., 44...])]

    >>> # On the other hand, a low `smooth` parameter puts more weight on target
    >>> # conditioned on the value of the categorical:
    >>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y)
    >>> enc_low_smooth.encodings_
    [array([20..., 80..., 43...])]
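
    >>> # An illustrative multiclass sketch (toy data, hypothetical names):
    >>> # the output has one column per (feature, class) pair, named
    >>> # '<feature_name>_<class_name>':
    >>> X_mc = np.array([["dog"] * 3 + ["cat"] * 3], dtype=object).T
    >>> y_mc = ["a", "b", "c", "a", "b", "c"]
    >>> enc_mc = TargetEncoder(target_type="multiclass").fit(X_mc, y_mc)
    >>> enc_mc.get_feature_names_out()
    array(['x0_a', 'x0_b', 'x0_c'], dtype=object)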
    """

    _parameter_constraints: dict = {
        "categories": [StrOptions({"auto"}), list],
        "target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})],
        "smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")],
        "cv": [Interval(Integral, 2, None, closed="left")],
        "shuffle": ["boolean"],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        categories="auto",
        target_type="auto",
        smooth="auto",
        cv=5,
        shuffle=True,
        random_state=None,
    ):
        self.categories = categories
        self.smooth = smooth
        self.target_type = target_type
        self.cv = cv
        self.shuffle = shuffle
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit the :class:`TargetEncoder` to X and y.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : array-like of shape (n_samples,)
            The target data used to encode the categories.

        Returns
        -------
        self : object
            Fitted encoder.
        """
        self._fit_encodings_all(X, y)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y):
        """Fit :class:`TargetEncoder` and transform X with the target encoding.

        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
            :term:`cross fitting` scheme is used in `fit_transform` for encoding.
            See the :ref:`User Guide <target_encoder>` for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : array-like of shape (n_samples,)
            The target data used to encode the categories.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
                (n_samples, (n_features * n_classes))
            Transformed input.
        """
        from ..model_selection import KFold, StratifiedKFold  # avoid circular import

        X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y)

        # The cv splitter is voluntarily restricted to *KFold to enforce non
        # overlapping validation folds, otherwise the fit_transform output will
        # not be well-specified.
        if self.target_type_ == "continuous":
            cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state)
        else:
            cv = StratifiedKFold(
                self.cv, shuffle=self.shuffle, random_state=self.random_state
            )
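
        # A note on the cross fitting: with e.g. cv=5, the rows of each
        # validation fold are encoded using statistics computed on the other
        # four folds only, so no row is encoded with its own target value.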

        # If 'multiclass' multiply axis=1 by num classes else keep shape the same
        if self.target_type_ == "multiclass":
            X_out = np.empty(
                (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
                dtype=np.float64,
            )
        else:
            X_out = np.empty_like(X_ordinal, dtype=np.float64)

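        # In the multiclass case the columns of X_out are ordered
        # feature-major, matching `encodings_`: f0_c0, f0_c1, ..., f1_c0, ...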
        for train_idx, test_idx in cv.split(X, y):
            X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx]
            y_train_mean = np.mean(y_train, axis=0)

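            # For multiclass targets y_encoded is one-hot, so y_train_mean is
            # a vector of per-class means; otherwise it is a scalar.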
            if self.target_type_ == "multiclass":
                encodings = self._fit_encoding_multiclass(
                    X_train,
                    y_train,
                    n_categories,
                    y_train_mean,
                )
            else:
                encodings = self._fit_encoding_binary_or_continuous(
                    X_train,
                    y_train,
                    n_categories,
                    y_train_mean,
                )
            self._transform_X_ordinal(
                X_out,
                X_ordinal,
                ~X_known_mask,
                test_idx,
                encodings,
                y_train_mean,
            )
        return X_out

    def transform(self, X):
        """Transform X with the target encoding.

        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
            :term:`cross fitting` scheme is used in `fit_transform` for encoding.
            See the :ref:`User Guide <target_encoder>` for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
                (n_samples, (n_features * n_classes))
            Transformed input.
        """
        X_ordinal, X_known_mask = self._transform(
            X, handle_unknown="ignore", force_all_finite="allow-nan"
        )
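        # Categories unseen during fit are flagged in X_known_mask here and
        # encoded below with the global target mean (`target_mean_`).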

        # If 'multiclass' multiply axis=1 by num of classes else keep shape the same
        if self.target_type_ == "multiclass":
            X_out = np.empty(
                (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
                dtype=np.float64,
            )
        else:
            X_out = np.empty_like(X_ordinal, dtype=np.float64)

        self._transform_X_ordinal(
            X_out,
            X_ordinal,
            ~X_known_mask,
            slice(None),
            self.encodings_,
            self.target_mean_,
        )
        return X_out

    def _fit_encodings_all(self, X, y):
        """Fit a target encoding with all the data."""
        # avoid circular import
        from ..preprocessing import (
            LabelBinarizer,
            LabelEncoder,
        )

        check_consistent_length(X, y)
        self._fit(X, handle_unknown="ignore", force_all_finite="allow-nan")

        if self.target_type == "auto":
            accepted_target_types = ("binary", "multiclass", "continuous")
            inferred_type_of_target = type_of_target(y, input_name="y")
            if inferred_type_of_target not in accepted_target_types:
                raise ValueError(
                    "Unknown label type: Target type was inferred to be "
                    f"{inferred_type_of_target!r}. Only {accepted_target_types} are "
                    "supported."
                )
            self.target_type_ = inferred_type_of_target
        else:
            self.target_type_ = self.target_type

        self.classes_ = None
        if self.target_type_ == "binary":
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y)
            self.classes_ = label_encoder.classes_
        elif self.target_type_ == "multiclass":
            label_binarizer = LabelBinarizer()
            y = label_binarizer.fit_transform(y)
            self.classes_ = label_binarizer.classes_
        else:  # continuous
            y = _check_y(y, y_numeric=True, estimator=self)

        self.target_mean_ = np.mean(y, axis=0)

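        # For multiclass, y is now one-hot of shape (n_samples, n_classes) and
        # target_mean_ is a per-class vector. The ordinal codes computed next
        # index directly into the encoding arrays learned below.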
        X_ordinal, X_known_mask = self._transform(
            X, handle_unknown="ignore", force_all_finite="allow-nan"
        )
        n_categories = np.fromiter(
            (len(category_for_feature) for category_for_feature in self.categories_),
            dtype=np.int64,
            count=len(self.categories_),
        )
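        # n_categories[i] == len(categories_[i]); a missing value (np.nan or
        # None) seen during fit counts as its own category here.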
        if self.target_type_ == "multiclass":
            encodings = self._fit_encoding_multiclass(
                X_ordinal,
                y,
                n_categories,
                self.target_mean_,
            )
        else:
            encodings = self._fit_encoding_binary_or_continuous(
                X_ordinal,
                y,
                n_categories,
                self.target_mean_,
            )
        self.encodings_ = encodings

        return X_ordinal, X_known_mask, y, n_categories

    def _fit_encoding_binary_or_continuous(
        self, X_ordinal, y, n_categories, target_mean
    ):
        """Learn target encodings."""
        if self.smooth == "auto":
            y_variance = np.var(y)
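            # Roughly, the Cython helper applies an empirical Bayes shrinkage
            # (see [MIC]): each category mean is blended with target_mean,
            # with a weight that grows with the category count and shrinks as
            # the within-category variance grows relative to y_variance.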
            encodings = _fit_encoding_fast_auto_smooth(
                X_ordinal,
                y,
                n_categories,
                target_mean,
                y_variance,
            )
        else:
            encodings = _fit_encoding_fast(
                X_ordinal,
                y,
                n_categories,
                self.smooth,
                target_mean,
            )
        return encodings

    def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
        """Learn multiclass encodings.

        Learn encodings for each class (c) then reorder encodings such that
        the same features (f) are grouped together. `reorder_index` enables
        converting from:
        f0_c0, f1_c0, f0_c1, f1_c1, f0_c2, f1_c2
        to:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2
        """
        n_features = self.n_features_in_
        n_classes = len(self.classes_)

        encodings = []
        for i in range(n_classes):
            y_class = y[:, i]
            encoding = self._fit_encoding_binary_or_continuous(
                X_ordinal,
                y_class,
                n_categories,
                target_mean[i],
            )
            encodings.extend(encoding)

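        # Worked example: for 2 features and 3 classes the generator yields
        # 0, 2, 4, 1, 3, 5, turning the class-major list
        # [f0_c0, f1_c0, f0_c1, f1_c1, f0_c2, f1_c2] into the feature-major
        # [f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2].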
        reorder_index = (
            idx
            for start in range(n_features)
            for idx in range(start, (n_classes * n_features), n_features)
        )
        return [encodings[idx] for idx in reorder_index]

    def _transform_X_ordinal(
        self,
        X_out,
        X_ordinal,
        X_unknown_mask,
        row_indices,
        encodings,
        target_mean,
    ):
        """Transform X_ordinal using encodings.

        In the multiclass case, `X_ordinal` and `X_unknown_mask` have column
        (axis=1) size `n_features`, while `encodings` has length of size
        `n_features * n_classes`. `feat_idx` deals with this by repeating
        feature indices by `n_classes`. E.g., for 3 features, 2 classes:
        0,0,1,1,2,2

        Additionally, `target_mean` is of shape (`n_classes`,) so `mean_idx`
        cycles through 0 to `n_classes` - 1, `n_features` times.
        """
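        # e.g. 3 features, 2 classes: e_idx 0..5 maps to feat_idx 0,0,1,1,2,2
        # and mean_idx 0,1,0,1,0,1.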
        if self.target_type_ == "multiclass":
            n_classes = len(self.classes_)
            for e_idx, encoding in enumerate(encodings):
                # Repeat feature indices by n_classes
                feat_idx = e_idx // n_classes
                # Cycle through each class
                mean_idx = e_idx % n_classes
                X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, feat_idx]]
                X_out[X_unknown_mask[:, feat_idx], e_idx] = target_mean[mean_idx]
        else:
            for e_idx, encoding in enumerate(encodings):
                X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, e_idx]]
                X_out[X_unknown_mask[:, e_idx], e_idx] = target_mean

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names. `feature_names_in_` is used unless it is
            not defined, in which case the following input feature names are
            generated: `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            When `target_type_` is "multiclass", the names are of the format
            '<feature_name>_<class_name>'.
        """
        check_is_fitted(self, "n_features_in_")
        feature_names = _check_feature_names_in(self, input_features)
        if self.target_type_ == "multiclass":
            feature_names = [
                f"{feature_name}_{class_name}"
                for feature_name in feature_names
                for class_name in self.classes_
            ]
            return np.asarray(feature_names, dtype=object)
        else:
            return feature_names

    def _more_tags(self):
        return {
            "requires_y": True,
        }