1655 lines
64 KiB
Python
1655 lines
64 KiB
Python
"""
|
|
The :mod:`sklearn.compose._column_transformer` module implements utilities
|
|
to work with heterogeneous data and to apply different transformers to
|
|
different columns.
|
|
"""
|
|
|
|
# Author: Andreas Mueller
|
|
# Joris Van den Bossche
|
|
# License: BSD
|
|
import warnings
|
|
from collections import Counter, UserList
|
|
from itertools import chain
|
|
from numbers import Integral, Real
|
|
|
|
import numpy as np
|
|
from scipy import sparse
|
|
|
|
from ..base import TransformerMixin, _fit_context, clone
|
|
from ..pipeline import _fit_transform_one, _name_estimators, _transform_one
|
|
from ..preprocessing import FunctionTransformer
|
|
from ..utils import Bunch
|
|
from ..utils._estimator_html_repr import _VisualBlock
|
|
from ..utils._indexing import _determine_key_type, _get_column_indices
|
|
from ..utils._metadata_requests import METHODS
|
|
from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions
|
|
from ..utils._set_output import (
|
|
_get_container_adapter,
|
|
_get_output_config,
|
|
_safe_set_output,
|
|
)
|
|
from ..utils.metadata_routing import (
|
|
MetadataRouter,
|
|
MethodMapping,
|
|
_raise_for_params,
|
|
_routing_enabled,
|
|
process_routing,
|
|
)
|
|
from ..utils.metaestimators import _BaseComposition
|
|
from ..utils.parallel import Parallel, delayed
|
|
from ..utils.validation import (
|
|
_check_feature_names_in,
|
|
_get_feature_names,
|
|
_is_pandas_df,
|
|
_num_samples,
|
|
check_array,
|
|
check_is_fitted,
|
|
)
|
|
|
|
__all__ = ["ColumnTransformer", "make_column_transformer", "make_column_selector"]
|
|
|
|
|
|
_ERR_MSG_1DCOLUMN = (
|
|
"1D data passed to a transformer that expects 2D data. "
|
|
"Try to specify the column selection as a list of one "
|
|
"item instead of a scalar."
|
|
)
|
|
|
|
|
|
class ColumnTransformer(TransformerMixin, _BaseComposition):
|
|
"""Applies transformers to columns of an array or pandas DataFrame.
|
|
|
|
This estimator allows different columns or column subsets of the input
|
|
to be transformed separately and the features generated by each transformer
|
|
will be concatenated to form a single feature space.
|
|
This is useful for heterogeneous or columnar data, to combine several
|
|
feature extraction mechanisms or transformations into a single transformer.
|
|
|
|
Read more in the :ref:`User Guide <column_transformer>`.
|
|
|
|
.. versionadded:: 0.20
|
|
|
|
Parameters
|
|
----------
|
|
transformers : list of tuples
|
|
List of (name, transformer, columns) tuples specifying the
|
|
transformer objects to be applied to subsets of the data.
|
|
|
|
name : str
|
|
Like in Pipeline and FeatureUnion, this allows the transformer and
|
|
its parameters to be set using ``set_params`` and searched in grid
|
|
search.
|
|
transformer : {'drop', 'passthrough'} or estimator
|
|
Estimator must support :term:`fit` and :term:`transform`.
|
|
Special-cased strings 'drop' and 'passthrough' are accepted as
|
|
well, to indicate to drop the columns or to pass them through
|
|
untransformed, respectively.
|
|
columns : str, array-like of str, int, array-like of int, \
|
|
array-like of bool, slice or callable
|
|
Indexes the data on its second axis. Integers are interpreted as
|
|
positional columns, while strings can reference DataFrame columns
|
|
by name. A scalar string or int should be used where
|
|
``transformer`` expects X to be a 1d array-like (vector),
|
|
otherwise a 2d array will be passed to the transformer.
|
|
A callable is passed the input data `X` and can return any of the
|
|
above. To select multiple columns by name or dtype, you can use
|
|
:obj:`make_column_selector`.
|
|
|
|
remainder : {'drop', 'passthrough'} or estimator, default='drop'
|
|
By default, only the specified columns in `transformers` are
|
|
transformed and combined in the output, and the non-specified
|
|
columns are dropped. (default of ``'drop'``).
|
|
By specifying ``remainder='passthrough'``, all remaining columns that
|
|
were not specified in `transformers`, but present in the data passed
|
|
to `fit` will be automatically passed through. This subset of columns
|
|
is concatenated with the output of the transformers. For dataframes,
|
|
extra columns not seen during `fit` will be excluded from the output
|
|
of `transform`.
|
|
By setting ``remainder`` to be an estimator, the remaining
|
|
non-specified columns will use the ``remainder`` estimator. The
|
|
estimator must support :term:`fit` and :term:`transform`.
|
|
Note that using this feature requires that the DataFrame columns
|
|
input at :term:`fit` and :term:`transform` have identical order.
|
|
|
|
sparse_threshold : float, default=0.3
|
|
If the output of the different transformers contains sparse matrices,
|
|
these will be stacked as a sparse matrix if the overall density is
|
|
lower than this value. Use ``sparse_threshold=0`` to always return
|
|
dense. When the transformed output consists of all dense data, the
|
|
stacked result will be dense, and this keyword will be ignored.
|
|
|
|
n_jobs : int, default=None
|
|
Number of jobs to run in parallel.
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|
for more details.
|
|
|
|
transformer_weights : dict, default=None
|
|
Multiplicative weights for features per transformer. The output of the
|
|
transformer is multiplied by these weights. Keys are transformer names,
|
|
values the weights.
|
|
|
|
verbose : bool, default=False
|
|
If True, the time elapsed while fitting each transformer will be
|
|
printed as it is completed.
|
|
|
|
verbose_feature_names_out : bool, default=True
|
|
If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix
|
|
all feature names with the name of the transformer that generated that
|
|
feature.
|
|
If False, :meth:`ColumnTransformer.get_feature_names_out` will not
|
|
prefix any feature names and will error if feature names are not
|
|
unique.
|
|
|
|
.. versionadded:: 1.0
|
|
|
|
force_int_remainder_cols : bool, default=True
|
|
Force the columns of the last entry of `transformers_`, which
|
|
corresponds to the "remainder" transformer, to always be stored as
|
|
indices (int) rather than column names (str). See description of the
|
|
`transformers_` attribute for details.
|
|
|
|
.. note::
|
|
If you do not access the list of columns for the remainder columns
|
|
in the `transformers_` fitted attribute, you do not need to set
|
|
this parameter.
|
|
|
|
.. versionadded:: 1.5
|
|
|
|
.. versionchanged:: 1.7
|
|
The default value for `force_int_remainder_cols` will change from
|
|
`True` to `False` in version 1.7.
|
|
|
|
Attributes
|
|
----------
|
|
transformers_ : list
|
|
The collection of fitted transformers as tuples of (name,
|
|
fitted_transformer, column). `fitted_transformer` can be an estimator,
|
|
or `'drop'`; `'passthrough'` is replaced with an equivalent
|
|
:class:`~sklearn.preprocessing.FunctionTransformer`. In case there were
|
|
no columns selected, this will be the unfitted transformer. If there
|
|
are remaining columns, the final element is a tuple of the form:
|
|
('remainder', transformer, remaining_columns) corresponding to the
|
|
``remainder`` parameter. If there are remaining columns, then
|
|
``len(transformers_)==len(transformers)+1``, otherwise
|
|
``len(transformers_)==len(transformers)``.
|
|
|
|
.. versionchanged:: 1.5
|
|
If there are remaining columns and `force_int_remainder_cols` is
|
|
True, the remaining columns are always represented by their
|
|
positional indices in the input `X` (as in older versions). If
|
|
`force_int_remainder_cols` is False, the format attempts to match
|
|
that of the other transformers: if all columns were provided as
|
|
column names (`str`), the remaining columns are stored as column
|
|
names; if all columns were provided as mask arrays (`bool`), so are
|
|
the remaining columns; in all other cases the remaining columns are
|
|
stored as indices (`int`).
|
|
|
|
named_transformers_ : :class:`~sklearn.utils.Bunch`
|
|
Read-only attribute to access any transformer by given name.
|
|
Keys are transformer names and values are the fitted transformer
|
|
objects.
|
|
|
|
sparse_output_ : bool
|
|
Boolean flag indicating whether the output of ``transform`` is a
|
|
sparse matrix or a dense numpy array, which depends on the output
|
|
of the individual transformers and the `sparse_threshold` keyword.
|
|
|
|
output_indices_ : dict
|
|
A dictionary from each transformer name to a slice, where the slice
|
|
corresponds to indices in the transformed output. This is useful to
|
|
inspect which transformer is responsible for which transformed
|
|
feature(s).
|
|
|
|
.. versionadded:: 1.0
|
|
|
|
n_features_in_ : int
|
|
Number of features seen during :term:`fit`. Only defined if the
|
|
underlying transformers expose such an attribute when fit.
|
|
|
|
.. versionadded:: 0.24
|
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
|
Names of features seen during :term:`fit`. Defined only when `X`
|
|
has feature names that are all strings.
|
|
|
|
.. versionadded:: 1.0
|
|
|
|
See Also
|
|
--------
|
|
make_column_transformer : Convenience function for
|
|
combining the outputs of multiple transformer objects applied to
|
|
column subsets of the original feature space.
|
|
make_column_selector : Convenience function for selecting
|
|
columns based on datatype or the columns name with a regex pattern.
|
|
|
|
Notes
|
|
-----
|
|
The order of the columns in the transformed feature matrix follows the
|
|
order of how the columns are specified in the `transformers` list.
|
|
Columns of the original feature matrix that are not specified are
|
|
dropped from the resulting transformed feature matrix, unless specified
|
|
in the `passthrough` keyword. Those columns specified with `passthrough`
|
|
are added at the right to the output of the transformers.
|
|
|
|
Examples
|
|
--------
|
|
>>> import numpy as np
|
|
>>> from sklearn.compose import ColumnTransformer
|
|
>>> from sklearn.preprocessing import Normalizer
|
|
>>> ct = ColumnTransformer(
|
|
... [("norm1", Normalizer(norm='l1'), [0, 1]),
|
|
... ("norm2", Normalizer(norm='l1'), slice(2, 4))])
|
|
>>> X = np.array([[0., 1., 2., 2.],
|
|
... [1., 1., 0., 1.]])
|
|
>>> # Normalizer scales each row of X to unit norm. A separate scaling
|
|
>>> # is applied for the two first and two last elements of each
|
|
>>> # row independently.
|
|
>>> ct.fit_transform(X)
|
|
array([[0. , 1. , 0.5, 0.5],
|
|
[0.5, 0.5, 0. , 1. ]])
|
|
|
|
:class:`ColumnTransformer` can be configured with a transformer that requires
|
|
a 1d array by setting the column to a string:
|
|
|
|
>>> from sklearn.feature_extraction.text import CountVectorizer
|
|
>>> from sklearn.preprocessing import MinMaxScaler
|
|
>>> import pandas as pd # doctest: +SKIP
|
|
>>> X = pd.DataFrame({
|
|
... "documents": ["First item", "second one here", "Is this the last?"],
|
|
... "width": [3, 4, 5],
|
|
... }) # doctest: +SKIP
|
|
>>> # "documents" is a string which configures ColumnTransformer to
|
|
>>> # pass the documents column as a 1d array to the CountVectorizer
|
|
>>> ct = ColumnTransformer(
|
|
... [("text_preprocess", CountVectorizer(), "documents"),
|
|
... ("num_preprocess", MinMaxScaler(), ["width"])])
|
|
>>> X_trans = ct.fit_transform(X) # doctest: +SKIP
|
|
|
|
For a more detailed example of usage, see
|
|
:ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`.
|
|
"""
|
|
|
|
_required_parameters = ["transformers"]
|
|
|
|
_parameter_constraints: dict = {
|
|
"transformers": [list, Hidden(tuple)],
|
|
"remainder": [
|
|
StrOptions({"drop", "passthrough"}),
|
|
HasMethods(["fit", "transform"]),
|
|
HasMethods(["fit_transform", "transform"]),
|
|
],
|
|
"sparse_threshold": [Interval(Real, 0, 1, closed="both")],
|
|
"n_jobs": [Integral, None],
|
|
"transformer_weights": [dict, None],
|
|
"verbose": ["verbose"],
|
|
"verbose_feature_names_out": ["boolean"],
|
|
"force_int_remainder_cols": ["boolean"],
|
|
}
|
|
|
|
def __init__(
|
|
self,
|
|
transformers,
|
|
*,
|
|
remainder="drop",
|
|
sparse_threshold=0.3,
|
|
n_jobs=None,
|
|
transformer_weights=None,
|
|
verbose=False,
|
|
verbose_feature_names_out=True,
|
|
force_int_remainder_cols=True,
|
|
):
|
|
self.transformers = transformers
|
|
self.remainder = remainder
|
|
self.sparse_threshold = sparse_threshold
|
|
self.n_jobs = n_jobs
|
|
self.transformer_weights = transformer_weights
|
|
self.verbose = verbose
|
|
self.verbose_feature_names_out = verbose_feature_names_out
|
|
self.force_int_remainder_cols = force_int_remainder_cols
|
|
|
|
@property
|
|
def _transformers(self):
|
|
"""
|
|
Internal list of transformer only containing the name and
|
|
transformers, dropping the columns.
|
|
|
|
DO NOT USE: This is for the implementation of get_params via
|
|
BaseComposition._get_params which expects lists of tuples of len 2.
|
|
|
|
To iterate through the transformers, use ``self._iter`` instead.
|
|
"""
|
|
try:
|
|
return [(name, trans) for name, trans, _ in self.transformers]
|
|
except (TypeError, ValueError):
|
|
return self.transformers
|
|
|
|
@_transformers.setter
|
|
def _transformers(self, value):
|
|
"""DO NOT USE: This is for the implementation of set_params via
|
|
BaseComposition._get_params which gives lists of tuples of len 2.
|
|
"""
|
|
try:
|
|
self.transformers = [
|
|
(name, trans, col)
|
|
for ((name, trans), (_, _, col)) in zip(value, self.transformers)
|
|
]
|
|
except (TypeError, ValueError):
|
|
self.transformers = value
|
|
|
|
def set_output(self, *, transform=None):
|
|
"""Set the output container when `"transform"` and `"fit_transform"` are called.
|
|
|
|
Calling `set_output` will set the output of all estimators in `transformers`
|
|
and `transformers_`.
|
|
|
|
Parameters
|
|
----------
|
|
transform : {"default", "pandas", "polars"}, default=None
|
|
Configure output of `transform` and `fit_transform`.
|
|
|
|
- `"default"`: Default output format of a transformer
|
|
- `"pandas"`: DataFrame output
|
|
- `"polars"`: Polars output
|
|
- `None`: Transform configuration is unchanged
|
|
|
|
.. versionadded:: 1.4
|
|
`"polars"` option was added.
|
|
|
|
Returns
|
|
-------
|
|
self : estimator instance
|
|
Estimator instance.
|
|
"""
|
|
super().set_output(transform=transform)
|
|
|
|
transformers = (
|
|
trans
|
|
for _, trans, _ in chain(
|
|
self.transformers, getattr(self, "transformers_", [])
|
|
)
|
|
if trans not in {"passthrough", "drop"}
|
|
)
|
|
for trans in transformers:
|
|
_safe_set_output(trans, transform=transform)
|
|
|
|
if self.remainder not in {"passthrough", "drop"}:
|
|
_safe_set_output(self.remainder, transform=transform)
|
|
|
|
return self
|
|
|
|
def get_params(self, deep=True):
|
|
"""Get parameters for this estimator.
|
|
|
|
Returns the parameters given in the constructor as well as the
|
|
estimators contained within the `transformers` of the
|
|
`ColumnTransformer`.
|
|
|
|
Parameters
|
|
----------
|
|
deep : bool, default=True
|
|
If True, will return the parameters for this estimator and
|
|
contained subobjects that are estimators.
|
|
|
|
Returns
|
|
-------
|
|
params : dict
|
|
Parameter names mapped to their values.
|
|
"""
|
|
return self._get_params("_transformers", deep=deep)
|
|
|
|
def set_params(self, **kwargs):
|
|
"""Set the parameters of this estimator.
|
|
|
|
Valid parameter keys can be listed with ``get_params()``. Note that you
|
|
can directly set the parameters of the estimators contained in
|
|
`transformers` of `ColumnTransformer`.
|
|
|
|
Parameters
|
|
----------
|
|
**kwargs : dict
|
|
Estimator parameters.
|
|
|
|
Returns
|
|
-------
|
|
self : ColumnTransformer
|
|
This estimator.
|
|
"""
|
|
self._set_params("_transformers", **kwargs)
|
|
return self
|
|
|
|
def _iter(self, fitted, column_as_labels, skip_drop, skip_empty_columns):
|
|
"""
|
|
Generate (name, trans, columns, weight) tuples.
|
|
|
|
|
|
Parameters
|
|
----------
|
|
fitted : bool
|
|
If True, use the fitted transformers (``self.transformers_``) to
|
|
iterate through transformers, else use the transformers passed by
|
|
the user (``self.transformers``).
|
|
|
|
column_as_labels : bool
|
|
If True, columns are returned as string labels. If False, columns
|
|
are returned as they were given by the user. This can only be True
|
|
if the ``ColumnTransformer`` is already fitted.
|
|
|
|
skip_drop : bool
|
|
If True, 'drop' transformers are filtered out.
|
|
|
|
skip_empty_columns : bool
|
|
If True, transformers with empty selected columns are filtered out.
|
|
|
|
Yields
|
|
------
|
|
A generator of tuples containing:
|
|
- name : the name of the transformer
|
|
- transformer : the transformer object
|
|
- columns : the columns for that transformer
|
|
- weight : the weight of the transformer
|
|
"""
|
|
if fitted:
|
|
transformers = self.transformers_
|
|
else:
|
|
# interleave the validated column specifiers
|
|
transformers = [
|
|
(name, trans, column)
|
|
for (name, trans, _), column in zip(self.transformers, self._columns)
|
|
]
|
|
# add transformer tuple for remainder
|
|
if self._remainder[2]:
|
|
transformers = chain(transformers, [self._remainder])
|
|
|
|
# We want the warning about the future change of the remainder
|
|
# columns dtype to be shown only when a user accesses them
|
|
# directly, not when they are used by the ColumnTransformer itself.
|
|
# We disable warnings here; they are enabled when setting
|
|
# self.transformers_.
|
|
transformers = _with_dtype_warning_enabled_set_to(False, transformers)
|
|
|
|
get_weight = (self.transformer_weights or {}).get
|
|
|
|
for name, trans, columns in transformers:
|
|
if skip_drop and trans == "drop":
|
|
continue
|
|
if skip_empty_columns and _is_empty_column_selection(columns):
|
|
continue
|
|
|
|
if column_as_labels:
|
|
# Convert all columns to using their string labels
|
|
columns_is_scalar = np.isscalar(columns)
|
|
|
|
indices = self._transformer_to_input_indices[name]
|
|
columns = self.feature_names_in_[indices]
|
|
|
|
if columns_is_scalar:
|
|
# selection is done with one dimension
|
|
columns = columns[0]
|
|
|
|
yield (name, trans, columns, get_weight(name))
|
|
|
|
def _validate_transformers(self):
|
|
"""Validate names of transformers and the transformers themselves.
|
|
|
|
This checks whether given transformers have the required methods, i.e.
|
|
`fit` or `fit_transform` and `transform` implemented.
|
|
"""
|
|
if not self.transformers:
|
|
return
|
|
|
|
names, transformers, _ = zip(*self.transformers)
|
|
|
|
# validate names
|
|
self._validate_names(names)
|
|
|
|
# validate estimators
|
|
for t in transformers:
|
|
if t in ("drop", "passthrough"):
|
|
continue
|
|
if not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not hasattr(
|
|
t, "transform"
|
|
):
|
|
# Used to validate the transformers in the `transformers` list
|
|
raise TypeError(
|
|
"All estimators should implement fit and "
|
|
"transform, or can be 'drop' or 'passthrough' "
|
|
"specifiers. '%s' (type %s) doesn't." % (t, type(t))
|
|
)
|
|
|
|
def _validate_column_callables(self, X):
|
|
"""
|
|
Converts callable column specifications.
|
|
|
|
This stores a dictionary of the form `{step_name: column_indices}` and
|
|
calls the `columns` on `X` if `columns` is a callable for a given
|
|
transformer.
|
|
|
|
The results are then stored in `self._transformer_to_input_indices`.
|
|
"""
|
|
all_columns = []
|
|
transformer_to_input_indices = {}
|
|
for name, _, columns in self.transformers:
|
|
if callable(columns):
|
|
columns = columns(X)
|
|
all_columns.append(columns)
|
|
transformer_to_input_indices[name] = _get_column_indices(X, columns)
|
|
|
|
self._columns = all_columns
|
|
self._transformer_to_input_indices = transformer_to_input_indices
|
|
|
|
def _validate_remainder(self, X):
|
|
"""
|
|
Validates ``remainder`` and defines ``_remainder`` targeting
|
|
the remaining columns.
|
|
"""
|
|
cols = set(chain(*self._transformer_to_input_indices.values()))
|
|
remaining = sorted(set(range(self.n_features_in_)) - cols)
|
|
self._transformer_to_input_indices["remainder"] = remaining
|
|
remainder_cols = self._get_remainder_cols(remaining)
|
|
self._remainder = ("remainder", self.remainder, remainder_cols)
|
|
|
|
def _get_remainder_cols_dtype(self):
|
|
try:
|
|
all_dtypes = {_determine_key_type(c) for (*_, c) in self.transformers}
|
|
if len(all_dtypes) == 1:
|
|
return next(iter(all_dtypes))
|
|
except ValueError:
|
|
# _determine_key_type raises a ValueError if some transformer
|
|
# columns are Callables
|
|
return "int"
|
|
return "int"
|
|
|
|
def _get_remainder_cols(self, indices):
|
|
dtype = self._get_remainder_cols_dtype()
|
|
if self.force_int_remainder_cols and dtype != "int":
|
|
return _RemainderColsList(indices, future_dtype=dtype)
|
|
if dtype == "str":
|
|
return list(self.feature_names_in_[indices])
|
|
if dtype == "bool":
|
|
return [i in indices for i in range(self.n_features_in_)]
|
|
return indices
|
|
|
|
@property
|
|
def named_transformers_(self):
|
|
"""Access the fitted transformer by name.
|
|
|
|
Read-only attribute to access any transformer by given name.
|
|
Keys are transformer names and values are the fitted transformer
|
|
objects.
|
|
"""
|
|
# Use Bunch object to improve autocomplete
|
|
return Bunch(**{name: trans for name, trans, _ in self.transformers_})
|
|
|
|
def _get_feature_name_out_for_transformer(self, name, trans, feature_names_in):
|
|
"""Gets feature names of transformer.
|
|
|
|
Used in conjunction with self._iter(fitted=True) in get_feature_names_out.
|
|
"""
|
|
column_indices = self._transformer_to_input_indices[name]
|
|
names = feature_names_in[column_indices]
|
|
# An actual transformer
|
|
if not hasattr(trans, "get_feature_names_out"):
|
|
raise AttributeError(
|
|
f"Transformer {name} (type {type(trans).__name__}) does "
|
|
"not provide get_feature_names_out."
|
|
)
|
|
return trans.get_feature_names_out(names)
|
|
|
|
def get_feature_names_out(self, input_features=None):
|
|
"""Get output feature names for transformation.
|
|
|
|
Parameters
|
|
----------
|
|
input_features : array-like of str or None, default=None
|
|
Input features.
|
|
|
|
- If `input_features` is `None`, then `feature_names_in_` is
|
|
used as feature names in. If `feature_names_in_` is not defined,
|
|
then the following input feature names are generated:
|
|
`["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
|
|
- If `input_features` is an array-like, then `input_features` must
|
|
match `feature_names_in_` if `feature_names_in_` is defined.
|
|
|
|
Returns
|
|
-------
|
|
feature_names_out : ndarray of str objects
|
|
Transformed feature names.
|
|
"""
|
|
check_is_fitted(self)
|
|
input_features = _check_feature_names_in(self, input_features)
|
|
|
|
# List of tuples (name, feature_names_out)
|
|
transformer_with_feature_names_out = []
|
|
for name, trans, *_ in self._iter(
|
|
fitted=True,
|
|
column_as_labels=False,
|
|
skip_empty_columns=True,
|
|
skip_drop=True,
|
|
):
|
|
feature_names_out = self._get_feature_name_out_for_transformer(
|
|
name, trans, input_features
|
|
)
|
|
if feature_names_out is None:
|
|
continue
|
|
transformer_with_feature_names_out.append((name, feature_names_out))
|
|
|
|
if not transformer_with_feature_names_out:
|
|
# No feature names
|
|
return np.array([], dtype=object)
|
|
|
|
return self._add_prefix_for_feature_names_out(
|
|
transformer_with_feature_names_out
|
|
)
|
|
|
|
def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out):
|
|
"""Add prefix for feature names out that includes the transformer names.
|
|
|
|
Parameters
|
|
----------
|
|
transformer_with_feature_names_out : list of tuples of (str, array-like of str)
|
|
The tuple consistent of the transformer's name and its feature names out.
|
|
|
|
Returns
|
|
-------
|
|
feature_names_out : ndarray of shape (n_features,), dtype=str
|
|
Transformed feature names.
|
|
"""
|
|
if self.verbose_feature_names_out:
|
|
# Prefix the feature names out with the transformers name
|
|
names = list(
|
|
chain.from_iterable(
|
|
(f"{name}__{i}" for i in feature_names_out)
|
|
for name, feature_names_out in transformer_with_feature_names_out
|
|
)
|
|
)
|
|
return np.asarray(names, dtype=object)
|
|
|
|
# verbose_feature_names_out is False
|
|
# Check that names are all unique without a prefix
|
|
feature_names_count = Counter(
|
|
chain.from_iterable(s for _, s in transformer_with_feature_names_out)
|
|
)
|
|
top_6_overlap = [
|
|
name for name, count in feature_names_count.most_common(6) if count > 1
|
|
]
|
|
top_6_overlap.sort()
|
|
if top_6_overlap:
|
|
if len(top_6_overlap) == 6:
|
|
# There are more than 5 overlapping names, we only show the 5
|
|
# of the feature names
|
|
names_repr = str(top_6_overlap[:5])[:-1] + ", ...]"
|
|
else:
|
|
names_repr = str(top_6_overlap)
|
|
raise ValueError(
|
|
f"Output feature names: {names_repr} are not unique. Please set "
|
|
"verbose_feature_names_out=True to add prefixes to feature names"
|
|
)
|
|
|
|
return np.concatenate(
|
|
[name for _, name in transformer_with_feature_names_out],
|
|
)
|
|
|
|
def _update_fitted_transformers(self, transformers):
|
|
"""Set self.transformers_ from given transformers.
|
|
|
|
Parameters
|
|
----------
|
|
transformers : list of estimators
|
|
The fitted estimators as the output of
|
|
`self._call_func_on_transformers(func=_fit_transform_one, ...)`.
|
|
That function doesn't include 'drop' or transformers for which no
|
|
column is selected. 'drop' is kept as is, and for the no-column
|
|
transformers the unfitted transformer is put in
|
|
`self.transformers_`.
|
|
"""
|
|
# transformers are fitted; excludes 'drop' cases
|
|
fitted_transformers = iter(transformers)
|
|
transformers_ = []
|
|
|
|
for name, old, column, _ in self._iter(
|
|
fitted=False,
|
|
column_as_labels=False,
|
|
skip_drop=False,
|
|
skip_empty_columns=False,
|
|
):
|
|
if old == "drop":
|
|
trans = "drop"
|
|
elif _is_empty_column_selection(column):
|
|
trans = old
|
|
else:
|
|
trans = next(fitted_transformers)
|
|
transformers_.append((name, trans, column))
|
|
|
|
# sanity check that transformers is exhausted
|
|
assert not list(fitted_transformers)
|
|
self.transformers_ = _with_dtype_warning_enabled_set_to(True, transformers_)
|
|
|
|
def _validate_output(self, result):
|
|
"""
|
|
Ensure that the output of each transformer is 2D. Otherwise
|
|
hstack can raise an error or produce incorrect results.
|
|
"""
|
|
names = [
|
|
name
|
|
for name, _, _, _ in self._iter(
|
|
fitted=True,
|
|
column_as_labels=False,
|
|
skip_drop=True,
|
|
skip_empty_columns=True,
|
|
)
|
|
]
|
|
for Xs, name in zip(result, names):
|
|
if not getattr(Xs, "ndim", 0) == 2 and not hasattr(Xs, "__dataframe__"):
|
|
raise ValueError(
|
|
"The output of the '{0}' transformer should be 2D (numpy array, "
|
|
"scipy sparse array, dataframe).".format(name)
|
|
)
|
|
if _get_output_config("transform", self)["dense"] == "pandas":
|
|
return
|
|
try:
|
|
import pandas as pd
|
|
except ImportError:
|
|
return
|
|
for Xs, name in zip(result, names):
|
|
if not _is_pandas_df(Xs):
|
|
continue
|
|
for col_name, dtype in Xs.dtypes.to_dict().items():
|
|
if getattr(dtype, "na_value", None) is not pd.NA:
|
|
continue
|
|
if pd.NA not in Xs[col_name].values:
|
|
continue
|
|
class_name = self.__class__.__name__
|
|
# TODO(1.6): replace warning with ValueError
|
|
warnings.warn(
|
|
(
|
|
f"The output of the '{name}' transformer for column"
|
|
f" '{col_name}' has dtype {dtype} and uses pandas.NA to"
|
|
" represent null values. Storing this output in a numpy array"
|
|
" can cause errors in downstream scikit-learn estimators, and"
|
|
" inefficiencies. Starting with scikit-learn version 1.6, this"
|
|
" will raise a ValueError. To avoid this problem you can (i)"
|
|
" store the output in a pandas DataFrame by using"
|
|
f" {class_name}.set_output(transform='pandas') or (ii) modify"
|
|
f" the input data or the '{name}' transformer to avoid the"
|
|
" presence of pandas.NA (for example by using"
|
|
" pandas.DataFrame.astype)."
|
|
),
|
|
FutureWarning,
|
|
)
|
|
|
|
def _record_output_indices(self, Xs):
|
|
"""
|
|
Record which transformer produced which column.
|
|
"""
|
|
idx = 0
|
|
self.output_indices_ = {}
|
|
|
|
for transformer_idx, (name, _, _, _) in enumerate(
|
|
self._iter(
|
|
fitted=True,
|
|
column_as_labels=False,
|
|
skip_drop=True,
|
|
skip_empty_columns=True,
|
|
)
|
|
):
|
|
n_columns = Xs[transformer_idx].shape[1]
|
|
self.output_indices_[name] = slice(idx, idx + n_columns)
|
|
idx += n_columns
|
|
|
|
# `_iter` only generates transformers that have a non empty
|
|
# selection. Here we set empty slices for transformers that
|
|
# generate no output, which are safe for indexing
|
|
all_names = [t[0] for t in self.transformers] + ["remainder"]
|
|
for name in all_names:
|
|
if name not in self.output_indices_:
|
|
self.output_indices_[name] = slice(0, 0)
|
|
|
|
def _log_message(self, name, idx, total):
|
|
if not self.verbose:
|
|
return None
|
|
return "(%d of %d) Processing %s" % (idx, total, name)
|
|
|
|
def _call_func_on_transformers(self, X, y, func, column_as_labels, routed_params):
|
|
"""
|
|
Private function to fit and/or transform on demand.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, dataframe} of shape (n_samples, n_features)
|
|
The data to be used in fit and/or transform.
|
|
|
|
y : array-like of shape (n_samples,)
|
|
Targets.
|
|
|
|
func : callable
|
|
Function to call, which can be _fit_transform_one or
|
|
_transform_one.
|
|
|
|
column_as_labels : bool
|
|
Used to iterate through transformers. If True, columns are returned
|
|
as strings. If False, columns are returned as they were given by
|
|
the user. Can be True only if the ``ColumnTransformer`` is already
|
|
fitted.
|
|
|
|
routed_params : dict
|
|
The routed parameters as the output from ``process_routing``.
|
|
|
|
Returns
|
|
-------
|
|
Return value (transformers and/or transformed X data) depends
|
|
on the passed function.
|
|
"""
|
|
if func is _fit_transform_one:
|
|
fitted = False
|
|
else: # func is _transform_one
|
|
fitted = True
|
|
|
|
transformers = list(
|
|
self._iter(
|
|
fitted=fitted,
|
|
column_as_labels=column_as_labels,
|
|
skip_drop=True,
|
|
skip_empty_columns=True,
|
|
)
|
|
)
|
|
try:
|
|
jobs = []
|
|
for idx, (name, trans, columns, weight) in enumerate(transformers, start=1):
|
|
if func is _fit_transform_one:
|
|
if trans == "passthrough":
|
|
output_config = _get_output_config("transform", self)
|
|
trans = FunctionTransformer(
|
|
accept_sparse=True,
|
|
check_inverse=False,
|
|
feature_names_out="one-to-one",
|
|
).set_output(transform=output_config["dense"])
|
|
|
|
extra_args = dict(
|
|
message_clsname="ColumnTransformer",
|
|
message=self._log_message(name, idx, len(transformers)),
|
|
)
|
|
else: # func is _transform_one
|
|
extra_args = {}
|
|
jobs.append(
|
|
delayed(func)(
|
|
transformer=clone(trans) if not fitted else trans,
|
|
X=X,
|
|
y=y,
|
|
weight=weight,
|
|
columns=columns,
|
|
**extra_args,
|
|
params=routed_params[name],
|
|
)
|
|
)
|
|
|
|
return Parallel(n_jobs=self.n_jobs)(jobs)
|
|
|
|
except ValueError as e:
|
|
if "Expected 2D array, got 1D array instead" in str(e):
|
|
raise ValueError(_ERR_MSG_1DCOLUMN) from e
|
|
else:
|
|
raise
|
|
|
|
def fit(self, X, y=None, **params):
|
|
"""Fit all transformers using X.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, dataframe} of shape (n_samples, n_features)
|
|
Input data, of which specified subsets are used to fit the
|
|
transformers.
|
|
|
|
y : array-like of shape (n_samples,...), default=None
|
|
Targets for supervised learning.
|
|
|
|
**params : dict, default=None
|
|
Parameters to be passed to the underlying transformers' ``fit`` and
|
|
``transform`` methods.
|
|
|
|
You can only pass this if metadata routing is enabled, which you
|
|
can enable using ``sklearn.set_config(enable_metadata_routing=True)``.
|
|
|
|
.. versionadded:: 1.4
|
|
|
|
Returns
|
|
-------
|
|
self : ColumnTransformer
|
|
This estimator.
|
|
"""
|
|
_raise_for_params(params, self, "fit")
|
|
# we use fit_transform to make sure to set sparse_output_ (for which we
|
|
# need the transformed data) to have consistent output type in predict
|
|
self.fit_transform(X, y=y, **params)
|
|
return self
|
|
|
|
@_fit_context(
|
|
# estimators in ColumnTransformer.transformers are not validated yet
|
|
prefer_skip_nested_validation=False
|
|
)
|
|
def fit_transform(self, X, y=None, **params):
|
|
"""Fit all transformers, transform the data and concatenate results.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, dataframe} of shape (n_samples, n_features)
|
|
Input data, of which specified subsets are used to fit the
|
|
transformers.
|
|
|
|
y : array-like of shape (n_samples,), default=None
|
|
Targets for supervised learning.
|
|
|
|
**params : dict, default=None
|
|
Parameters to be passed to the underlying transformers' ``fit`` and
|
|
``transform`` methods.
|
|
|
|
You can only pass this if metadata routing is enabled, which you
|
|
can enable using ``sklearn.set_config(enable_metadata_routing=True)``.
|
|
|
|
.. versionadded:: 1.4
|
|
|
|
Returns
|
|
-------
|
|
X_t : {array-like, sparse matrix} of \
|
|
shape (n_samples, sum_n_components)
|
|
Horizontally stacked results of transformers. sum_n_components is the
|
|
sum of n_components (output dimension) over transformers. If
|
|
any result is a sparse matrix, everything will be converted to
|
|
sparse matrices.
|
|
"""
|
|
_raise_for_params(params, self, "fit_transform")
|
|
self._check_feature_names(X, reset=True)
|
|
|
|
X = _check_X(X)
|
|
# set n_features_in_ attribute
|
|
self._check_n_features(X, reset=True)
|
|
self._validate_transformers()
|
|
n_samples = _num_samples(X)
|
|
|
|
self._validate_column_callables(X)
|
|
self._validate_remainder(X)
|
|
|
|
if _routing_enabled():
|
|
routed_params = process_routing(self, "fit_transform", **params)
|
|
else:
|
|
routed_params = self._get_empty_routing()
|
|
|
|
result = self._call_func_on_transformers(
|
|
X,
|
|
y,
|
|
_fit_transform_one,
|
|
column_as_labels=False,
|
|
routed_params=routed_params,
|
|
)
|
|
|
|
if not result:
|
|
self._update_fitted_transformers([])
|
|
# All transformers are None
|
|
return np.zeros((n_samples, 0))
|
|
|
|
Xs, transformers = zip(*result)
|
|
|
|
# determine if concatenated output will be sparse or not
|
|
if any(sparse.issparse(X) for X in Xs):
|
|
nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs)
|
|
total = sum(
|
|
X.shape[0] * X.shape[1] if sparse.issparse(X) else X.size for X in Xs
|
|
)
|
|
density = nnz / total
|
|
self.sparse_output_ = density < self.sparse_threshold
|
|
else:
|
|
self.sparse_output_ = False
|
|
|
|
self._update_fitted_transformers(transformers)
|
|
self._validate_output(Xs)
|
|
self._record_output_indices(Xs)
|
|
|
|
return self._hstack(list(Xs), n_samples=n_samples)
|
|
|
|
def transform(self, X, **params):
|
|
"""Transform X separately by each transformer, concatenate results.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, dataframe} of shape (n_samples, n_features)
|
|
The data to be transformed by subset.
|
|
|
|
**params : dict, default=None
|
|
Parameters to be passed to the underlying transformers' ``transform``
|
|
method.
|
|
|
|
You can only pass this if metadata routing is enabled, which you
|
|
can enable using ``sklearn.set_config(enable_metadata_routing=True)``.
|
|
|
|
.. versionadded:: 1.4
|
|
|
|
Returns
|
|
-------
|
|
X_t : {array-like, sparse matrix} of \
|
|
shape (n_samples, sum_n_components)
|
|
Horizontally stacked results of transformers. sum_n_components is the
|
|
sum of n_components (output dimension) over transformers. If
|
|
any result is a sparse matrix, everything will be converted to
|
|
sparse matrices.
|
|
"""
|
|
_raise_for_params(params, self, "transform")
|
|
check_is_fitted(self)
|
|
X = _check_X(X)
|
|
|
|
# If ColumnTransformer is fit using a dataframe, and now a dataframe is
|
|
# passed to be transformed, we select columns by name instead. This
|
|
# enables the user to pass X at transform time with extra columns which
|
|
# were not present in fit time, and the order of the columns doesn't
|
|
# matter.
|
|
fit_dataframe_and_transform_dataframe = hasattr(self, "feature_names_in_") and (
|
|
_is_pandas_df(X) or hasattr(X, "__dataframe__")
|
|
)
|
|
|
|
n_samples = _num_samples(X)
|
|
column_names = _get_feature_names(X)
|
|
|
|
if fit_dataframe_and_transform_dataframe:
|
|
named_transformers = self.named_transformers_
|
|
# check that all names seen in fit are in transform, unless
|
|
# they were dropped
|
|
non_dropped_indices = [
|
|
ind
|
|
for name, ind in self._transformer_to_input_indices.items()
|
|
if name in named_transformers and named_transformers[name] != "drop"
|
|
]
|
|
|
|
all_indices = set(chain(*non_dropped_indices))
|
|
all_names = set(self.feature_names_in_[ind] for ind in all_indices)
|
|
|
|
diff = all_names - set(column_names)
|
|
if diff:
|
|
raise ValueError(f"columns are missing: {diff}")
|
|
else:
|
|
# ndarray was used for fitting or transforming, thus we only
|
|
# check that n_features_in_ is consistent
|
|
self._check_n_features(X, reset=False)
|
|
|
|
if _routing_enabled():
|
|
routed_params = process_routing(self, "transform", **params)
|
|
else:
|
|
routed_params = self._get_empty_routing()
|
|
|
|
Xs = self._call_func_on_transformers(
|
|
X,
|
|
None,
|
|
_transform_one,
|
|
column_as_labels=fit_dataframe_and_transform_dataframe,
|
|
routed_params=routed_params,
|
|
)
|
|
self._validate_output(Xs)
|
|
|
|
if not Xs:
|
|
# All transformers are None
|
|
return np.zeros((n_samples, 0))
|
|
|
|
return self._hstack(list(Xs), n_samples=n_samples)
|
|
|
|
def _hstack(self, Xs, *, n_samples):
|
|
"""Stacks Xs horizontally.
|
|
|
|
This allows subclasses to control the stacking behavior, while reusing
|
|
everything else from ColumnTransformer.
|
|
|
|
Parameters
|
|
----------
|
|
Xs : list of {array-like, sparse matrix, dataframe}
|
|
The container to concatenate.
|
|
n_samples : int
|
|
The number of samples in the input data to checking the transformation
|
|
consistency.
|
|
"""
|
|
if self.sparse_output_:
|
|
try:
|
|
# since all columns should be numeric before stacking them
|
|
# in a sparse matrix, `check_array` is used for the
|
|
# dtype conversion if necessary.
|
|
converted_Xs = [
|
|
check_array(X, accept_sparse=True, force_all_finite=False)
|
|
for X in Xs
|
|
]
|
|
except ValueError as e:
|
|
raise ValueError(
|
|
"For a sparse output, all columns should "
|
|
"be a numeric or convertible to a numeric."
|
|
) from e
|
|
|
|
return sparse.hstack(converted_Xs).tocsr()
|
|
else:
|
|
Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
|
|
adapter = _get_container_adapter("transform", self)
|
|
if adapter and all(adapter.is_supported_container(X) for X in Xs):
|
|
# rename before stacking as it avoids to error on temporary duplicated
|
|
# columns
|
|
transformer_names = [
|
|
t[0]
|
|
for t in self._iter(
|
|
fitted=True,
|
|
column_as_labels=False,
|
|
skip_drop=True,
|
|
skip_empty_columns=True,
|
|
)
|
|
]
|
|
feature_names_outs = [X.columns for X in Xs if X.shape[1] != 0]
|
|
if self.verbose_feature_names_out:
|
|
# `_add_prefix_for_feature_names_out` takes care about raising
|
|
# an error if there are duplicated columns.
|
|
feature_names_outs = self._add_prefix_for_feature_names_out(
|
|
list(zip(transformer_names, feature_names_outs))
|
|
)
|
|
else:
|
|
# check for duplicated columns and raise if any
|
|
feature_names_outs = list(chain.from_iterable(feature_names_outs))
|
|
feature_names_count = Counter(feature_names_outs)
|
|
if any(count > 1 for count in feature_names_count.values()):
|
|
duplicated_feature_names = sorted(
|
|
name
|
|
for name, count in feature_names_count.items()
|
|
if count > 1
|
|
)
|
|
err_msg = (
|
|
"Duplicated feature names found before concatenating the"
|
|
" outputs of the transformers:"
|
|
f" {duplicated_feature_names}.\n"
|
|
)
|
|
for transformer_name, X in zip(transformer_names, Xs):
|
|
if X.shape[1] == 0:
|
|
continue
|
|
dup_cols_in_transformer = sorted(
|
|
set(X.columns).intersection(duplicated_feature_names)
|
|
)
|
|
if len(dup_cols_in_transformer):
|
|
err_msg += (
|
|
f"Transformer {transformer_name} has conflicting "
|
|
f"columns names: {dup_cols_in_transformer}.\n"
|
|
)
|
|
raise ValueError(
|
|
err_msg
|
|
+ "Either make sure that the transformers named above "
|
|
"do not generate columns with conflicting names or set "
|
|
"verbose_feature_names_out=True to automatically "
|
|
"prefix to the output feature names with the name "
|
|
"of the transformer to prevent any conflicting "
|
|
"names."
|
|
)
|
|
|
|
names_idx = 0
|
|
for X in Xs:
|
|
if X.shape[1] == 0:
|
|
continue
|
|
names_out = feature_names_outs[names_idx : names_idx + X.shape[1]]
|
|
adapter.rename_columns(X, names_out)
|
|
names_idx += X.shape[1]
|
|
|
|
output = adapter.hstack(Xs)
|
|
output_samples = output.shape[0]
|
|
if output_samples != n_samples:
|
|
raise ValueError(
|
|
"Concatenating DataFrames from the transformer's output lead to"
|
|
" an inconsistent number of samples. The output may have Pandas"
|
|
" Indexes that do not match, or that transformers are returning"
|
|
" number of samples which are not the same as the number input"
|
|
" samples."
|
|
)
|
|
|
|
return output
|
|
|
|
return np.hstack(Xs)
|
|
|
|
def _sk_visual_block_(self):
|
|
if isinstance(self.remainder, str) and self.remainder == "drop":
|
|
transformers = self.transformers
|
|
elif hasattr(self, "_remainder"):
|
|
remainder_columns = self._remainder[2]
|
|
if (
|
|
hasattr(self, "feature_names_in_")
|
|
and remainder_columns
|
|
and not all(isinstance(col, str) for col in remainder_columns)
|
|
):
|
|
remainder_columns = self.feature_names_in_[remainder_columns].tolist()
|
|
transformers = chain(
|
|
self.transformers, [("remainder", self.remainder, remainder_columns)]
|
|
)
|
|
else:
|
|
transformers = chain(self.transformers, [("remainder", self.remainder, "")])
|
|
|
|
names, transformers, name_details = zip(*transformers)
|
|
return _VisualBlock(
|
|
"parallel", transformers, names=names, name_details=name_details
|
|
)
|
|
|
|
def __getitem__(self, key):
|
|
try:
|
|
return self.named_transformers_[key]
|
|
except AttributeError as e:
|
|
raise TypeError(
|
|
"ColumnTransformer is subscriptable after it is fitted"
|
|
) from e
|
|
except KeyError as e:
|
|
raise KeyError(f"'{key}' is not a valid transformer name") from e
|
|
|
|
def _get_empty_routing(self):
|
|
"""Return empty routing.
|
|
|
|
Used while routing can be disabled.
|
|
|
|
TODO: Remove when ``set_config(enable_metadata_routing=False)`` is no
|
|
more an option.
|
|
"""
|
|
return Bunch(
|
|
**{
|
|
name: Bunch(**{method: {} for method in METHODS})
|
|
for name, step, _, _ in self._iter(
|
|
fitted=False,
|
|
column_as_labels=False,
|
|
skip_drop=True,
|
|
skip_empty_columns=True,
|
|
)
|
|
}
|
|
)
|
|
|
|
def get_metadata_routing(self):
|
|
"""Get metadata routing of this object.
|
|
|
|
Please check :ref:`User Guide <metadata_routing>` on how the routing
|
|
mechanism works.
|
|
|
|
.. versionadded:: 1.4
|
|
|
|
Returns
|
|
-------
|
|
routing : MetadataRouter
|
|
A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
|
|
routing information.
|
|
"""
|
|
router = MetadataRouter(owner=self.__class__.__name__)
|
|
# Here we don't care about which columns are used for which
|
|
# transformers, and whether or not a transformer is used at all, which
|
|
# might happen if no columns are selected for that transformer. We
|
|
# request all metadata requested by all transformers.
|
|
transformers = chain(self.transformers, [("remainder", self.remainder, None)])
|
|
for name, step, _ in transformers:
|
|
method_mapping = MethodMapping()
|
|
if hasattr(step, "fit_transform"):
|
|
(
|
|
method_mapping.add(caller="fit", callee="fit_transform").add(
|
|
caller="fit_transform", callee="fit_transform"
|
|
)
|
|
)
|
|
else:
|
|
(
|
|
method_mapping.add(caller="fit", callee="fit")
|
|
.add(caller="fit", callee="transform")
|
|
.add(caller="fit_transform", callee="fit")
|
|
.add(caller="fit_transform", callee="transform")
|
|
)
|
|
method_mapping.add(caller="transform", callee="transform")
|
|
router.add(method_mapping=method_mapping, **{name: step})
|
|
|
|
return router
|
|
|
|
|
|
def _check_X(X):
|
|
"""Use check_array only when necessary, e.g. on lists and other non-array-likes."""
|
|
if hasattr(X, "__array__") or hasattr(X, "__dataframe__") or sparse.issparse(X):
|
|
return X
|
|
return check_array(X, force_all_finite="allow-nan", dtype=object)
|
|
|
|
|
|
def _is_empty_column_selection(column):
|
|
"""
|
|
Return True if the column selection is empty (empty list or all-False
|
|
boolean array).
|
|
|
|
"""
|
|
if hasattr(column, "dtype") and np.issubdtype(column.dtype, np.bool_):
|
|
return not column.any()
|
|
elif hasattr(column, "__len__"):
|
|
return (
|
|
len(column) == 0
|
|
or all(isinstance(col, bool) for col in column)
|
|
and not any(column)
|
|
)
|
|
else:
|
|
return False
|
|
|
|
|
|
def _get_transformer_list(estimators):
|
|
"""
|
|
Construct (name, trans, column) tuples from list
|
|
|
|
"""
|
|
transformers, columns = zip(*estimators)
|
|
names, _ = zip(*_name_estimators(transformers))
|
|
|
|
transformer_list = list(zip(names, transformers, columns))
|
|
return transformer_list
|
|
|
|
|
|
# This function is not validated using validate_params because
|
|
# it's just a factory for ColumnTransformer.
|
|
def make_column_transformer(
|
|
*transformers,
|
|
remainder="drop",
|
|
sparse_threshold=0.3,
|
|
n_jobs=None,
|
|
verbose=False,
|
|
verbose_feature_names_out=True,
|
|
force_int_remainder_cols=True,
|
|
):
|
|
"""Construct a ColumnTransformer from the given transformers.
|
|
|
|
This is a shorthand for the ColumnTransformer constructor; it does not
|
|
require, and does not permit, naming the transformers. Instead, they will
|
|
be given names automatically based on their types. It also does not allow
|
|
weighting with ``transformer_weights``.
|
|
|
|
Read more in the :ref:`User Guide <make_column_transformer>`.
|
|
|
|
Parameters
|
|
----------
|
|
*transformers : tuples
|
|
Tuples of the form (transformer, columns) specifying the
|
|
transformer objects to be applied to subsets of the data.
|
|
|
|
transformer : {'drop', 'passthrough'} or estimator
|
|
Estimator must support :term:`fit` and :term:`transform`.
|
|
Special-cased strings 'drop' and 'passthrough' are accepted as
|
|
well, to indicate to drop the columns or to pass them through
|
|
untransformed, respectively.
|
|
columns : str, array-like of str, int, array-like of int, slice, \
|
|
array-like of bool or callable
|
|
Indexes the data on its second axis. Integers are interpreted as
|
|
positional columns, while strings can reference DataFrame columns
|
|
by name. A scalar string or int should be used where
|
|
``transformer`` expects X to be a 1d array-like (vector),
|
|
otherwise a 2d array will be passed to the transformer.
|
|
A callable is passed the input data `X` and can return any of the
|
|
above. To select multiple columns by name or dtype, you can use
|
|
:obj:`make_column_selector`.
|
|
|
|
remainder : {'drop', 'passthrough'} or estimator, default='drop'
|
|
By default, only the specified columns in `transformers` are
|
|
transformed and combined in the output, and the non-specified
|
|
columns are dropped. (default of ``'drop'``).
|
|
By specifying ``remainder='passthrough'``, all remaining columns that
|
|
were not specified in `transformers` will be automatically passed
|
|
through. This subset of columns is concatenated with the output of
|
|
the transformers.
|
|
By setting ``remainder`` to be an estimator, the remaining
|
|
non-specified columns will use the ``remainder`` estimator. The
|
|
estimator must support :term:`fit` and :term:`transform`.
|
|
|
|
sparse_threshold : float, default=0.3
|
|
If the transformed output consists of a mix of sparse and dense data,
|
|
it will be stacked as a sparse matrix if the density is lower than this
|
|
value. Use ``sparse_threshold=0`` to always return dense.
|
|
When the transformed output consists of all sparse or all dense data,
|
|
the stacked result will be sparse or dense, respectively, and this
|
|
keyword will be ignored.
|
|
|
|
n_jobs : int, default=None
|
|
Number of jobs to run in parallel.
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|
for more details.
|
|
|
|
verbose : bool, default=False
|
|
If True, the time elapsed while fitting each transformer will be
|
|
printed as it is completed.
|
|
|
|
verbose_feature_names_out : bool, default=True
|
|
If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix
|
|
all feature names with the name of the transformer that generated that
|
|
feature.
|
|
If False, :meth:`ColumnTransformer.get_feature_names_out` will not
|
|
prefix any feature names and will error if feature names are not
|
|
unique.
|
|
|
|
.. versionadded:: 1.0
|
|
|
|
force_int_remainder_cols : bool, default=True
|
|
Force the columns of the last entry of `transformers_`, which
|
|
corresponds to the "remainder" transformer, to always be stored as
|
|
indices (int) rather than column names (str). See description of the
|
|
:attr:`ColumnTransformer.transformers_` attribute for details.
|
|
|
|
.. note::
|
|
If you do not access the list of columns for the remainder columns
|
|
in the :attr:`ColumnTransformer.transformers_` fitted attribute,
|
|
you do not need to set this parameter.
|
|
|
|
.. versionadded:: 1.5
|
|
|
|
.. versionchanged:: 1.7
|
|
The default value for `force_int_remainder_cols` will change from
|
|
`True` to `False` in version 1.7.
|
|
|
|
Returns
|
|
-------
|
|
ct : ColumnTransformer
|
|
Returns a :class:`ColumnTransformer` object.
|
|
|
|
See Also
|
|
--------
|
|
ColumnTransformer : Class that allows combining the
|
|
outputs of multiple transformer objects used on column subsets
|
|
of the data into a single feature space.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
>>> from sklearn.compose import make_column_transformer
|
|
>>> make_column_transformer(
|
|
... (StandardScaler(), ['numerical_column']),
|
|
... (OneHotEncoder(), ['categorical_column']))
|
|
ColumnTransformer(transformers=[('standardscaler', StandardScaler(...),
|
|
['numerical_column']),
|
|
('onehotencoder', OneHotEncoder(...),
|
|
['categorical_column'])])
|
|
"""
|
|
# transformer_weights keyword is not passed through because the user
|
|
# would need to know the automatically generated names of the transformers
|
|
transformer_list = _get_transformer_list(transformers)
|
|
return ColumnTransformer(
|
|
transformer_list,
|
|
n_jobs=n_jobs,
|
|
remainder=remainder,
|
|
sparse_threshold=sparse_threshold,
|
|
verbose=verbose,
|
|
verbose_feature_names_out=verbose_feature_names_out,
|
|
force_int_remainder_cols=force_int_remainder_cols,
|
|
)
|
|
|
|
|
|
class make_column_selector:
|
|
"""Create a callable to select columns to be used with
|
|
:class:`ColumnTransformer`.
|
|
|
|
:func:`make_column_selector` can select columns based on datatype or the
|
|
columns name with a regex. When using multiple selection criteria, **all**
|
|
criteria must match for a column to be selected.
|
|
|
|
For an example of how to use :func:`make_column_selector` within a
|
|
:class:`ColumnTransformer` to select columns based on data type (i.e.
|
|
`dtype`), refer to
|
|
:ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`.
|
|
|
|
Parameters
|
|
----------
|
|
pattern : str, default=None
|
|
Name of columns containing this regex pattern will be included. If
|
|
None, column selection will not be selected based on pattern.
|
|
|
|
dtype_include : column dtype or list of column dtypes, default=None
|
|
A selection of dtypes to include. For more details, see
|
|
:meth:`pandas.DataFrame.select_dtypes`.
|
|
|
|
dtype_exclude : column dtype or list of column dtypes, default=None
|
|
A selection of dtypes to exclude. For more details, see
|
|
:meth:`pandas.DataFrame.select_dtypes`.
|
|
|
|
Returns
|
|
-------
|
|
selector : callable
|
|
Callable for column selection to be used by a
|
|
:class:`ColumnTransformer`.
|
|
|
|
See Also
|
|
--------
|
|
ColumnTransformer : Class that allows combining the
|
|
outputs of multiple transformer objects used on column subsets
|
|
of the data into a single feature space.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
>>> from sklearn.compose import make_column_transformer
|
|
>>> from sklearn.compose import make_column_selector
|
|
>>> import numpy as np
|
|
>>> import pandas as pd # doctest: +SKIP
|
|
>>> X = pd.DataFrame({'city': ['London', 'London', 'Paris', 'Sallisaw'],
|
|
... 'rating': [5, 3, 4, 5]}) # doctest: +SKIP
|
|
>>> ct = make_column_transformer(
|
|
... (StandardScaler(),
|
|
... make_column_selector(dtype_include=np.number)), # rating
|
|
... (OneHotEncoder(),
|
|
... make_column_selector(dtype_include=object))) # city
|
|
>>> ct.fit_transform(X) # doctest: +SKIP
|
|
array([[ 0.90453403, 1. , 0. , 0. ],
|
|
[-1.50755672, 1. , 0. , 0. ],
|
|
[-0.30151134, 0. , 1. , 0. ],
|
|
[ 0.90453403, 0. , 0. , 1. ]])
|
|
"""
|
|
|
|
def __init__(self, pattern=None, *, dtype_include=None, dtype_exclude=None):
|
|
self.pattern = pattern
|
|
self.dtype_include = dtype_include
|
|
self.dtype_exclude = dtype_exclude
|
|
|
|
def __call__(self, df):
|
|
"""Callable for column selection to be used by a
|
|
:class:`ColumnTransformer`.
|
|
|
|
Parameters
|
|
----------
|
|
df : dataframe of shape (n_features, n_samples)
|
|
DataFrame to select columns from.
|
|
"""
|
|
if not hasattr(df, "iloc"):
|
|
raise ValueError(
|
|
"make_column_selector can only be applied to pandas dataframes"
|
|
)
|
|
df_row = df.iloc[:1]
|
|
if self.dtype_include is not None or self.dtype_exclude is not None:
|
|
df_row = df_row.select_dtypes(
|
|
include=self.dtype_include, exclude=self.dtype_exclude
|
|
)
|
|
cols = df_row.columns
|
|
if self.pattern is not None:
|
|
cols = cols[cols.str.contains(self.pattern, regex=True)]
|
|
return cols.tolist()
|
|
|
|
|
|
class _RemainderColsList(UserList):
|
|
"""A list that raises a warning whenever items are accessed.
|
|
|
|
It is used to store the columns handled by the "remainder" entry of
|
|
``ColumnTransformer.transformers_``, ie ``transformers_[-1][-1]``.
|
|
|
|
For some values of the ``ColumnTransformer`` ``transformers`` parameter,
|
|
this list of indices will be replaced by either a list of column names or a
|
|
boolean mask; in those cases we emit a ``FutureWarning`` the first time an
|
|
element is accessed.
|
|
|
|
Parameters
|
|
----------
|
|
columns : list of int
|
|
The remainder columns.
|
|
|
|
future_dtype : {'str', 'bool'}, default=None
|
|
The dtype that will be used by a ColumnTransformer with the same inputs
|
|
in a future release. There is a default value because providing a
|
|
constructor that takes a single argument is a requirement for
|
|
subclasses of UserList, but we do not use it in practice. It would only
|
|
be used if a user called methods that return a new list such are
|
|
copying or concatenating `_RemainderColsList`.
|
|
|
|
warning_was_emitted : bool, default=False
|
|
Whether the warning for that particular list was already shown, so we
|
|
only emit it once.
|
|
|
|
warning_enabled : bool, default=True
|
|
When False, the list never emits the warning nor updates
|
|
`warning_was_emitted``. This is used to obtain a quiet copy of the list
|
|
for use by the `ColumnTransformer` itself, so that the warning is only
|
|
shown when a user accesses it directly.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
columns,
|
|
*,
|
|
future_dtype=None,
|
|
warning_was_emitted=False,
|
|
warning_enabled=True,
|
|
):
|
|
super().__init__(columns)
|
|
self.future_dtype = future_dtype
|
|
self.warning_was_emitted = warning_was_emitted
|
|
self.warning_enabled = warning_enabled
|
|
|
|
def __getitem__(self, index):
|
|
self._show_remainder_cols_warning()
|
|
return super().__getitem__(index)
|
|
|
|
def _show_remainder_cols_warning(self):
|
|
if self.warning_was_emitted or not self.warning_enabled:
|
|
return
|
|
self.warning_was_emitted = True
|
|
future_dtype_description = {
|
|
"str": "column names (of type str)",
|
|
"bool": "a mask array (of type bool)",
|
|
# shouldn't happen because we always initialize it with a
|
|
# non-default future_dtype
|
|
None: "a different type depending on the ColumnTransformer inputs",
|
|
}.get(self.future_dtype, self.future_dtype)
|
|
|
|
# TODO(1.7) Update the warning to say that the old behavior will be
|
|
# removed in 1.9.
|
|
warnings.warn(
|
|
(
|
|
"\nThe format of the columns of the 'remainder' transformer in"
|
|
" ColumnTransformer.transformers_ will change in version 1.7 to"
|
|
" match the format of the other transformers.\nAt the moment the"
|
|
" remainder columns are stored as indices (of type int). With the same"
|
|
" ColumnTransformer configuration, in the future they will be stored"
|
|
f" as {future_dtype_description}.\nTo use the new behavior now and"
|
|
" suppress this warning, use"
|
|
" ColumnTransformer(force_int_remainder_cols=False).\n"
|
|
),
|
|
category=FutureWarning,
|
|
)
|
|
|
|
def _repr_pretty_(self, printer, *_):
|
|
"""Override display in ipython console, otherwise the class name is shown."""
|
|
printer.text(repr(self.data))
|
|
|
|
|
|
def _with_dtype_warning_enabled_set_to(warning_enabled, transformers):
|
|
result = []
|
|
for name, trans, columns in transformers:
|
|
if isinstance(columns, _RemainderColsList):
|
|
columns = _RemainderColsList(
|
|
columns.data,
|
|
future_dtype=columns.future_dtype,
|
|
warning_was_emitted=columns.warning_was_emitted,
|
|
warning_enabled=warning_enabled,
|
|
)
|
|
result.append((name, trans, columns))
|
|
return result
|