837 lines
33 KiB
Python
837 lines
33 KiB
Python
|
"""
|
||
|
The :mod:`sklearn.compose._column_transformer` module implements utilities
|
||
|
to work with heterogeneous data and to apply different transformers to
|
||
|
different columns.
|
||
|
"""
|
||
|
# Author: Andreas Mueller
|
||
|
# Joris Van den Bossche
|
||
|
# License: BSD
|
||
|
from itertools import chain
|
||
|
|
||
|
import numpy as np
|
||
|
from scipy import sparse
|
||
|
from joblib import Parallel
|
||
|
|
||
|
from ..base import clone, TransformerMixin
|
||
|
from ..utils._estimator_html_repr import _VisualBlock
|
||
|
from ..pipeline import _fit_transform_one, _transform_one, _name_estimators
|
||
|
from ..preprocessing import FunctionTransformer
|
||
|
from ..utils import Bunch
|
||
|
from ..utils import _safe_indexing
|
||
|
from ..utils import _get_column_indices
|
||
|
from ..utils import _determine_key_type
|
||
|
from ..utils.metaestimators import _BaseComposition
|
||
|
from ..utils.validation import check_array, check_is_fitted
|
||
|
from ..utils.validation import _deprecate_positional_args
|
||
|
from ..utils.fixes import delayed
|
||
|
|
||
|
|
||
|
# Names exported by ``from <this module> import *``.
__all__ = [
    'ColumnTransformer', 'make_column_transformer', 'make_column_selector'
]


# Message raised by ``ColumnTransformer._fit_transform`` in place of an
# underlying "Expected 2D array, got 1D array instead" ValueError, so the
# user is pointed at the scalar column selection that caused it.
_ERR_MSG_1DCOLUMN = ("1D data passed to a transformer that expects 2D data. "
                     "Try to specify the column selection as a list of one "
                     "item instead of a scalar.")
|
||
|
|
||
|
|
||
|
class ColumnTransformer(TransformerMixin, _BaseComposition):
    """Applies transformers to columns of an array or pandas DataFrame.

    This estimator allows different columns or column subsets of the input
    to be transformed separately and the features generated by each transformer
    will be concatenated to form a single feature space.
    This is useful for heterogeneous or columnar data, to combine several
    feature extraction mechanisms or transformations into a single transformer.

    Read more in the :ref:`User Guide <column_transformer>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    transformers : list of tuples
        List of (name, transformer, columns) tuples specifying the
        transformer objects to be applied to subsets of the data.

        name : str
            Like in Pipeline and FeatureUnion, this allows the transformer and
            its parameters to be set using ``set_params`` and searched in grid
            search.
        transformer : {'drop', 'passthrough'} or estimator
            Estimator must support :term:`fit` and :term:`transform`.
            Special-cased strings 'drop' and 'passthrough' are accepted as
            well, to indicate to drop the columns or to pass them through
            untransformed, respectively.
        columns :  str, array-like of str, int, array-like of int, \
                array-like of bool, slice or callable
            Indexes the data on its second axis. Integers are interpreted as
            positional columns, while strings can reference DataFrame columns
            by name.  A scalar string or int should be used where
            ``transformer`` expects X to be a 1d array-like (vector),
            otherwise a 2d array will be passed to the transformer.
            A callable is passed the input data `X` and can return any of the
            above. To select multiple columns by name or dtype, you can use
            :obj:`make_column_selector`.

    remainder : {'drop', 'passthrough'} or estimator, default='drop'
        By default, only the specified columns in `transformers` are
        transformed and combined in the output, and the non-specified
        columns are dropped. (default of ``'drop'``).
        By specifying ``remainder='passthrough'``, all remaining columns that
        were not specified in `transformers` will be automatically passed
        through. This subset of columns is concatenated with the output of
        the transformers.
        By setting ``remainder`` to be an estimator, the remaining
        non-specified columns will use the ``remainder`` estimator. The
        estimator must support :term:`fit` and :term:`transform`.
        Note that using this feature requires that the DataFrame columns
        input at :term:`fit` and :term:`transform` have identical order.

    sparse_threshold : float, default=0.3
        If the output of the different transformers contains sparse matrices,
        these will be stacked as a sparse matrix if the overall density is
        lower than this value. Use ``sparse_threshold=0`` to always return
        dense.  When the transformed output consists of all dense data, the
        stacked result will be dense, and this keyword will be ignored.

    n_jobs : int, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    transformer_weights : dict, default=None
        Multiplicative weights for features per transformer. The output of the
        transformer is multiplied by these weights. Keys are transformer names,
        values the weights.

    verbose : bool, default=False
        If True, the time elapsed while fitting each transformer will be
        printed as it is completed.

    Attributes
    ----------
    transformers_ : list
        The collection of fitted transformers as tuples of
        (name, fitted_transformer, column). `fitted_transformer` can be an
        estimator, 'drop', or 'passthrough'. In case there were no columns
        selected, this will be the unfitted transformer.
        If there are remaining columns, the final element is a tuple of the
        form:
        ('remainder', transformer, remaining_columns) corresponding to the
        ``remainder`` parameter. If there are remaining columns, then
        ``len(transformers_)==len(transformers)+1``, otherwise
        ``len(transformers_)==len(transformers)``.

    named_transformers_ : :class:`~sklearn.utils.Bunch`
        Read-only attribute to access any transformer by given name.
        Keys are transformer names and values are the fitted transformer
        objects.

    sparse_output_ : bool
        Boolean flag indicating whether the output of ``transform`` is a
        sparse matrix or a dense numpy array, which depends on the output
        of the individual transformers and the `sparse_threshold` keyword.

    Notes
    -----
    The order of the columns in the transformed feature matrix follows the
    order of how the columns are specified in the `transformers` list.
    Columns of the original feature matrix that are not specified are
    dropped from the resulting transformed feature matrix, unless the
    ``remainder`` keyword is set to ``'passthrough'`` or to an estimator.
    The output for those remaining columns is added at the right of the
    output of the transformers.

    See Also
    --------
    make_column_transformer : Convenience function for
        combining the outputs of multiple transformer objects applied to
        column subsets of the original feature space.
    make_column_selector : Convenience function for selecting
        columns based on datatype or the columns name with a regex pattern.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.compose import ColumnTransformer
    >>> from sklearn.preprocessing import Normalizer
    >>> ct = ColumnTransformer(
    ...     [("norm1", Normalizer(norm='l1'), [0, 1]),
    ...      ("norm2", Normalizer(norm='l1'), slice(2, 4))])
    >>> X = np.array([[0., 1., 2., 2.],
    ...               [1., 1., 0., 1.]])
    >>> # Normalizer scales each row of X to unit norm. A separate scaling
    >>> # is applied for the two first and two last elements of each
    >>> # row independently.
    >>> ct.fit_transform(X)
    array([[0. , 1. , 0.5, 0.5],
           [0.5, 0.5, 0. , 1. ]])

    """
    _required_parameters = ['transformers']

    @_deprecate_positional_args
    def __init__(self,
                 transformers, *,
                 remainder='drop',
                 sparse_threshold=0.3,
                 n_jobs=None,
                 transformer_weights=None,
                 verbose=False):
        self.transformers = transformers
        self.remainder = remainder
        self.sparse_threshold = sparse_threshold
        self.n_jobs = n_jobs
        self.transformer_weights = transformer_weights
        self.verbose = verbose

    @property
    def _transformers(self):
        """
        Internal list of transformer only containing the name and
        transformers, dropping the columns. This is for the implementation
        of get_params via BaseComposition._get_params which expects lists
        of tuples of len 2.
        """
        return [(name, trans) for name, trans, _ in self.transformers]

    @_transformers.setter
    def _transformers(self, value):
        # Re-attach the column specifier of each existing entry to the new
        # (name, transformer) pair at the same position.
        self.transformers = [
            (name, trans, col) for ((name, trans), (_, _, col))
            in zip(value, self.transformers)]

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Returns the parameters given in the constructor as well as the
        estimators contained within the `transformers` of the
        `ColumnTransformer`.

        Parameters
        ----------
        deep : bool, default=True
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
        """
        return self._get_params('_transformers', deep=deep)

    def set_params(self, **kwargs):
        """Set the parameters of this estimator.

        Valid parameter keys can be listed with ``get_params()``. Note that you
        can directly set the parameters of the estimators contained in
        `transformers` of `ColumnTransformer`.

        Returns
        -------
        self
        """
        self._set_params('_transformers', **kwargs)
        return self

    def _iter(self, fitted=False, replace_strings=False):
        """
        Generate (name, trans, column, weight) tuples.

        If fitted=True, use the fitted transformers, else use the
        user specified transformers updated with converted column names
        and potentially appended with transformer for remainder.

        If replace_strings=True, 'passthrough' entries are yielded as an
        identity ``FunctionTransformer``, and 'drop' entries as well as
        entries with an empty column selection are skipped.

        """
        if fitted:
            transformers = self.transformers_
        else:
            # interleave the validated column specifiers
            transformers = [
                (name, trans, column) for (name, trans, _), column
                in zip(self.transformers, self._columns)
            ]
            # add transformer tuple for remainder
            if self._remainder[2] is not None:
                transformers = chain(transformers, [self._remainder])
        get_weight = (self.transformer_weights or {}).get

        for name, trans, column in transformers:
            if replace_strings:
                # replace 'passthrough' with identity transformer and
                # skip in case of 'drop'
                if trans == 'passthrough':
                    trans = FunctionTransformer(
                        accept_sparse=True, check_inverse=False
                    )
                elif trans == 'drop':
                    continue
                elif _is_empty_column_selection(column):
                    continue

            yield (name, trans, column, get_weight(name))

    def _validate_transformers(self):
        """Check transformer names and that each entry can fit/transform.

        Raises
        ------
        TypeError
            If an entry is neither 'drop', 'passthrough', nor an object
            with ``transform`` and one of ``fit`` / ``fit_transform``.
        """
        if not self.transformers:
            return

        names, transformers, _ = zip(*self.transformers)

        # validate names
        self._validate_names(names)

        # validate estimators
        for t in transformers:
            if t in ('drop', 'passthrough'):
                continue
            if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not
                    hasattr(t, "transform")):
                raise TypeError("All estimators should implement fit and "
                                "transform, or can be 'drop' or 'passthrough' "
                                "specifiers. '%s' (type %s) doesn't." %
                                (t, type(t)))

    def _validate_column_callables(self, X):
        """
        Converts callable column specifications.

        The resolved specifiers are stored in ``self._columns``, in the same
        order as ``self.transformers``.
        """
        columns = []
        for _, _, column in self.transformers:
            if callable(column):
                column = column(X)
            columns.append(column)
        self._columns = columns

    def _validate_remainder(self, X):
        """
        Validates ``remainder`` and defines ``_remainder`` targeting
        the remaining columns.

        Also records ``_has_str_cols``, ``_n_features`` and, for inputs with
        a ``columns`` attribute, ``_df_columns``.

        Raises
        ------
        ValueError
            If ``remainder`` is neither 'drop', 'passthrough', nor an
            object with ``transform`` and one of ``fit`` / ``fit_transform``.
        """
        is_transformer = ((hasattr(self.remainder, "fit")
                           or hasattr(self.remainder, "fit_transform"))
                          and hasattr(self.remainder, "transform"))
        if (self.remainder not in ('drop', 'passthrough')
                and not is_transformer):
            raise ValueError(
                "The remainder keyword needs to be one of 'drop', "
                "'passthrough', or estimator. '%s' was passed instead" %
                self.remainder)

        # Make it possible to check for reordered named columns on transform
        self._has_str_cols = any(_determine_key_type(cols) == 'str'
                                 for cols in self._columns)
        if hasattr(X, 'columns'):
            self._df_columns = X.columns
        elif hasattr(self, '_df_columns'):
            # Refit on input without column labels: drop the labels kept
            # from an earlier DataFrame fit, otherwise get_feature_names and
            # the HTML repr would report names of data that is no longer
            # the fitted data.
            del self._df_columns

        self._n_features = X.shape[1]
        cols = []
        for columns in self._columns:
            cols.extend(_get_column_indices(X, columns))

        remaining_idx = sorted(set(range(self._n_features)) - set(cols))
        # An empty remainder is stored as None so that _iter can skip it.
        self._remainder = ('remainder', self.remainder, remaining_idx or None)

    @property
    def named_transformers_(self):
        """Access the fitted transformer by name.

        Read-only attribute to access any transformer by given name.
        Keys are transformer names and values are the fitted transformer
        objects.

        """
        # Use Bunch object to improve autocomplete
        return Bunch(**{name: trans for name, trans, _
                        in self.transformers_})

    def get_feature_names(self):
        """Get feature names from all transformers.

        Returns
        -------
        feature_names : list of strings
            Names of the features produced by transform.

        Raises
        ------
        AttributeError
            If a fitted transformer (other than 'drop' / 'passthrough')
            does not provide ``get_feature_names``.
        """
        check_is_fitted(self)
        feature_names = []
        for name, trans, column, _ in self._iter(fitted=True):
            if trans == 'drop' or _is_empty_column_selection(column):
                continue
            if trans == 'passthrough':
                if hasattr(self, '_df_columns'):
                    if ((not isinstance(column, slice))
                            and all(isinstance(col, str) for col in column)):
                        feature_names.extend(column)
                    else:
                        feature_names.extend(self._df_columns[column])
                else:
                    # no column labels available: fall back to 'x<index>'
                    indices = np.arange(self._n_features)
                    feature_names.extend(['x%d' % i for i in indices[column]])
                continue
            if not hasattr(trans, 'get_feature_names'):
                raise AttributeError("Transformer %s (type %s) does not "
                                     "provide get_feature_names."
                                     % (str(name), type(trans).__name__))
            feature_names.extend([name + "__" + f for f in
                                  trans.get_feature_names()])
        return feature_names

    def _update_fitted_transformers(self, transformers):
        """Store the fitted transformers in ``self.transformers_``.

        ``transformers`` holds one fitted object per entry that was actually
        fitted; 'drop' entries and empty column selections have none.
        """
        # transformers are fitted; excludes 'drop' cases
        fitted_transformers = iter(transformers)
        transformers_ = []

        for name, old, column, _ in self._iter():
            if old == 'drop':
                trans = 'drop'
            elif old == 'passthrough':
                # FunctionTransformer is present in list of transformers,
                # so get next transformer, but save original string
                next(fitted_transformers)
                trans = 'passthrough'
            elif _is_empty_column_selection(column):
                trans = old
            else:
                trans = next(fitted_transformers)
            transformers_.append((name, trans, column))

        # sanity check that transformers is exhausted
        assert not list(fitted_transformers)
        self.transformers_ = transformers_

    def _validate_output(self, result):
        """
        Ensure that the output of each transformer is 2D. Otherwise
        hstack can raise an error or produce incorrect results.
        """
        names = [name for name, _, _, _ in self._iter(fitted=True,
                                                      replace_strings=True)]
        for Xs, name in zip(result, names):
            if not getattr(Xs, 'ndim', 0) == 2:
                raise ValueError(
                    "The output of the '{0}' transformer should be 2D (scipy "
                    "matrix, array, or pandas DataFrame).".format(name))

    def _log_message(self, name, idx, total):
        """Return the progress message for one transformer, or None.

        None is returned when ``self.verbose`` is falsy.
        """
        if not self.verbose:
            return None
        return '(%d of %d) Processing %s' % (idx, total, name)

    def _fit_transform(self, X, y, func, fitted=False):
        """
        Private function to fit and/or transform on demand.

        Return value (transformers and/or transformed X data) depends
        on the passed function.
        ``fitted=True`` ensures the fitted transformers are used.
        """
        # Materialise once: the total count is needed for the log message
        # and the same entries are then dispatched below.
        transformers = list(
            self._iter(fitted=fitted, replace_strings=True))
        try:
            return Parallel(n_jobs=self.n_jobs)(
                delayed(func)(
                    transformer=clone(trans) if not fitted else trans,
                    X=_safe_indexing(X, column, axis=1),
                    y=y,
                    weight=weight,
                    message_clsname='ColumnTransformer',
                    message=self._log_message(name, idx, len(transformers)))
                for idx, (name, trans, column, weight) in enumerate(
                    transformers, 1))
        except ValueError as e:
            # Give a more actionable message for scalar column selections.
            if "Expected 2D array, got 1D array instead" in str(e):
                raise ValueError(_ERR_MSG_1DCOLUMN) from e
            else:
                raise

    def fit(self, X, y=None):
        """Fit all transformers using X.

        Parameters
        ----------
        X : {array-like, dataframe} of shape (n_samples, n_features)
            Input data, of which specified subsets are used to fit the
            transformers.

        y : array-like of shape (n_samples,...), default=None
            Targets for supervised learning.

        Returns
        -------
        self : ColumnTransformer
            This estimator

        """
        # we use fit_transform to make sure to set sparse_output_ (for which we
        # need the transformed data) to have consistent output type in predict
        self.fit_transform(X, y=y)
        return self

    def fit_transform(self, X, y=None):
        """Fit all transformers, transform the data and concatenate results.

        Parameters
        ----------
        X : {array-like, dataframe} of shape (n_samples, n_features)
            Input data, of which specified subsets are used to fit the
            transformers.

        y : array-like of shape (n_samples,), default=None
            Targets for supervised learning.

        Returns
        -------
        X_t : {array-like, sparse matrix} of \
                shape (n_samples, sum_n_components)
            hstack of results of transformers. sum_n_components is the
            sum of n_components (output dimension) over transformers. If
            any result is a sparse matrix, everything will be converted to
            sparse matrices.

        """
        # TODO: this should be `feature_names_in_` when we start having it
        if hasattr(X, "columns"):
            self._feature_names_in = np.asarray(X.columns)
        else:
            self._feature_names_in = None
        X = _check_X(X)
        # set n_features_in_ attribute
        self._check_n_features(X, reset=True)
        self._validate_transformers()
        self._validate_column_callables(X)
        self._validate_remainder(X)

        result = self._fit_transform(X, y, _fit_transform_one)

        if not result:
            self._update_fitted_transformers([])
            # All transformers are None
            return np.zeros((X.shape[0], 0))

        Xs, transformers = zip(*result)

        # determine if concatenated output will be sparse or not
        if any(sparse.issparse(Xt) for Xt in Xs):
            nnz = sum(Xt.nnz if sparse.issparse(Xt) else Xt.size
                      for Xt in Xs)
            total = sum(Xt.shape[0] * Xt.shape[1] if sparse.issparse(Xt)
                        else Xt.size for Xt in Xs)
            density = nnz / total
            self.sparse_output_ = density < self.sparse_threshold
        else:
            self.sparse_output_ = False

        self._update_fitted_transformers(transformers)
        self._validate_output(Xs)

        return self._hstack(list(Xs))

    def transform(self, X):
        """Transform X separately by each transformer, concatenate results.

        Parameters
        ----------
        X : {array-like, dataframe} of shape (n_samples, n_features)
            The data to be transformed by subset.

        Returns
        -------
        X_t : {array-like, sparse matrix} of \
                shape (n_samples, sum_n_components)
            hstack of results of transformers. sum_n_components is the
            sum of n_components (output dimension) over transformers. If
            any result is a sparse matrix, everything will be converted to
            sparse matrices.

        Raises
        ------
        RuntimeError
            If both the fitted data and ``X`` carry column names and those
            names differ.

        """
        check_is_fitted(self)
        X = _check_X(X)
        if hasattr(X, "columns"):
            X_feature_names = np.asarray(X.columns)
        else:
            X_feature_names = None

        self._check_n_features(X, reset=False)
        if (self._feature_names_in is not None and
                X_feature_names is not None and
                np.any(self._feature_names_in != X_feature_names)):
            raise RuntimeError(
                "Given feature/column names do not match the ones for the "
                "data given during fit."
            )
        Xs = self._fit_transform(X, None, _transform_one, fitted=True)
        self._validate_output(Xs)

        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))

        return self._hstack(list(Xs))

    def _hstack(self, Xs):
        """Stacks Xs horizontally.

        This allows subclasses to control the stacking behavior, while reusing
        everything else from ColumnTransformer.

        Parameters
        ----------
        Xs : list of {array-like, sparse matrix, dataframe}
        """
        if self.sparse_output_:
            try:
                # since all columns should be numeric before stacking them
                # in a sparse matrix, `check_array` is used for the
                # dtype conversion if necessary.
                converted_Xs = [check_array(X,
                                            accept_sparse=True,
                                            force_all_finite=False)
                                for X in Xs]
            except ValueError as e:
                raise ValueError(
                    "For a sparse output, all columns should "
                    "be a numeric or convertible to a numeric."
                ) from e

            return sparse.hstack(converted_Xs).tocsr()
        else:
            Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
            return np.hstack(Xs)

    def _sk_visual_block_(self):
        """Build the ``_VisualBlock`` describing this estimator.

        The remainder is listed as an extra entry unless it is 'drop'.
        """
        if isinstance(self.remainder, str) and self.remainder == 'drop':
            transformers = self.transformers
        elif hasattr(self, "_remainder"):
            remainder_columns = self._remainder[2]
            # ``_remainder[2]`` is None when every column is claimed by a
            # transformer; only translate an actual list of indices into
            # column labels.
            if (hasattr(self, '_df_columns')
                    and remainder_columns is not None):
                remainder_columns = (
                    self._df_columns[remainder_columns].tolist()
                )
            transformers = chain(self.transformers,
                                 [('remainder', self.remainder,
                                   remainder_columns)])
        else:
            # not fitted yet: the remaining columns are unknown
            transformers = chain(self.transformers,
                                 [('remainder', self.remainder, '')])

        names, transformers, name_details = zip(*transformers)
        return _VisualBlock('parallel', transformers,
                            names=names, name_details=name_details)
|
||
|
|
||
|
|
||
|
def _check_X(X):
|
||
|
"""Use check_array only on lists and other non-array-likes / sparse"""
|
||
|
if hasattr(X, '__array__') or sparse.issparse(X):
|
||
|
return X
|
||
|
return check_array(X, force_all_finite='allow-nan', dtype=object)
|
||
|
|
||
|
|
||
|
def _is_empty_column_selection(column):
|
||
|
"""
|
||
|
Return True if the column selection is empty (empty list or all-False
|
||
|
boolean array).
|
||
|
|
||
|
"""
|
||
|
if hasattr(column, 'dtype') and np.issubdtype(column.dtype, np.bool_):
|
||
|
return not column.any()
|
||
|
elif hasattr(column, '__len__'):
|
||
|
return (len(column) == 0 or
|
||
|
all(isinstance(col, bool) for col in column)
|
||
|
and not any(column))
|
||
|
else:
|
||
|
return False
|
||
|
|
||
|
|
||
|
def _get_transformer_list(estimators):
|
||
|
"""
|
||
|
Construct (name, trans, column) tuples from list
|
||
|
|
||
|
"""
|
||
|
transformers, columns = zip(*estimators)
|
||
|
names, _ = zip(*_name_estimators(transformers))
|
||
|
|
||
|
transformer_list = list(zip(names, transformers, columns))
|
||
|
return transformer_list
|
||
|
|
||
|
|
||
|
def make_column_transformer(*transformers,
                            remainder='drop',
                            sparse_threshold=0.3,
                            n_jobs=None,
                            verbose=False):
    """Construct a ColumnTransformer from the given transformers.

    Shorthand for the ColumnTransformer constructor: transformers are not
    named by the caller (and cannot be); names are generated automatically
    from their types. Weighting with ``transformer_weights`` is not
    available through this function.

    Read more in the :ref:`User Guide <make_column_transformer>`.

    Parameters
    ----------
    *transformers : tuples
        Tuples of the form (transformer, columns) specifying the
        transformer objects to be applied to subsets of the data.

        transformer : {'drop', 'passthrough'} or estimator
            Estimator must support :term:`fit` and :term:`transform`.
            Special-cased strings 'drop' and 'passthrough' are accepted as
            well, to indicate to drop the columns or to pass them through
            untransformed, respectively.
        columns : str,  array-like of str, int, array-like of int, slice, \
                array-like of bool or callable
            Indexes the data on its second axis. Integers are interpreted as
            positional columns, while strings can reference DataFrame columns
            by name. A scalar string or int should be used where
            ``transformer`` expects X to be a 1d array-like (vector),
            otherwise a 2d array will be passed to the transformer.
            A callable is passed the input data `X` and can return any of the
            above. To select multiple columns by name or dtype, you can use
            :obj:`make_column_selector`.

    remainder : {'drop', 'passthrough'} or estimator, default='drop'
        By default, only the specified columns in `transformers` are
        transformed and combined in the output, and the non-specified
        columns are dropped. (default of ``'drop'``).
        By specifying ``remainder='passthrough'``, all remaining columns that
        were not specified in `transformers` will be automatically passed
        through. This subset of columns is concatenated with the output of
        the transformers.
        By setting ``remainder`` to be an estimator, the remaining
        non-specified columns will use the ``remainder`` estimator. The
        estimator must support :term:`fit` and :term:`transform`.

    sparse_threshold : float, default=0.3
        If the output of the different transformers contains sparse matrices,
        these will be stacked as a sparse matrix if the overall density is
        lower than this value. Use ``sparse_threshold=0`` to always return
        dense. When the transformed output consists of all dense data, the
        stacked result will be dense, and this keyword will be ignored.

    n_jobs : int, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : bool, default=False
        If True, the time elapsed while fitting each transformer will be
        printed as it is completed.

    Returns
    -------
    ct : ColumnTransformer

    See Also
    --------
    ColumnTransformer : Class that allows combining the
        outputs of multiple transformer objects used on column subsets
        of the data into a single feature space.

    Examples
    --------
    >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder
    >>> from sklearn.compose import make_column_transformer
    >>> make_column_transformer(
    ...     (StandardScaler(), ['numerical_column']),
    ...     (OneHotEncoder(), ['categorical_column']))
    ColumnTransformer(transformers=[('standardscaler', StandardScaler(...),
                                     ['numerical_column']),
                                    ('onehotencoder', OneHotEncoder(...),
                                     ['categorical_column'])])

    """
    # ``transformer_weights`` is deliberately left out: its keys are
    # transformer names, and here those names are generated automatically.
    named_transformers = _get_transformer_list(transformers)
    options = {
        'n_jobs': n_jobs,
        'remainder': remainder,
        'sparse_threshold': sparse_threshold,
        'verbose': verbose,
    }
    return ColumnTransformer(named_transformers, **options)
|
||
|
|
||
|
|
||
|
class make_column_selector:
    """Create a callable to select columns to be used with
    :class:`ColumnTransformer`.

    :func:`make_column_selector` picks columns by datatype and/or by
    matching the column name against a regex. When several criteria are
    given, a column is selected only if it satisfies **all** of them.

    Parameters
    ----------
    pattern : str, default=None
        Name of columns containing this regex pattern will be included. If
        None, column selection will not be selected based on pattern.

    dtype_include : column dtype or list of column dtypes, default=None
        A selection of dtypes to include. For more details, see
        :meth:`pandas.DataFrame.select_dtypes`.

    dtype_exclude : column dtype or list of column dtypes, default=None
        A selection of dtypes to exclude. For more details, see
        :meth:`pandas.DataFrame.select_dtypes`.

    Returns
    -------
    selector : callable
        Callable for column selection to be used by a
        :class:`ColumnTransformer`.

    See Also
    --------
    ColumnTransformer : Class that allows combining the
        outputs of multiple transformer objects used on column subsets
        of the data into a single feature space.

    Examples
    --------
    >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder
    >>> from sklearn.compose import make_column_transformer
    >>> from sklearn.compose import make_column_selector
    >>> import pandas as pd  # doctest: +SKIP
    >>> X = pd.DataFrame({'city': ['London', 'London', 'Paris', 'Sallisaw'],
    ...                   'rating': [5, 3, 4, 5]})  # doctest: +SKIP
    >>> ct = make_column_transformer(
    ...       (StandardScaler(),
    ...        make_column_selector(dtype_include=np.number)),  # rating
    ...       (OneHotEncoder(),
    ...        make_column_selector(dtype_include=object)))  # city
    >>> ct.fit_transform(X)  # doctest: +SKIP
    array([[ 0.90453403,  1.        ,  0.        ,  0.        ],
           [-1.50755672,  1.        ,  0.        ,  0.        ],
           [-0.30151134,  0.        ,  1.        ,  0.        ],
           [ 0.90453403,  0.        ,  0.        ,  1.        ]])
    """
    @_deprecate_positional_args
    def __init__(self, pattern=None, *, dtype_include=None,
                 dtype_exclude=None):
        self.pattern = pattern
        self.dtype_include = dtype_include
        self.dtype_exclude = dtype_exclude

    def __call__(self, df):
        """Callable for column selection to be used by a
        :class:`ColumnTransformer`.

        Parameters
        ----------
        df : dataframe of shape (n_samples, n_features)
            DataFrame to select columns from.

        Returns
        -------
        columns : list
            Labels of the selected columns.

        Raises
        ------
        ValueError
            If ``df`` has no ``iloc`` attribute.
        """
        if not hasattr(df, 'iloc'):
            raise ValueError("make_column_selector can only be applied to "
                             "pandas dataframes")

        # Only the column labels and dtypes matter, so work on one row.
        first_row = df.iloc[:1]

        no_dtype_filter = (self.dtype_include is None
                           and self.dtype_exclude is None)
        if not no_dtype_filter:
            first_row = first_row.select_dtypes(include=self.dtype_include,
                                                exclude=self.dtype_exclude)

        selected = first_row.columns
        if self.pattern is not None:
            name_matches = selected.str.contains(self.pattern, regex=True)
            selected = selected[name_matches]
        return selected.tolist()
|