3RNN/Lib/site-packages/sklearn/compose/tests/test_column_transformer.py
2024-05-26 19:49:15 +02:00

2744 lines
91 KiB
Python

"""
Test the ColumnTransformer.
"""
import pickle
import re
import warnings
from unittest.mock import Mock
import joblib
import numpy as np
import pytest
from numpy.testing import assert_allclose
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import (
ColumnTransformer,
make_column_selector,
make_column_transformer,
)
from sklearn.compose._column_transformer import _RemainderColsList
from sklearn.exceptions import NotFittedError
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import (
FunctionTransformer,
Normalizer,
OneHotEncoder,
StandardScaler,
)
from sklearn.tests.metadata_routing_common import (
ConsumingTransformer,
_Registry,
check_recorded_metadata,
)
from sklearn.utils._testing import (
_convert_container,
assert_allclose_dense_sparse,
assert_almost_equal,
assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS, parse_version
class Trans(TransformerMixin, BaseEstimator):
    """Pass-through transformer that promotes 1D input to a single 2D column."""

    def fit(self, X, y=None):
        """Learn nothing; return the estimator itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as-is, converting 1D input to 2D first."""
        if hasattr(X, "to_frame"):
            # 1D Series -> 2D DataFrame
            return X.to_frame()
        n_dims = getattr(X, "ndim", 2)
        if n_dims != 1:
            return X
        # 1D array -> 2D array with one column
        return np.atleast_2d(X).T
class DoubleTrans(BaseEstimator):
    """Stateless transformer whose output is twice its input."""

    def fit(self, X, y=None):
        """Learn nothing; return the estimator itself."""
        return self

    def transform(self, X):
        """Return ``2 * X``."""
        doubled = 2 * X
        return doubled
class SparseMatrixTrans(BaseEstimator):
    """Transformer that ignores the values of ``X`` and emits a sparse identity.

    Parameters
    ----------
    csr_container : callable
        Constructor applied to the identity matrix built by ``sparse.eye``.
    """

    def __init__(self, csr_container):
        self.csr_container = csr_container

    def fit(self, X, y=None):
        """Learn nothing; return the estimator itself."""
        return self

    def transform(self, X, y=None):
        """Return an identity of shape ``(len(X), len(X))`` in the container."""
        size = len(X)
        identity = sparse.eye(size, size)
        return self.csr_container(identity)
class TransNo2D(BaseEstimator):
    """Transformer returning its input untouched, with no 1D -> 2D promotion."""

    def fit(self, X, y=None):
        """Learn nothing; return the estimator itself."""
        return self

    def transform(self, X, y=None):
        """Hand back ``X`` exactly as received."""
        return X
class TransRaise(BaseEstimator):
    """Transformer whose ``fit`` and ``transform`` always raise ``ValueError``."""

    def fit(self, X, y=None):
        """Always fail with the sentinel message checked by the tests."""
        raise ValueError("specific message")

    def transform(self, X, y=None):
        """Always fail with the sentinel message checked by the tests."""
        raise ValueError("specific message")
def test_column_transformer():
    """Check column selection and weighting on a plain NumPy array.

    Covers scalar, list, array, slice and boolean-mask specifiers (directly and
    via a callable), two transformers side by side, and ``transformer_weights``.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    first_1d = np.array([0, 1, 2])
    second_1d = np.array([2, 4, 6])
    first_2d = first_1d.reshape(-1, 1)
    both_2d = X_array

    cases = [
        # single column 1D / 2D
        (0, first_2d),
        ([0], first_2d),
        # list-like
        ([0, 1], both_2d),
        (np.array([0, 1]), both_2d),
        # slice
        (slice(0, 1), first_2d),
        (slice(0, 2), both_2d),
        # boolean mask
        (np.array([True, False]), first_2d),
        ([True, False], first_2d),
        (np.array([True, True]), both_2d),
        ([True, True], both_2d),
    ]

    for selection, expected in cases:
        ct = ColumnTransformer([("trans", Trans(), selection)], remainder="drop")
        assert_array_equal(ct.fit_transform(X_array), expected)
        assert_array_equal(ct.fit(X_array).transform(X_array), expected)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer(
            [("trans", Trans(), lambda x: selection)], remainder="drop"
        )
        assert_array_equal(ct.fit_transform(X_array), expected)
        assert_array_equal(ct.fit(X_array).transform(X_array), expected)

    # one transformer per column
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_array), both_2d)
    assert_array_equal(ct.fit(X_array).transform(X_array), both_2d)
    assert len(ct.transformers_) == 2

    # test with transformer_weights
    transformer_weights = {"trans1": 0.1, "trans2": 10}
    both = ColumnTransformer(
        [("trans1", Trans(), [0]), ("trans2", Trans(), [1])],
        transformer_weights=transformer_weights,
    )
    weighted_columns = [
        transformer_weights["trans1"] * first_1d,
        transformer_weights["trans2"] * second_1d,
    ]
    expected_weighted = np.vstack(weighted_columns).T
    assert_array_equal(both.fit_transform(X_array), expected_weighted)
    assert_array_equal(both.fit(X_array).transform(X_array), expected_weighted)
    assert len(both.transformers_) == 2

    # a single weighted transformer spanning both columns
    both = ColumnTransformer(
        [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1}
    )
    assert_array_equal(both.fit_transform(X_array), 0.1 * both_2d)
    assert_array_equal(both.fit(X_array).transform(X_array), 0.1 * both_2d)
    assert len(both.transformers_) == 1
def test_column_transformer_tuple_transformers_parameter():
    """A tuple of transformers must give the same output as the equal list."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    transformers = [("trans1", Trans(), [0]), ("trans2", Trans(), [1])]
    ct_with_list = ColumnTransformer(transformers)
    ct_with_tuple = ColumnTransformer(tuple(transformers))

    # one-shot fit_transform
    out_list = ct_with_list.fit_transform(X_array)
    out_tuple = ct_with_tuple.fit_transform(X_array)
    assert_array_equal(out_list, out_tuple)

    # fit followed by transform
    out_list = ct_with_list.fit(X_array).transform(X_array)
    out_tuple = ct_with_tuple.fit(X_array).transform(X_array)
    assert_array_equal(out_list, out_tuple)
@pytest.mark.parametrize("constructor_name", ["dataframe", "polars"])
def test_column_transformer_dataframe(constructor_name):
    """Check column selection and weighting on dataframe inputs.

    Exercises label-based and positional specifiers, ``transformer_weights``,
    and that the dataframe object itself reaches the inner transformer.
    """
    is_pandas = constructor_name == "dataframe"
    # The "dataframe" container name stands for pandas.
    lib_name = "pandas" if is_pandas else constructor_name
    dataframe_lib = pytest.importorskip(lib_name)

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = _convert_container(
        X_array, constructor_name, columns_name=["first", "second"]
    )

    first_2d = np.array([0, 1, 2]).reshape(-1, 1)
    both_2d = X_array

    cases = [
        # String keys: label based
        # list
        (["first"], first_2d),
        (["first", "second"], both_2d),
        # slice
        (slice("first", "second"), both_2d),
        # int keys: positional
        # list
        ([0], first_2d),
        ([0, 1], both_2d),
        (np.array([0, 1]), both_2d),
        # slice
        (slice(0, 1), first_2d),
        (slice(0, 2), both_2d),
        # boolean mask
        (np.array([True, False]), first_2d),
        ([True, False], first_2d),
    ]
    if is_pandas:
        # Scalars are only supported for pandas dataframes.
        bool_series = dataframe_lib.Series([True, False], index=["first", "second"])
        cases.extend(
            [
                # scalar
                (0, first_2d),
                ("first", first_2d),
                (bool_series, first_2d),
            ]
        )

    for selection, expected in cases:
        ct = ColumnTransformer([("trans", Trans(), selection)], remainder="drop")
        assert_array_equal(ct.fit_transform(X_df), expected)
        assert_array_equal(ct.fit(X_df).transform(X_df), expected)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer(
            [("trans", Trans(), lambda X: selection)], remainder="drop"
        )
        assert_array_equal(ct.fit_transform(X_df), expected)
        assert_array_equal(ct.fit(X_df).transform(X_df), expected)

    # one transformer per column, selected by label
    ct = ColumnTransformer(
        [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])]
    )
    assert_array_equal(ct.fit_transform(X_df), both_2d)
    assert_array_equal(ct.fit(X_df).transform(X_df), both_2d)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != "remainder"

    # one transformer per column, selected by position
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_df), both_2d)
    assert_array_equal(ct.fit(X_df).transform(X_df), both_2d)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != "remainder"

    # test with transformer_weights
    transformer_weights = {"trans1": 0.1, "trans2": 10}
    both = ColumnTransformer(
        [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])],
        transformer_weights=transformer_weights,
    )
    weighted_columns = [
        transformer_weights["trans1"] * X_df["first"],
        transformer_weights["trans2"] * X_df["second"],
    ]
    expected_weighted = np.vstack(weighted_columns).T
    assert_array_equal(both.fit_transform(X_df), expected_weighted)
    assert_array_equal(both.fit(X_df).transform(X_df), expected_weighted)
    assert len(both.transformers_) == 2
    assert both.transformers_[-1][0] != "remainder"

    # test multiple columns
    both = ColumnTransformer(
        [("trans", Trans(), ["first", "second"])], transformer_weights={"trans": 0.1}
    )
    assert_array_equal(both.fit_transform(X_df), 0.1 * both_2d)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * both_2d)
    assert len(both.transformers_) == 1
    assert both.transformers_[-1][0] != "remainder"

    both = ColumnTransformer(
        [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1}
    )
    assert_array_equal(both.fit_transform(X_df), 0.1 * both_2d)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * both_2d)
    assert len(both.transformers_) == 1
    assert both.transformers_[-1][0] != "remainder"

    # ensure pandas object is passed through
    class TransAssert(BaseEstimator):
        """Transformer asserting the type of the object it is asked to transform."""

        def __init__(self, expected_type_transform):
            self.expected_type_transform = expected_type_transform

        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert isinstance(X, self.expected_type_transform)
            if isinstance(X, dataframe_lib.Series):
                X = X.to_frame()
            return X

    frame_checker = TransAssert(expected_type_transform=dataframe_lib.DataFrame)
    ct = ColumnTransformer([("trans", frame_checker, ["first", "second"])])
    ct.fit_transform(X_df)

    if is_pandas:
        # DataFrame protocol does not have 1d columns, so we only test on Pandas
        # dataframes.
        series_checker = TransAssert(expected_type_transform=dataframe_lib.Series)
        ct = ColumnTransformer(
            [("trans", series_checker, "first")],
            remainder="drop",
        )
        ct.fit_transform(X_df)

        # Only test on pandas because the dataframe protocol requires string column
        # names
        # integer column spec + integer column names -> still use positional
        X_df2 = X_df.copy()
        X_df2.columns = [1, 0]
        ct = ColumnTransformer([("trans", Trans(), 0)], remainder="drop")
        assert_array_equal(ct.fit_transform(X_df2), first_2d)
        assert_array_equal(ct.fit(X_df2).transform(X_df2), first_2d)

        assert len(ct.transformers_) == 2
        remainder_name, remainder_trans, remainder_cols = ct.transformers_[-1]
        assert remainder_name == "remainder"
        assert remainder_trans == "drop"
        assert_array_equal(remainder_cols, [1])
@pytest.mark.parametrize("pandas", [True, False], ids=["pandas", "numpy"])
@pytest.mark.parametrize(
    "column_selection",
    [[], np.array([False, False]), [False, False]],
    ids=["list", "bool", "bool_int"],
)
@pytest.mark.parametrize("callable_column", [False, True])
def test_column_transformer_empty_columns(pandas, column_selection, callable_column):
    """A transformer that is given no columns must be skipped, not executed.

    ``TransRaise`` raises on ``fit``/``transform``, so every assertion below
    passing shows it was never called on the empty selection.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    both_2d = X_array

    if pandas:
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X_array, columns=["first", "second"])
    else:
        X = X_array

    # Optionally wrap the empty selection in a callable specifier.
    column = (lambda X: column_selection) if callable_column else column_selection

    # empty transformer listed last
    ct = ColumnTransformer(
        [("trans1", Trans(), [0, 1]), ("trans2", TransRaise(), column)]
    )
    assert_array_equal(ct.fit_transform(X), both_2d)
    assert_array_equal(ct.fit(X).transform(X), both_2d)
    assert len(ct.transformers_) == 2
    assert isinstance(ct.transformers_[1][1], TransRaise)

    # empty transformer listed first
    ct = ColumnTransformer(
        [("trans1", TransRaise(), column), ("trans2", Trans(), [0, 1])]
    )
    assert_array_equal(ct.fit_transform(X), both_2d)
    assert_array_equal(ct.fit(X).transform(X), both_2d)
    assert len(ct.transformers_) == 2
    assert isinstance(ct.transformers_[0][1], TransRaise)

    # only the empty transformer, everything else passed through
    ct = ColumnTransformer([("trans", TransRaise(), column)], remainder="passthrough")
    assert_array_equal(ct.fit_transform(X), both_2d)
    assert_array_equal(ct.fit(X).transform(X), both_2d)
    assert len(ct.transformers_) == 2  # including remainder
    assert isinstance(ct.transformers_[0][1], TransRaise)

    # only the empty transformer, everything else dropped
    empty_output = np.array([[], [], []])
    ct = ColumnTransformer([("trans", TransRaise(), column)], remainder="drop")
    assert_array_equal(ct.fit_transform(X), empty_output)
    assert_array_equal(ct.fit(X).transform(X), empty_output)
    assert len(ct.transformers_) == 2  # including remainder
    assert isinstance(ct.transformers_[0][1], TransRaise)
def test_column_transformer_output_indices():
    """Check the ``output_indices_`` attribute on array input."""
    X_array = np.arange(6).reshape(3, 2)

    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
    Xt = ct.fit_transform(X_array)
    indices = ct.output_indices_
    assert indices == {
        "trans1": slice(0, 1),
        "trans2": slice(1, 2),
        "remainder": slice(0, 0),
    }
    assert_array_equal(Xt[:, [0]], Xt[:, indices["trans1"]])
    assert_array_equal(Xt[:, [1]], Xt[:, indices["trans2"]])

    # test with transformer_weights and multiple columns
    ct = ColumnTransformer(
        [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1}
    )
    Xt = ct.fit_transform(X_array)
    indices = ct.output_indices_
    assert indices == {"trans": slice(0, 2), "remainder": slice(0, 0)}
    assert_array_equal(Xt[:, [0, 1]], Xt[:, indices["trans"]])
    assert_array_equal(Xt[:, []], Xt[:, indices["remainder"]])

    # test case that ensures that the attribute does also work when
    # a given transformer doesn't have any columns to work on
    ct = ColumnTransformer([("trans1", Trans(), [0, 1]), ("trans2", TransRaise(), [])])
    Xt = ct.fit_transform(X_array)
    indices = ct.output_indices_
    assert indices == {
        "trans1": slice(0, 2),
        "trans2": slice(0, 0),
        "remainder": slice(0, 0),
    }
    assert_array_equal(Xt[:, [0, 1]], Xt[:, indices["trans1"]])
    assert_array_equal(Xt[:, []], Xt[:, indices["trans2"]])
    assert_array_equal(Xt[:, []], Xt[:, indices["remainder"]])

    # empty transformer combined with a passthrough remainder
    ct = ColumnTransformer([("trans", TransRaise(), [])], remainder="passthrough")
    Xt = ct.fit_transform(X_array)
    indices = ct.output_indices_
    assert indices == {"trans": slice(0, 0), "remainder": slice(0, 2)}
    assert_array_equal(Xt[:, []], Xt[:, indices["trans"]])
    assert_array_equal(Xt[:, [0, 1]], Xt[:, indices["remainder"]])
def test_column_transformer_output_indices_df():
    """Check the ``output_indices_`` attribute on pandas DataFrame input."""
    pd = pytest.importorskip("pandas")

    X_df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=["first", "second"])
    expected_indices = {
        "trans1": slice(0, 1),
        "trans2": slice(1, 2),
        "remainder": slice(0, 0),
    }

    # columns selected by label
    ct = ColumnTransformer(
        [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])]
    )
    Xt = ct.fit_transform(X_df)
    indices = ct.output_indices_
    assert indices == expected_indices
    assert_array_equal(Xt[:, [0]], Xt[:, indices["trans1"]])
    assert_array_equal(Xt[:, [1]], Xt[:, indices["trans2"]])
    assert_array_equal(Xt[:, []], Xt[:, indices["remainder"]])

    # columns selected by position
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
    Xt = ct.fit_transform(X_df)
    indices = ct.output_indices_
    assert indices == expected_indices
    assert_array_equal(Xt[:, [0]], Xt[:, indices["trans1"]])
    assert_array_equal(Xt[:, [1]], Xt[:, indices["trans2"]])
    assert_array_equal(Xt[:, []], Xt[:, indices["remainder"]])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_column_transformer_sparse_array(csr_container):
    """Check column selection on sparse input with a high sparse threshold."""
    X_sparse = csr_container(sparse.eye(3, 2))

    # no distinction between 1D and 2D
    first_col = X_sparse[:, [0]]
    all_cols = X_sparse

    remainder_cases = [("drop", first_col), ("passthrough", all_cols)]
    for col in [(0,), [0], slice(0, 1)]:
        for remainder, expected in remainder_cases:
            ct = ColumnTransformer(
                [("trans", Trans(), col)], remainder=remainder, sparse_threshold=0.8
            )
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), expected)
            assert_allclose_dense_sparse(
                ct.fit(X_sparse).transform(X_sparse), expected
            )

    # selecting every column
    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([("trans", Trans(), col)], sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), all_cols)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse), all_cols)
def test_column_transformer_list():
    """Check that a plain list of lists is accepted as input."""
    nan = float("nan")
    X_list = [[1, nan, "a"], [0, 0, "b"]]
    expected_result = np.array(
        [
            [1, nan, 1, 0],
            [-1, 0, 0, 1],
        ]
    )

    steps = [
        ("numerical", StandardScaler(), [0, 1]),
        ("categorical", OneHotEncoder(), [2]),
    ]
    ct = ColumnTransformer(steps)

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_column_transformer_sparse_stacking(csr_container):
    """Check stacking of dense and sparse outputs around ``sparse_threshold``."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # high threshold -> stacked result stays sparse
    col_trans = ColumnTransformer(
        [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)],
        sparse_threshold=0.8,
    )
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    n_rows = X_trans.shape[0]
    assert sparse.issparse(X_trans)
    assert X_trans.shape == (n_rows, n_rows + 1)
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(n_rows))
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != "remainder"

    # low threshold -> stacked result is densified
    col_trans = ColumnTransformer(
        [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)],
        sparse_threshold=0.1,
    )
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    n_rows = X_trans.shape[0]
    assert not sparse.issparse(X_trans)
    assert X_trans.shape == (n_rows, n_rows + 1)
    assert_array_equal(X_trans[:, 1:], np.eye(n_rows))
def test_column_transformer_mixed_cols_sparse():
    """Check sparse output when passthrough columns have object dtype."""
    df = np.array([["a", 1, True], ["b", 2, False]], dtype="O")

    ct = make_column_transformer(
        (OneHotEncoder(), [0]), ("passthrough", [1, 2]), sparse_threshold=1.0
    )

    # this shouldn't fail, since boolean can be coerced into a numeric
    # See: https://github.com/scikit-learn/scikit-learn/issues/11912
    X_trans = ct.fit_transform(df)
    assert X_trans.getformat() == "csr"
    expected_dense = np.array([[1, 0, 1, 1], [0, 1, 2, 0]])
    assert_array_equal(X_trans.toarray(), expected_dense)

    ct = make_column_transformer(
        (OneHotEncoder(), [0]), ("passthrough", [0]), sparse_threshold=1.0
    )
    # this fails since strings `a` and `b` cannot be
    # coerced into a numeric.
    with pytest.raises(ValueError, match="For a sparse output, all columns should"):
        ct.fit_transform(df)
def test_column_transformer_sparse_threshold():
    """Check how ``sparse_threshold`` decides between sparse and dense output."""
    X_array = np.array([["a", "b"], ["A", "B"]], dtype=object).T
    # above data has sparsity of 4 / 8 = 0.5

    # apply threshold even if all sparse
    col_trans = ColumnTransformer(
        [("trans1", OneHotEncoder(), [0]), ("trans2", OneHotEncoder(), [1])],
        sparse_threshold=0.2,
    )
    res = col_trans.fit_transform(X_array)
    assert not sparse.issparse(res)
    assert not col_trans.sparse_output_

    # mixed -> sparsity of (4 + 2) / 8 = 0.75
    # Thresholds strictly above 0.75 keep the output sparse; the others do not.
    mixed_cases = [(0.75001, True), (1, True), (0.75, False), (0, False)]
    for thres, expect_sparse in mixed_cases:
        col_trans = ColumnTransformer(
            [
                ("trans1", OneHotEncoder(sparse_output=True), [0]),
                ("trans2", OneHotEncoder(sparse_output=False), [1]),
            ],
            sparse_threshold=thres,
        )
        res = col_trans.fit_transform(X_array)
        assert sparse.issparse(res) == expect_sparse
        assert bool(col_trans.sparse_output_) == expect_sparse

    # if nothing is sparse -> no sparse
    for thres in [0.33, 0, 1]:
        col_trans = ColumnTransformer(
            [
                ("trans1", OneHotEncoder(sparse_output=False), [0]),
                ("trans2", OneHotEncoder(sparse_output=False), [1]),
            ],
            sparse_threshold=thres,
        )
        res = col_trans.fit_transform(X_array)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_
def test_column_transformer_error_msg_1D():
    """Check the errors raised when a scalar column spec yields 1D data."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T

    # StandardScaler receives 1D data because the column is given as a scalar.
    col_trans = ColumnTransformer([("trans", StandardScaler(), 0)])
    msg = "1D data passed to a transformer"
    for method in (col_trans.fit, col_trans.fit_transform):
        with pytest.raises(ValueError, match=msg):
            method(X_array)

    # An error raised by the inner transformer keeps its own message.
    col_trans = ColumnTransformer([("trans", TransRaise(), 0)])
    for method in (col_trans.fit, col_trans.fit_transform):
        with pytest.raises(ValueError, match="specific message"):
            method(X_array)
def test_2D_transformer_output():
    """A transformer returning non-2D output must be reported by name."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # if one transformer is dropped, test that name is still correct
    steps = [("trans1", "drop", 0), ("trans2", TransNo2D(), 1)]
    ct = ColumnTransformer(steps)

    expected_msg = "the 'trans2' transformer should be 2D"
    with pytest.raises(ValueError, match=expected_msg):
        ct.fit_transform(X_array)
    # because fit is also doing transform, this raises already on fit
    with pytest.raises(ValueError, match=expected_msg):
        ct.fit(X_array)
def test_2D_transformer_output_pandas():
    """Non-2D output from a scalar-selected DataFrame column is reported by name."""
    pd = pytest.importorskip("pandas")

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=["col1", "col2"])

    # The column is selected with a scalar label and TransNo2D returns its
    # input unchanged.
    ct = ColumnTransformer([("trans1", TransNo2D(), "col1")])

    expected_msg = "the 'trans1' transformer should be 2D"
    with pytest.raises(ValueError, match=expected_msg):
        ct.fit_transform(X_df)
    # because fit is also doing transform, this raises already on fit
    with pytest.raises(ValueError, match=expected_msg):
        ct.fit(X_df)
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
def test_column_transformer_invalid_columns(remainder):
    """Check the errors for invalid column specs and mismatched feature counts."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # general invalid
    generally_invalid = [1.5, ["string", 1], slice(1, "s"), np.array([1.0])]
    for col in generally_invalid:
        ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder)
        with pytest.raises(ValueError, match="No valid specification"):
            ct.fit(X_array)

    # invalid for arrays
    invalid_for_arrays = ["string", ["string", "other"], slice("a", "b")]
    for col in invalid_for_arrays:
        ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder)
        with pytest.raises(ValueError, match="Specifying the columns"):
            ct.fit(X_array)

    # transformed n_features does not match fitted n_features
    ct = ColumnTransformer([("trans", Trans(), [0, 1])], remainder=remainder)
    ct.fit(X_array)

    X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
    msg = "X has 3 features, but ColumnTransformer is expecting 2 features as input."
    with pytest.raises(ValueError, match=msg):
        ct.transform(X_array_more)

    X_array_fewer = np.array([[0, 1, 2]]).T
    err_msg = (
        "X has 1 features, but ColumnTransformer is expecting 2 features as input."
    )
    with pytest.raises(ValueError, match=err_msg):
        ct.transform(X_array_fewer)
def test_column_transformer_invalid_transformer():
    """An estimator without ``transform`` must be rejected with a TypeError."""

    class NoTrans(BaseEstimator):
        """Estimator that offers ``fit`` and ``predict`` but no ``transform``."""

        def fit(self, X, y=None):
            return self

        def predict(self, X):
            return X

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    ct = ColumnTransformer([("trans", NoTrans(), [0])])

    expected_msg = "All estimators should implement fit and transform"
    with pytest.raises(TypeError, match=expected_msg):
        ct.fit(X_array)
def test_make_column_transformer():
    """Check the names, estimators and columns built by the helper."""
    scaler, norm = StandardScaler(), Normalizer()

    ct = make_column_transformer((scaler, "first"), (norm, ["second"]))

    # Split the (name, transformer, columns) triples into three tuples.
    names, transformers, columns = zip(*ct.transformers)
    assert names == ("standardscaler", "normalizer")
    assert transformers == (scaler, norm)
    assert columns == ("first", ["second"])
def test_make_column_transformer_pandas():
    """The helper and the explicit constructor must agree on pandas columns."""
    pd = pytest.importorskip("pandas")

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=["first", "second"])

    norm = Normalizer()
    explicit = ColumnTransformer([("norm", Normalizer(), X_df.columns)])
    via_helper = make_column_transformer((norm, X_df.columns))

    out_explicit = explicit.fit_transform(X_df)
    out_helper = via_helper.fit_transform(X_df)
    assert_almost_equal(out_explicit, out_helper)
def test_make_column_transformer_kwargs():
    """Check keyword forwarding and rejection of unsupported keywords."""
    scaler, norm = StandardScaler(), Normalizer()

    ct = make_column_transformer(
        (scaler, "first"),
        (norm, ["second"]),
        n_jobs=3,
        remainder="drop",
        sparse_threshold=0.5,
    )

    # The keyword arguments must not affect the generated transformer triples.
    reference = make_column_transformer((scaler, "first"), (norm, ["second"]))
    assert ct.transformers == reference.transformers
    assert ct.n_jobs == 3
    assert ct.remainder == "drop"
    assert ct.sparse_threshold == 0.5

    # invalid keyword parameters should raise an error message
    msg = re.escape(
        "make_column_transformer() got an unexpected "
        "keyword argument 'transformer_weights'"
    )
    with pytest.raises(TypeError, match=msg):
        make_column_transformer(
            (scaler, "first"),
            (norm, ["second"]),
            transformer_weights={"pca": 10, "Transf": 1},
        )
def test_make_column_transformer_remainder_transformer():
    """An estimator passed as ``remainder`` is stored on the result."""
    scaler, norm = StandardScaler(), Normalizer()
    remainder = StandardScaler()

    ct = make_column_transformer(
        (scaler, "first"), (norm, ["second"]), remainder=remainder
    )

    assert ct.remainder == remainder
def test_column_transformer_get_set_params():
    """Check ``get_params`` before and after ``set_params`` calls."""
    ct = ColumnTransformer(
        [("trans1", StandardScaler(), [0]), ("trans2", StandardScaler(), [1])]
    )

    # initial state: both scalers expose their nested parameters
    exp = dict(
        n_jobs=None,
        remainder="drop",
        sparse_threshold=0.3,
        trans1=ct.transformers[0][1],
        trans1__copy=True,
        trans1__with_mean=True,
        trans1__with_std=True,
        trans2=ct.transformers[1][1],
        trans2__copy=True,
        trans2__with_mean=True,
        trans2__with_std=True,
        transformers=ct.transformers,
        transformer_weights=None,
        verbose_feature_names_out=True,
        verbose=False,
        force_int_remainder_cols=True,
    )
    assert ct.get_params() == exp

    # a nested parameter can be changed through the composite
    ct.set_params(trans1__with_mean=False)
    assert not ct.get_params()["trans1__with_mean"]

    # replacing a transformer by "passthrough" removes its nested parameters
    ct.set_params(trans1="passthrough")
    exp = dict(
        n_jobs=None,
        remainder="drop",
        sparse_threshold=0.3,
        trans1="passthrough",
        trans2=ct.transformers[1][1],
        trans2__copy=True,
        trans2__with_mean=True,
        trans2__with_std=True,
        transformers=ct.transformers,
        transformer_weights=None,
        verbose_feature_names_out=True,
        verbose=False,
        force_int_remainder_cols=True,
    )
    assert ct.get_params() == exp
def test_column_transformer_named_estimators():
    """Check key and attribute access on ``named_transformers_`` after fit."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T
    steps = [
        ("trans1", StandardScaler(), [0]),
        ("trans2", StandardScaler(with_std=False), [1]),
    ]
    ct = ColumnTransformer(steps)

    # the fitted attribute only exists once fit has been called
    assert not hasattr(ct, "transformers_")
    ct.fit(X_array)
    assert hasattr(ct, "transformers_")

    named = ct.named_transformers_
    assert isinstance(named["trans1"], StandardScaler)
    assert isinstance(named.trans1, StandardScaler)
    assert isinstance(named["trans2"], StandardScaler)
    assert isinstance(named.trans2, StandardScaler)
    assert not named.trans2.with_std
    # check that these are the fitted transformers
    assert named.trans1.mean_ == 1.0
def test_column_transformer_cloning():
    """Fitting must leave the user-supplied transformers unfitted."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T

    # via fit
    ct = ColumnTransformer([("trans", StandardScaler(), [0])])
    ct.fit(X_array)
    original, fitted = ct.transformers[0][1], ct.transformers_[0][1]
    assert not hasattr(original, "mean_")
    assert hasattr(fitted, "mean_")

    # via fit_transform
    ct = ColumnTransformer([("trans", StandardScaler(), [0])])
    ct.fit_transform(X_array)
    original, fitted = ct.transformers[0][1], ct.transformers_[0][1]
    assert not hasattr(original, "mean_")
    assert hasattr(fitted, "mean_")
def test_column_transformer_get_feature_names():
    """Check the errors raised by ``get_feature_names_out``."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T
    ct = ColumnTransformer([("trans", Trans(), [0, 1])])

    # raise correct error when not fitted
    with pytest.raises(NotFittedError):
        ct.get_feature_names_out()

    # raise correct error when no feature names are available
    ct.fit(X_array)
    expected_msg = re.escape(
        "Transformer trans (type Trans) does not provide get_feature_names_out"
    )
    with pytest.raises(AttributeError, match=expected_msg):
        ct.get_feature_names_out()
def test_column_transformer_special_strings():
    """Check the ``"drop"`` and ``"passthrough"`` placeholder transformers."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T

    # one 'drop' -> ignore
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", "drop", [1])])
    expected = np.array([[0.0], [1.0], [2.0]])
    assert_array_equal(ct.fit_transform(X_array), expected)
    assert_array_equal(ct.fit(X_array).transform(X_array), expected)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != "remainder"

    # all 'drop' -> return shape 0 array
    ct = ColumnTransformer([("trans1", "drop", [0]), ("trans2", "drop", [1])])
    assert_array_equal(ct.fit(X_array).transform(X_array).shape, (3, 0))
    assert_array_equal(ct.fit_transform(X_array).shape, (3, 0))
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != "remainder"

    # 'passthrough'
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", "passthrough", [1])])
    expected = X_array
    assert_array_equal(ct.fit_transform(X_array), expected)
    assert_array_equal(ct.fit(X_array).transform(X_array), expected)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != "remainder"
def test_column_transformer_remainder():
    """Check the ``remainder`` entry appended to ``transformers_``."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    first_2d = np.array([0, 1, 2]).reshape(-1, 1)
    second_2d = np.array([2, 4, 6]).reshape(-1, 1)
    both_2d = X_array

    # default drop
    ct = ColumnTransformer([("trans1", Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), first_2d)
    assert_array_equal(ct.fit(X_array).transform(X_array), first_2d)
    assert len(ct.transformers_) == 2
    name, trans, cols = ct.transformers_[-1]
    assert name == "remainder"
    assert trans == "drop"
    assert_array_equal(cols, [1])

    # specify passthrough
    ct = ColumnTransformer([("trans", Trans(), [0])], remainder="passthrough")
    assert_array_equal(ct.fit_transform(X_array), both_2d)
    assert_array_equal(ct.fit(X_array).transform(X_array), both_2d)
    assert len(ct.transformers_) == 2
    name, trans, cols = ct.transformers_[-1]
    assert name == "remainder"
    assert isinstance(trans, FunctionTransformer)
    assert_array_equal(cols, [1])

    # column order is not preserved (passed through added to end)
    reversed_cols = both_2d[:, ::-1]
    ct = ColumnTransformer([("trans1", Trans(), [1])], remainder="passthrough")
    assert_array_equal(ct.fit_transform(X_array), reversed_cols)
    assert_array_equal(ct.fit(X_array).transform(X_array), reversed_cols)
    assert len(ct.transformers_) == 2
    name, trans, cols = ct.transformers_[-1]
    assert name == "remainder"
    assert isinstance(trans, FunctionTransformer)
    assert_array_equal(cols, [0])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([("trans1", "drop", [0])], remainder="passthrough")
    assert_array_equal(ct.fit_transform(X_array), second_2d)
    assert_array_equal(ct.fit(X_array).transform(X_array), second_2d)
    assert len(ct.transformers_) == 2
    name, trans, cols = ct.transformers_[-1]
    assert name == "remainder"
    assert isinstance(trans, FunctionTransformer)
    assert_array_equal(cols, [1])

    # check default for make_column_transformer
    ct = make_column_transformer((Trans(), [0]))
    assert ct.remainder == "drop"
# TODO(1.7): check for deprecated force_int_remainder_cols
# TODO(1.9): remove force_int but keep the test
@pytest.mark.parametrize(
    "cols1, cols2",
    [
        ([0], [False, True, False]),  # mix types
        ([0], [1]),  # ints
        (lambda x: [0], lambda x: [1]),  # callables
    ],
)
@pytest.mark.parametrize("force_int", [False, True])
def test_column_transformer_remainder_dtypes_ints(force_int, cols1, cols2):
    """Remainder columns are indices unless all columns are names or masks.

    This must hold for either value of `force_int_remainder_cols`, and reading
    the remainder columns must not emit a warning.
    """
    X = np.ones((1, 3))

    ct = make_column_transformer(
        (Trans(), cols1),
        (Trans(), cols2),
        remainder="passthrough",
        force_int_remainder_cols=force_int,
    )

    # Turn any warning into an error for the fit and the column access.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        ct.fit_transform(X)
        remainder_cols = ct.transformers_[-1][-1]
        assert remainder_cols[0] == 2
# TODO(1.7): check for deprecated force_int_remainder_cols
# TODO(1.9): remove force_int but keep the test
@pytest.mark.parametrize(
    "force_int, cols1, cols2, expected_cols",
    [
        (True, ["A"], ["B"], [2]),
        (False, ["A"], ["B"], ["C"]),
        (True, [True, False, False], [False, True, False], [2]),
        (False, [True, False, False], [False, True, False], [False, False, True]),
    ],
)
def test_column_transformer_remainder_dtypes(force_int, cols1, cols2, expected_cols):
    """Remainder columns mirror the other specs when those are all names or masks.

    With `force_int = True` they are integer indices instead, and reading them
    emits a FutureWarning.
    """
    X = np.ones((1, 3))

    uses_column_names = isinstance(cols1[0], str)
    if uses_column_names:
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X, columns=["A", "B", "C"])

    # if inputs are column names store remainder columns as column names unless
    # force_int_remainder_cols is True
    ct = make_column_transformer(
        (Trans(), cols1),
        (Trans(), cols2),
        remainder="passthrough",
        force_int_remainder_cols=force_int,
    )

    # Fitting itself must never warn.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        ct.fit_transform(X)

    if not force_int:
        # Reading the columns must stay silent.
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            cols = ct.transformers_[-1][-1]
            cols[0]
    else:
        # If we forced using ints and we access the remainder columns a warning is shown
        match = "The format of the columns of the 'remainder' transformer"
        cols = ct.transformers_[-1][-1]
        with pytest.warns(FutureWarning, match=match):
            cols[0]

    assert cols == expected_cols
def test_remainder_list_repr():
    """Check the text representations of `_RemainderColsList`."""
    expected_text = "[0, 1]"
    cols = _RemainderColsList([0, 1], warning_enabled=False)

    assert str(cols) == expected_text
    assert repr(cols) == expected_text

    # `_repr_pretty_` must send the same text to the printer it is given.
    printer = Mock()
    cols._repr_pretty_(printer, False)
    printer.text.assert_called_once_with(expected_text)
@pytest.mark.parametrize(
    "key, expected_cols",
    [
        ([0], [1]),
        (np.array([0]), [1]),
        (slice(0, 1), [1]),
        (np.array([True, False]), [False, True]),
    ],
)
def test_column_transformer_remainder_numpy(key, expected_cols):
    """Check passthrough remainder for several column specs on an array."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    both_2d = X_array

    ct = ColumnTransformer(
        [("trans1", Trans(), key)],
        remainder="passthrough",
        force_int_remainder_cols=False,
    )
    assert_array_equal(ct.fit_transform(X_array), both_2d)
    assert_array_equal(ct.fit(X_array).transform(X_array), both_2d)

    assert len(ct.transformers_) == 2
    name, trans, cols = ct.transformers_[-1]
    assert name == "remainder"
    assert isinstance(trans, FunctionTransformer)
    assert cols == expected_cols
@pytest.mark.parametrize(
    "key, expected_cols",
    [
        ([0], [1]),
        (slice(0, 1), [1]),
        (np.array([True, False]), [False, True]),
        (["first"], ["second"]),
        ("pd-index", ["second"]),
        (np.array(["first"]), ["second"]),
        (np.array(["first"], dtype=object), ["second"]),
        (slice(None, "first"), ["second"]),
        (slice("first", "first"), ["second"]),
    ],
)
def test_column_transformer_remainder_pandas(key, expected_cols):
    """Passthrough remainder on dataframe input for several column specifiers."""
    pd = pytest.importorskip("pandas")
    # pandas is only imported inside the test, so the parametrization carries
    # the "pd-index" placeholder which is swapped for a real Index here.
    if isinstance(key, str) and key == "pd-index":
        key = pd.Index(["first"])

    data = np.array([[0, 1, 2], [2, 4, 6]]).T
    frame = pd.DataFrame(data, columns=["first", "second"])
    # The output is expected to equal the underlying array.
    expected_output = data

    transformer = ColumnTransformer(
        [("trans1", Trans(), key)],
        force_int_remainder_cols=False,
        remainder="passthrough",
    )
    assert_array_equal(transformer.fit_transform(frame), expected_output)
    assert_array_equal(transformer.fit(frame).transform(frame), expected_output)

    # The remainder is appended as the last fitted transformer.
    assert len(transformer.transformers_) == 2
    remainder_entry = transformer.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert isinstance(remainder_entry[1], FunctionTransformer)
    assert remainder_entry[2] == expected_cols
@pytest.mark.parametrize(
    "key, expected_cols",
    [
        ([0], [1, 2]),
        (np.array([0]), [1, 2]),
        (slice(0, 1), [1, 2]),
        (np.array([True, False, False]), [False, True, True]),
    ],
)
def test_column_transformer_remainder_transformer(key, expected_cols):
    """An estimator passed as `remainder` is applied to the leftover columns."""
    data = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    # `DoubleTrans` handles the remainder, so columns 1 and 2 come out doubled
    # while column 0 is left as is.
    expected_output = data.copy()
    expected_output[:, 1:3] *= 2

    transformer = ColumnTransformer(
        [("trans1", Trans(), key)],
        force_int_remainder_cols=False,
        remainder=DoubleTrans(),
    )
    assert_array_equal(transformer.fit_transform(data), expected_output)
    assert_array_equal(transformer.fit(data).transform(data), expected_output)

    assert len(transformer.transformers_) == 2
    remainder_entry = transformer.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert isinstance(remainder_entry[1], DoubleTrans)
    assert remainder_entry[2] == expected_cols
def test_column_transformer_no_remaining_remainder_transformer():
    """No remainder entry is stored when the transformers claim every column."""
    data = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    transformer = ColumnTransformer(
        [("trans1", Trans(), [0, 1, 2])],
        remainder=DoubleTrans(),
    )

    # All three columns go through `Trans`; the output must equal the input.
    assert_array_equal(transformer.fit_transform(data), data)
    assert_array_equal(transformer.fit(data).transform(data), data)

    fitted = transformer.transformers_
    assert len(fitted) == 1
    assert fitted[-1][0] != "remainder"
def test_column_transformer_drops_all_remainder_transformer():
    """Only the remainder estimator contributes when the sole transformer drops."""
    data = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    # Column 0 is dropped; `DoubleTrans` doubles the two remaining columns.
    expected_output = 2 * data[:, 1:3]

    transformer = ColumnTransformer(
        [("trans1", "drop", [0])],
        remainder=DoubleTrans(),
    )
    assert_array_equal(transformer.fit_transform(data), expected_output)
    assert_array_equal(transformer.fit(data).transform(data), expected_output)

    assert len(transformer.transformers_) == 2
    remainder_entry = transformer.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert isinstance(remainder_entry[1], DoubleTrans)
    assert_array_equal(remainder_entry[2], [1, 2])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_column_transformer_sparse_remainder_transformer(csr_container):
    """A sparse-producing remainder estimator is stacked with a dense column."""
    data = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    transformer = ColumnTransformer(
        [("trans1", Trans(), [0])],
        sparse_threshold=0.8,
        remainder=SparseMatrixTrans(csr_container),
    )

    transformed = transformer.fit_transform(data)
    assert sparse.issparse(transformed)

    # `SparseMatrixTrans` returns an (n_samples, n_samples) identity, i.e. 3
    # features here, on top of the single column handled by `trans1`.
    assert transformed.shape == (3, 3 + 1)
    expected_dense = np.hstack((data[:, 0].reshape(-1, 1), np.eye(3)))
    assert_array_equal(transformed.toarray(), expected_dense)

    assert len(transformer.transformers_) == 2
    remainder_entry = transformer.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert isinstance(remainder_entry[1], SparseMatrixTrans)
    assert_array_equal(remainder_entry[2], [1, 2])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_column_transformer_drop_all_sparse_remainder_transformer(csr_container):
    """A sparse remainder estimator alone defines the output after a drop."""
    data = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    transformer = ColumnTransformer(
        [("trans1", "drop", [0])],
        sparse_threshold=0.8,
        remainder=SparseMatrixTrans(csr_container),
    )

    transformed = transformer.fit_transform(data)
    assert sparse.issparse(transformed)

    # `SparseMatrixTrans` returns an (n_samples, n_samples) identity matrix,
    # and nothing else is kept, hence a 3 x 3 output.
    assert transformed.shape == (3, 3)
    assert_array_equal(transformed.toarray(), np.eye(3))

    assert len(transformer.transformers_) == 2
    remainder_entry = transformer.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert isinstance(remainder_entry[1], SparseMatrixTrans)
    assert_array_equal(remainder_entry[2], [1, 2])
def test_column_transformer_get_set_params_with_remainder():
    """`get_params`/`set_params` expose the remainder estimator's parameters."""
    transformer = ColumnTransformer(
        [("trans1", StandardScaler(), [0])],
        remainder=StandardScaler(),
    )

    # Initial state: both scalers expose their own nested parameters.
    expected_params = {
        "transformers": transformer.transformers,
        "remainder": transformer.remainder,
        "trans1": transformer.transformers[0][1],
        "trans1__copy": True,
        "trans1__with_mean": True,
        "trans1__with_std": True,
        "remainder__copy": True,
        "remainder__with_mean": True,
        "remainder__with_std": True,
        "n_jobs": None,
        "sparse_threshold": 0.3,
        "transformer_weights": None,
        "verbose": False,
        "verbose_feature_names_out": True,
        "force_int_remainder_cols": True,
    }
    assert transformer.get_params() == expected_params

    # A nested remainder parameter can be set through the `remainder__` prefix.
    transformer.set_params(remainder__with_std=False)
    assert not transformer.get_params()["remainder__with_std"]

    # Replacing `trans1` with "passthrough" removes its nested parameters.
    transformer.set_params(trans1="passthrough")
    expected_params = {
        "transformers": transformer.transformers,
        "remainder": transformer.remainder,
        "trans1": "passthrough",
        "remainder__copy": True,
        "remainder__with_mean": True,
        "remainder__with_std": False,
        "n_jobs": None,
        "sparse_threshold": 0.3,
        "transformer_weights": None,
        "verbose": False,
        "verbose_feature_names_out": True,
        "force_int_remainder_cols": True,
    }
    assert transformer.get_params() == expected_params
def test_column_transformer_no_estimators():
    """With no transformers, the remainder estimator receives every column."""
    data = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).astype("float").T
    transformer = ColumnTransformer([], remainder=StandardScaler())

    # Remainder parameters are reachable even before fitting.
    assert transformer.get_params()["remainder__with_mean"]

    transformed = transformer.fit_transform(data)
    assert transformed.shape == data.shape

    assert len(transformer.transformers_) == 1
    remainder_entry = transformer.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert remainder_entry[2] == [0, 1, 2]
@pytest.mark.parametrize(
    ["est", "pattern"],
    [
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", Trans(), [1])],
                remainder=DoubleTrans(),
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n"
                r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", Trans(), [1])],
                remainder="passthrough",
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n"
                r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", "drop", [1])],
                remainder="passthrough",
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 2\) Processing remainder.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", "passthrough", [1])],
                remainder="passthrough",
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n"
                r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer([("trans1", Trans(), [0])], remainder="passthrough"),
            (
                r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 2\) Processing remainder.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", Trans(), [1])], remainder="drop"
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 2\) Processing trans2.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer([("trans1", Trans(), [0])], remainder="drop"),
            r"\[ColumnTransformer\].*\(1 of 1\) Processing trans1.* total=.*\n$",
        ),
    ],
)
@pytest.mark.parametrize("method", ["fit", "fit_transform"])
def test_column_transformer_verbose(est, pattern, method, capsys):
    """`verbose=True` prints one progress line per processed step, else nothing."""
    data = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    fit_method = getattr(est, method)

    # Silent mode: nothing may reach stdout.
    est.set_params(verbose=False)
    fit_method(data)
    silent_capture = capsys.readouterr()
    assert not silent_capture.out, "Got output for verbose=False"

    # Verbose mode: stdout must match the expected sequence of progress lines.
    est.set_params(verbose=True)
    fit_method(data)
    verbose_capture = capsys.readouterr()
    assert re.match(pattern, verbose_capture[0])
def test_column_transformer_no_estimators_set_params():
    """`set_params` works on a ColumnTransformer that has no transformers."""
    transformer = ColumnTransformer([])
    configured = transformer.set_params(n_jobs=2)
    assert configured.n_jobs == 2
def test_column_transformer_callable_specifier():
    """A callable column specifier is called with the full input array."""
    data = np.array([[0, 1, 2], [2, 4, 6]]).T
    expected_first_column = np.array([[0, 1, 2]]).T

    def select_first(X):
        # The selector must see the complete input, not a slice of it.
        assert_array_equal(X, data)
        return [0]

    transformer = ColumnTransformer(
        [("trans", Trans(), select_first)],
        remainder="drop",
    )
    assert_array_equal(transformer.fit_transform(data), expected_first_column)
    assert_array_equal(
        transformer.fit(data).transform(data), expected_first_column
    )

    # The unfitted spec keeps the callable; the fitted one stores its result.
    assert callable(transformer.transformers[0][2])
    assert transformer.transformers_[0][2] == [0]
def test_column_transformer_callable_specifier_dataframe():
    """A callable column specifier is called with the full input dataframe."""
    pd = pytest.importorskip("pandas")
    data = np.array([[0, 1, 2], [2, 4, 6]]).T
    expected_first_column = np.array([[0, 1, 2]]).T
    frame = pd.DataFrame(data, columns=["first", "second"])

    def select_first(X):
        # The selector must see the complete dataframe: same columns and values.
        assert_array_equal(X.columns, frame.columns)
        assert_array_equal(X.values, frame.values)
        return ["first"]

    transformer = ColumnTransformer(
        [("trans", Trans(), select_first)],
        remainder="drop",
    )
    assert_array_equal(transformer.fit_transform(frame), expected_first_column)
    assert_array_equal(
        transformer.fit(frame).transform(frame), expected_first_column
    )

    # The unfitted spec keeps the callable; the fitted one stores its result.
    assert callable(transformer.transformers[0][2])
    assert transformer.transformers_[0][2] == ["first"]
def test_column_transformer_negative_column_indexes():
    """A negative column index selects the same column as its positive twin.

    The last of three columns is one-hot encoded once through index ``-1``
    and once through index ``2``; both outputs must be identical.
    """
    # Seeded generator instead of the global `np.random` state, so the test
    # data is the same on every run and a failure can be reproduced.
    rng = np.random.RandomState(0)
    X = rng.randn(2, 2)
    X_categories = np.array([[1], [2]])
    X = np.concatenate([X, X_categories], axis=1)

    ohe = OneHotEncoder()

    tf_1 = ColumnTransformer([("ohe", ohe, [-1])], remainder="passthrough")
    tf_2 = ColumnTransformer([("ohe", ohe, [2])], remainder="passthrough")
    assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
@pytest.mark.parametrize("array_type", [np.asarray, *CSR_CONTAINERS])
def test_column_transformer_mask_indexing(array_type):
    """Boolean list masks select columns on both dense and sparse inputs.

    Regression test for #14510: a boolean array-like does not behave as a
    boolean array with sparse matrices.
    """
    dense = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]])
    data = array_type(dense)

    mask = [False, True, False, True]
    transformer = ColumnTransformer([("identity", FunctionTransformer(), mask)])

    # Two of the four columns are selected by the mask.
    transformed = transformer.fit_transform(data)
    assert transformed.shape == (3, 2)
def test_n_features_in():
    """`n_features_in_` counts the columns passed to the ColumnTransformer."""
    samples = [[1, 2], [3, 4], [5, 6]]
    transformer = ColumnTransformer(
        [("a", DoubleTrans(), [0]), ("b", DoubleTrans(), [1])]
    )

    # The attribute only appears once the estimator has been fitted.
    assert not hasattr(transformer, "n_features_in_")
    transformer.fit(samples)
    assert transformer.n_features_in_ == 2
@pytest.mark.parametrize(
    "cols, pattern, include, exclude",
    [
        (["col_int", "col_float"], None, np.number, None),
        (["col_int", "col_float"], None, None, object),
        (["col_int", "col_float"], None, [int, float], None),
        (["col_str"], None, [object], None),
        (["col_str"], None, object, None),
        (["col_float"], None, float, None),
        (["col_float"], "at$", [np.number], None),
        (["col_int"], None, [int], None),
        (["col_int"], "^col_int", [np.number], None),
        (["col_float", "col_str"], "float|str", None, None),
        (["col_str"], "^col_s", None, [int]),
        ([], "str$", float, None),
        (["col_int", "col_float", "col_str"], None, [np.number, object], None),
    ],
)
def test_make_column_selector_with_select_dtypes(cols, pattern, include, exclude):
    """`make_column_selector` combines dtype filters and a name pattern."""
    pd = pytest.importorskip("pandas")
    column_order = ["col_int", "col_float", "col_str"]
    column_values = {
        "col_int": np.array([0, 1, 2], dtype=int),
        "col_float": np.array([0.0, 1.0, 2.0], dtype=float),
        "col_str": ["one", "two", "three"],
    }
    frame = pd.DataFrame(column_values, columns=column_order)

    selector = make_column_selector(
        pattern=pattern, dtype_include=include, dtype_exclude=exclude
    )
    selected = selector(frame)

    assert_array_equal(selected, cols)
def test_column_transformer_with_make_column_selector():
    """Selector-based and explicit column lists produce the same output."""
    pd = pytest.importorskip("pandas")
    column_order = ["col_int", "col_float", "col_cat", "col_str"]
    frame = pd.DataFrame(
        {
            "col_int": np.array([0, 1, 2], dtype=int),
            "col_float": np.array([0.0, 1.0, 2.0], dtype=float),
            "col_cat": ["one", "two", "one"],
            "col_str": ["low", "middle", "high"],
        },
        columns=column_order,
    )
    # Make one of the two text columns a pandas categorical.
    frame["col_str"] = frame["col_str"].astype("category")

    categorical_selector = make_column_selector(dtype_include=["category", object])
    numeric_selector = make_column_selector(dtype_include=np.number)

    encoder = OneHotEncoder()
    standardizer = StandardScaler()

    via_selectors = make_column_transformer(
        (encoder, categorical_selector),
        (standardizer, numeric_selector),
    )
    via_names = make_column_transformer(
        (encoder, ["col_cat", "col_str"]),
        (standardizer, ["col_float", "col_int"]),
    )

    output_selectors = via_selectors.fit_transform(frame)
    output_names = via_names.fit_transform(frame)
    assert_allclose(output_selectors, output_names)
def test_make_column_selector_error():
    """Calling a column selector on a NumPy array raises a `ValueError`."""
    not_a_dataframe = np.array([[0.1, 0.2]])
    selector = make_column_selector(dtype_include=np.number)

    expected_message = (
        "make_column_selector can only be applied to pandas dataframes"
    )
    with pytest.raises(ValueError, match=expected_message):
        selector(not_a_dataframe)
def test_make_column_selector_pickle():
    """A column selector selects the same columns after a pickle round trip."""
    pd = pytest.importorskip("pandas")
    column_order = ["col_int", "col_float", "col_str"]
    frame = pd.DataFrame(
        {
            "col_int": np.array([0, 1, 2], dtype=int),
            "col_float": np.array([0.0, 1.0, 2.0], dtype=float),
            "col_str": ["one", "two", "three"],
        },
        columns=column_order,
    )

    selector = make_column_selector(dtype_include=[object])
    serialized = pickle.dumps(selector)
    selector_pickled = pickle.loads(serialized)

    assert_array_equal(selector(frame), selector_pickled(frame))
@pytest.mark.parametrize(
    "empty_col",
    [[], np.array([], dtype=int), lambda x: []],
    ids=["list", "array", "callable"],
)
def test_feature_names_empty_columns(empty_col):
    """A transformer given an empty column selection adds no feature names."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]})

    steps = [
        ("ohe", OneHotEncoder(), ["col1", "col2"]),
        ("empty_features", OneHotEncoder(), empty_col),
    ]
    transformer = ColumnTransformer(transformers=steps)
    transformer.fit(frame)

    # Only the "ohe" step contributes names.
    expected_names = ["ohe__col1_a", "ohe__col1_b", "ohe__col2_z"]
    assert_array_equal(transformer.get_feature_names_out(), expected_names)
@pytest.mark.parametrize(
    "selector",
    [
        [1],
        lambda x: [1],
        ["col2"],
        lambda x: ["col2"],
        [False, True],
        lambda x: [False, True],
    ],
)
def test_feature_names_out_pandas(selector):
    """Output names use the dataframe column name of the second column."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]})

    transformer = ColumnTransformer([("ohe", OneHotEncoder(), selector)])
    transformer.fit(frame)

    names = transformer.get_feature_names_out()
    assert_array_equal(names, ["ohe__col2_z"])
@pytest.mark.parametrize(
    "selector", [[1], lambda x: [1], [False, True], lambda x: [False, True]]
)
def test_feature_names_out_non_pandas(selector):
    """Output names use the generated name "x1" for a nameless second column."""
    rows = [["a", "z"], ["a", "z"], ["b", "z"]]

    transformer = ColumnTransformer([("ohe", OneHotEncoder(), selector)])
    transformer.fit(rows)

    names = transformer.get_feature_names_out()
    assert_array_equal(names, ["ohe__x1_z"])
@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()])
def test_sk_visual_block_remainder(remainder):
    """The visual block lists a "passthrough" or estimator remainder.

    Before fitting, the remainder's column detail is an empty string.
    """
    encoder = OneHotEncoder()
    transformer = ColumnTransformer(
        remainder=remainder,
        transformers=[("ohe", encoder, ["col1", "col2"])],
    )

    block = transformer._sk_visual_block_()
    assert block.names == ("ohe", "remainder")
    assert block.name_details == (["col1", "col2"], "")
    assert block.estimators == (encoder, remainder)
def test_sk_visual_block_remainder_drop():
    """The visual block omits the remainder when it is left at its default."""
    encoder = OneHotEncoder()
    transformer = ColumnTransformer(
        transformers=[("ohe", encoder, ["col1", "col2"])]
    )

    block = transformer._sk_visual_block_()
    assert block.names == ("ohe",)
    assert block.name_details == (["col1", "col2"],)
    assert block.estimators == (encoder,)
@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()])
def test_sk_visual_block_remainder_fitted_pandas(remainder):
    """After fitting on a dataframe, the block shows the remainder column names."""
    pd = pytest.importorskip("pandas")
    encoder = OneHotEncoder()
    transformer = ColumnTransformer(
        transformers=[("ohe", encoder, ["col1", "col2"])],
        force_int_remainder_cols=False,
        remainder=remainder,
    )

    frame = pd.DataFrame(
        {
            "col1": ["a", "b", "c"],
            "col2": ["z", "z", "z"],
            "col3": [1, 2, 3],
            "col4": [3, 4, 5],
        }
    )
    transformer.fit(frame)

    block = transformer._sk_visual_block_()
    assert block.names == ("ohe", "remainder")
    assert block.name_details == (["col1", "col2"], ["col3", "col4"])
    assert block.estimators == (encoder, remainder)
@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()])
def test_sk_visual_block_remainder_fitted_numpy(remainder):
    """After fitting on an array, the block shows the remainder column indices."""
    data = np.array([[1, 2, 3], [4, 5, 6]], dtype=float)
    standardizer = StandardScaler()
    transformer = ColumnTransformer(
        remainder=remainder,
        transformers=[("scale", standardizer, [0, 2])],
    )
    transformer.fit(data)

    block = transformer._sk_visual_block_()
    assert block.names == ("scale", "remainder")
    assert block.name_details == ([0, 2], [1])
    assert block.estimators == (standardizer, remainder)
@pytest.mark.parametrize("explicit_colname", ["first", "second", 0, 1])
@pytest.mark.parametrize("remainder", [Trans(), "passthrough", "drop"])
def test_column_transformer_reordered_column_names_remainder(
    explicit_colname, remainder
):
    """Transform results are stable under column reordering and extra columns."""
    pd = pytest.importorskip("pandas")

    fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    fit_frame = pd.DataFrame(fit_array, columns=["first", "second"])

    # Same data with the two columns swapped.
    swapped_array = np.array([[2, 4, 6], [0, 1, 2]]).T
    swapped_frame = pd.DataFrame(swapped_array, columns=["second", "first"])

    transformer = ColumnTransformer(
        [("bycol", Trans(), explicit_colname)], remainder=remainder
    )
    transformer.fit(fit_frame)
    reference = transformer.transform(fit_frame)

    # Reordered columns give the same result as the fitted order.
    assert_allclose(transformer.transform(swapped_frame), reference)

    # A column unseen during fit is ignored.
    extended_frame = fit_frame.copy()
    extended_frame["third"] = [3, 6, 9]
    assert_allclose(transformer.transform(extended_frame), reference)

    if not isinstance(explicit_colname, str):
        return

    # Columns selected by name cannot be resolved on an input that only
    # supports positional access, such as a NumPy array.
    plain_array = fit_array.copy()
    with pytest.raises(ValueError, match="Specifying the columns"):
        transformer.transform(plain_array)
def test_feature_name_validation_missing_columns_drop_passthough():
    """Missing columns at transform time: error or tolerated, per configuration.

    Covers the interaction between 'drop'/'passthrough' and a dataframe that
    lacks column 'c' at transform time.
    """
    pd = pytest.importorskip("pandas")

    full_frame = pd.DataFrame(np.ones(shape=(3, 4)), columns=["a", "b", "c", "d"])
    frame_without_c = full_frame.drop("c", axis=1)

    # remainder='passthrough': 'c' is part of the remainder, so it is required.
    transformer = ColumnTransformer(
        [("bycol", Trans(), [1])], remainder="passthrough"
    )
    transformer.fit(full_frame)
    with pytest.raises(ValueError, match=r"columns are missing: {'c'}"):
        transformer.transform(frame_without_c)

    # remainder='drop': 'c' is not used, so it may be absent.
    transformer = ColumnTransformer([("bycol", Trans(), [1])], remainder="drop")
    transformer.fit(full_frame)
    output_without_c = transformer.transform(frame_without_c)
    output_full = transformer.transform(full_frame)
    assert_allclose(output_without_c, output_full)

    # 'c' is explicitly dropped by "bycol", so it may be absent as well.
    transformer = ColumnTransformer(
        [("bycol", "drop", ["c"])], remainder="passthrough"
    )
    transformer.fit(full_frame)
    output_without_c = transformer.transform(frame_without_c)
    output_full = transformer.transform(full_frame)
    assert_allclose(output_without_c, output_full)
def test_feature_names_in_():
    """The fitted ColumnTransformer stores the input feature names.

    Column transformer deliberately does not check for column name
    consistency. It only checks that the non-dropped names seen in `fit` are
    seen in `transform`. That behavior is already covered by
    `test_feature_name_validation_missing_columns_drop_passthough`.
    """
    pd = pytest.importorskip("pandas")

    input_names = ["a", "c", "d"]
    frame = pd.DataFrame([[1, 2, 3]], columns=input_names)
    transformer = ColumnTransformer(
        [("bycol", Trans(), ["a", "d"])], remainder="passthrough"
    )
    transformer.fit(frame)

    stored_names = transformer.feature_names_in_
    assert_array_equal(stored_names, input_names)
    # Stored as an object-dtype ndarray.
    assert isinstance(stored_names, np.ndarray)
    assert stored_names.dtype == object
class TransWithNames(Trans):
    """`Trans` variant that also implements `get_feature_names_out`.

    Parameters
    ----------
    feature_names_out : sequence of str or None, default=None
        Names returned by `get_feature_names_out`. When None, the input
        feature names are returned unchanged.
    """

    def __init__(self, feature_names_out=None):
        self.feature_names_out = feature_names_out

    def get_feature_names_out(self, input_features=None):
        """Return the configured names, or `input_features` if none were set."""
        configured = self.feature_names_out
        if configured is None:
            return input_features
        return np.asarray(configured, dtype=object)
@pytest.mark.parametrize(
    "transformers, remainder, expected_names",
    [
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "passthrough",
            ["bycol1__d", "bycol1__c", "bycol2__d", "remainder__a", "remainder__b"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "drop",
            ["bycol1__d", "bycol1__c", "bycol2__d"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", ["d"]),
            ],
            "passthrough",
            ["bycol1__b", "remainder__a", "remainder__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"]),
            ],
            "passthrough",
            ["bycol1__pca1", "bycol1__pca2", "remainder__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["d"]),
                ("bycol2", "passthrough", ["b"]),
            ],
            "drop",
            ["bycol1__a", "bycol1__b", "bycol2__b"],
        ),
        (
            [
                ("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]),
                ("bycol2", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]),
            ],
            "passthrough",
            [
                "bycol1__pca0",
                "bycol1__pca1",
                "bycol2__pca0",
                "bycol2__pca1",
                "remainder__a",
                "remainder__c",
                "remainder__d",
            ],
        ),
        (
            [
                ("bycol1", "drop", ["d"]),
            ],
            "drop",
            [],
        ),
        (
            [
                ("bycol1", TransWithNames(), slice(1, 3)),
            ],
            "drop",
            ["bycol1__b", "bycol1__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", slice(3, 4)),
            ],
            "passthrough",
            ["bycol1__b", "remainder__a", "remainder__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice(3, 4)),
            ],
            "passthrough",
            ["bycol1__d", "bycol1__c", "bycol2__d", "remainder__a", "remainder__b"],
        ),
        (
            [
                ("bycol1", TransWithNames(), slice("b", "c")),
            ],
            "drop",
            ["bycol1__b", "bycol1__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", slice("c", "d")),
            ],
            "passthrough",
            ["bycol1__b", "remainder__a"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice("c", "d")),
            ],
            "passthrough",
            [
                "bycol1__d",
                "bycol1__c",
                "bycol2__c",
                "bycol2__d",
                "remainder__a",
                "remainder__b",
            ],
        ),
    ],
)
def test_verbose_feature_names_out_true(transformers, remainder, expected_names):
    """Output names carry a `<transformer>__` prefix by default.

    `verbose_feature_names_out` is left at its default value here.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"])

    transformer = ColumnTransformer(transformers, remainder=remainder)
    transformer.fit(frame)
    feature_names = transformer.get_feature_names_out()

    # Names come back as an object-dtype ndarray.
    assert isinstance(feature_names, np.ndarray)
    assert feature_names.dtype == object
    assert_array_equal(feature_names, expected_names)
@pytest.mark.parametrize(
    "transformers, remainder, expected_names",
    [
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", ["a"]),
            ],
            "passthrough",
            ["d", "c", "a", "b"],
        ),
        (
            [
                ("bycol1", TransWithNames(["a"]), ["d", "c"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "drop",
            ["a", "d"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", ["d"]),
            ],
            "passthrough",
            ["b", "a", "c"],
        ),
        (
            [
                ("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"]),
            ],
            "passthrough",
            ["pca1", "pca2", "c"],
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "c"]), ["d"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "drop",
            ["a", "c", "d"],
        ),
        (
            [
                ("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]),
                ("bycol2", TransWithNames([f"kpca{i}" for i in range(2)]), ["b"]),
            ],
            "passthrough",
            ["pca0", "pca1", "kpca0", "kpca1", "a", "c", "d"],
        ),
        (
            [
                ("bycol1", "drop", ["d"]),
            ],
            "drop",
            [],
        ),
        (
            [
                ("bycol1", TransWithNames(), slice(1, 2)),
                ("bycol2", "drop", ["d"]),
            ],
            "passthrough",
            ["b", "a", "c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", slice(3, 4)),
            ],
            "passthrough",
            ["b", "a", "c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice(0, 2)),
            ],
            "drop",
            ["d", "c", "a", "b"],
        ),
        (
            [
                ("bycol1", TransWithNames(), slice("a", "b")),
                ("bycol2", "drop", ["d"]),
            ],
            "passthrough",
            ["a", "b", "c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", slice("c", "d")),
            ],
            "passthrough",
            ["b", "a"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice("a", "b")),
            ],
            "drop",
            ["d", "c", "a", "b"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice("b", "b")),
            ],
            "drop",
            ["d", "c", "b"],
        ),
    ],
)
def test_verbose_feature_names_out_false(transformers, remainder, expected_names):
    """Output names are left unprefixed with `verbose_feature_names_out=False`."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"])

    transformer = ColumnTransformer(
        transformers,
        verbose_feature_names_out=False,
        remainder=remainder,
    )
    transformer.fit(frame)
    feature_names = transformer.get_feature_names_out()

    # Names come back as an object-dtype ndarray.
    assert isinstance(feature_names, np.ndarray)
    assert feature_names.dtype == object
    assert_array_equal(feature_names, expected_names)
@pytest.mark.parametrize(
    "transformers, remainder, colliding_columns",
    [
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "passthrough", ["b"]),
            ],
            "drop",
            "['b']",
        ),
        (
            [
                ("bycol1", TransWithNames(["c", "d"]), ["c"]),
                ("bycol2", "passthrough", ["c"]),
            ],
            "drop",
            "['c']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a"]), ["b"]),
                ("bycol2", "passthrough", ["b"]),
            ],
            "passthrough",
            "['a']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a"]), ["b"]),
                ("bycol2", "drop", ["b"]),
            ],
            "passthrough",
            "['a']",
        ),
        (
            [
                ("bycol1", TransWithNames(["c", "b"]), ["b"]),
                ("bycol2", "passthrough", ["c", "b"]),
            ],
            "drop",
            "['b', 'c']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a"]), ["b"]),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["a"]), ["b"]),
            ],
            "passthrough",
            "['a']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["b"]),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        (
            [
                ("bycol1", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]),
                ("bycol2", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]),
            ],
            "passthrough",
            "['pca0', 'pca1', 'pca2', 'pca3', 'pca4', ...]",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), slice(1, 2)),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["b"]),
                ("bycol2", "passthrough", slice(0, 1)),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), slice("b", "c")),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["b"]),
                ("bycol2", "passthrough", slice("a", "a")),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
    ],
)
def test_verbose_feature_names_out_false_errors(
    transformers, remainder, colliding_columns
):
    """Colliding unprefixed names make `get_feature_names_out` raise.

    With `verbose_feature_names_out=False`, the error message must list the
    colliding names exactly as given in `colliding_columns`.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"])

    transformer = ColumnTransformer(
        transformers,
        verbose_feature_names_out=False,
        remainder=remainder,
    )
    # Fitting succeeds; the collision surfaces when names are requested.
    transformer.fit(frame)

    # The message contains brackets and dots, so it is escaped for the regex.
    expected_message = re.escape(
        f"Output feature names: {colliding_columns} are not unique. Please set "
        "verbose_feature_names_out=True to add prefixes to feature names"
    )
    with pytest.raises(ValueError, match=expected_message):
        transformer.get_feature_names_out()
@pytest.mark.parametrize("verbose_feature_names_out", [True, False])
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
def test_column_transformer_set_output(verbose_feature_names_out, remainder):
    """`set_output(transform="pandas")` yields a dataframe with names and index."""
    pd = pytest.importorskip("pandas")
    train_frame = pd.DataFrame(
        [[1, 2, 3, 4]], columns=["a", "b", "c", "d"], index=[10]
    )
    transformer = ColumnTransformer(
        [("first", TransWithNames(), ["a", "c"]), ("second", TransWithNames(), ["d"])],
        verbose_feature_names_out=verbose_feature_names_out,
        remainder=remainder,
    )

    # Without set_output the result is a NumPy array.
    default_output = transformer.fit_transform(train_frame)
    assert isinstance(default_output, np.ndarray)

    transformer.set_output(transform="pandas")

    # Transform a frame with a different index to check that it is preserved.
    test_frame = pd.DataFrame([[1, 2, 3, 4]], columns=train_frame.columns, index=[20])
    pandas_output = transformer.transform(test_frame)
    assert isinstance(pandas_output, pd.DataFrame)

    expected_columns = transformer.get_feature_names_out()
    assert_array_equal(pandas_output.columns, expected_columns)
    assert_array_equal(pandas_output.index, test_frame.index)
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
@pytest.mark.parametrize("fit_transform", [True, False])
def test_column_transform_set_output_mixed(remainder, fit_transform):
    """Pandas output keeps the expected dtype for each output column."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(
        {
            "pet": pd.Series(["dog", "cat", "snake"], dtype="category"),
            "color": pd.Series(["green", "blue", "red"], dtype="object"),
            "age": [1.4, 2.1, 4.4],
            "height": [20, 40, 10],
            "distance": pd.Series([20, pd.NA, 100], dtype="Int32"),
        }
    )

    color_encoder = OneHotEncoder(sparse_output=False, dtype="int8")
    transformer = ColumnTransformer(
        [
            ("color_encode", color_encoder, ["color"]),
            ("age", StandardScaler(), ["age"]),
        ],
        verbose_feature_names_out=False,
        remainder=remainder,
    )
    transformer.set_output(transform="pandas")

    # Both fitting paths are exercised through the parametrization.
    transformed = (
        transformer.fit_transform(frame)
        if fit_transform
        else transformer.fit(frame).transform(frame)
    )

    assert isinstance(transformed, pd.DataFrame)
    assert_array_equal(transformed.columns, transformer.get_feature_names_out())

    # Expected dtype for every column that may appear in the output.
    expected_dtypes = {
        "color_blue": "int8",
        "color_green": "int8",
        "color_red": "int8",
        "age": "float64",
        "pet": "category",
        "height": "int64",
        "distance": "Int32",
    }
    for column_name, column_dtype in transformed.dtypes.items():
        assert column_dtype == expected_dtypes[column_name]
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
def test_column_transform_set_output_after_fitting(remainder):
    """`set_output` called after `fit` still yields per-column dtypes."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(
        {
            "pet": pd.Series(["dog", "cat", "snake"], dtype="category"),
            "age": [1.4, 2.1, 4.4],
            "height": [20, 40, 10],
        }
    )

    pet_encoder = OneHotEncoder(sparse_output=False, dtype="int16")
    transformer = ColumnTransformer(
        [
            ("color_encode", pet_encoder, ["pet"]),
            ("age", StandardScaler(), ["age"]),
        ],
        verbose_feature_names_out=False,
        remainder=remainder,
    )

    # Fitted without set_output: a single float64 NumPy array comes back.
    array_output = transformer.fit_transform(frame)
    assert isinstance(array_output, np.ndarray)
    assert array_output.dtype == "float64"

    # Switching to pandas output after fitting, without refitting.
    transformer.set_output(transform="pandas")
    frame_output = transformer.transform(frame)

    expected_dtypes = {
        "pet_cat": "int16",
        "pet_dog": "int16",
        "pet_snake": "int16",
        "height": "int64",
        "age": "float64",
    }
    for column_name, column_dtype in frame_output.dtypes.items():
        assert column_dtype == expected_dtypes[column_name]
# Transformer that requires DataFrame input in both `fit` and `transform` and
# that does not define `get_feature_names_out`.
class PandasOutTransformer(BaseEstimator):
    def __init__(self, offset=1.0):
        self.offset = offset

    def fit(self, X, y=None):
        # Fails loudly if the input is not a DataFrame.
        pandas = pytest.importorskip("pandas")
        assert isinstance(X, pandas.DataFrame)
        return self

    def transform(self, X, y=None):
        # Subtract the offset from the DataFrame input.
        pandas = pytest.importorskip("pandas")
        assert isinstance(X, pandas.DataFrame)
        shifted = X - self.offset
        return shifted

    def set_output(self, transform=None):
        # Deliberate no-op: the output is a DataFrame whatever the
        # configuration, so the requested container is ignored.
        return self
@pytest.mark.parametrize(
    "trans_1, expected_verbose_names, expected_non_verbose_names",
    [
        (
            PandasOutTransformer(offset=2.0),
            ["trans_0__feat1", "trans_1__feat0"],
            ["feat1", "feat0"],
        ),
        (
            "drop",
            ["trans_0__feat1"],
            ["feat1"],
        ),
        (
            "passthrough",
            ["trans_0__feat1", "trans_1__feat0"],
            ["feat1", "feat0"],
        ),
    ],
)
def test_transformers_with_pandas_out_but_not_feature_names_out(
    trans_1, expected_verbose_names, expected_non_verbose_names
):
    """Pandas output works for transformers lacking `get_feature_names_out`.

    Specifically, `set_output(transform="pandas")` must work when a
    transformer returns a DataFrame but does not define
    `get_feature_names_out`.
    """
    pd = pytest.importorskip("pandas")

    frame = pd.DataFrame({"feat0": [1.0, 2.0, 3.0], "feat1": [2.0, 3.0, 4.0]})
    transformer = ColumnTransformer(
        [
            ("trans_0", PandasOutTransformer(offset=3.0), ["feat1"]),
            ("trans_1", trans_1, ["feat0"]),
        ]
    )

    # Default configuration: the combined output is a NumPy array.
    array_output = transformer.fit_transform(frame)
    assert isinstance(array_output, np.ndarray)

    # `PandasOutTransformer` has no `get_feature_names_out`, so the
    # ColumnTransformer cannot provide names either.
    with pytest.raises(AttributeError, match="not provide get_feature_names_out"):
        transformer.get_feature_names_out()

    # verbose_feature_names_out=True is the default: columns are prefixed.
    transformer.set_output(transform="pandas")
    verbose_frame = transformer.fit_transform(frame)
    assert_array_equal(verbose_frame.columns, expected_verbose_names)

    # Without the verbose option the original column names are kept.
    transformer.set_params(verbose_feature_names_out=False)
    plain_frame = transformer.fit_transform(frame)
    assert_array_equal(plain_frame.columns, expected_non_verbose_names)
@pytest.mark.parametrize(
    "empty_selection",
    [[], np.array([False, False]), [False, False]],
    ids=["list", "bool", "bool_int"],
)
def test_empty_selection_pandas_output(empty_selection):
    """Check pandas output when one transformer selects no column.

    Non-regression test for gh-25487
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame([[1.0, 2.2], [3.0, 1.0]], columns=["a", "b"])
    steps = [
        ("categorical", "passthrough", empty_selection),
        ("numerical", StandardScaler(), ["a", "b"]),
    ]
    column_transformer = ColumnTransformer(steps, verbose_feature_names_out=True)
    column_transformer.set_output(transform="pandas")

    # Verbose names: only the non-empty transformer contributes columns.
    verbose_output = column_transformer.fit_transform(frame)
    assert_array_equal(verbose_output.columns, ["numerical__a", "numerical__b"])

    # Non-verbose names: same columns, without the transformer prefix.
    column_transformer.set_params(verbose_feature_names_out=False)
    plain_output = column_transformer.fit_transform(frame)
    assert_array_equal(plain_output.columns, ["a", "b"])
def test_raise_error_if_index_not_aligned():
    """Check that `ColumnTransformer` raises when output indices do not align.

    Non-regression test for gh-26210.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(
        [[1.0, 2.2], [3.0, 1.0]], columns=["a", "b"], index=[8, 3]
    )

    def drop_index(x):
        # Replace the input index by the one `reset_index(drop=True)` produces.
        return x.reset_index(drop=True)

    index_resetter = FunctionTransformer(drop_index, feature_names_out="one-to-one")
    column_transformer = ColumnTransformer(
        [("num1", "passthrough", ["a"]), ("num2", index_resetter, ["b"])]
    )
    column_transformer.set_output(transform="pandas")

    expected_message = (
        "Concatenating DataFrames from the transformer's output lead to"
        " an inconsistent number of samples. The output may have Pandas"
        " Indexes that do not match."
    )
    with pytest.raises(ValueError, match=expected_message):
        column_transformer.fit_transform(frame)
def test_remainder_set_output():
    """Check that the output configuration is also applied to the remainder.

    Non-regression test for #26306.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"a": [True, False, True], "b": [1, 2, 3]})
    bool_columns = make_column_selector(dtype_include=bool)
    column_transformer = make_column_transformer(
        (VarianceThreshold(), bool_columns),
        remainder=VarianceThreshold(),
        verbose_feature_names_out=False,
    )

    # With pandas output, the result must equal the input frame.
    column_transformer.set_output(transform="pandas")
    pandas_output = column_transformer.fit_transform(frame)
    pd.testing.assert_frame_equal(pandas_output, frame)

    # Switching back to the default container must give an ndarray again.
    column_transformer.set_output(transform="default")
    default_output = column_transformer.fit_transform(frame)
    assert isinstance(default_output, np.ndarray)
# TODO(1.6): replace the warning by a ValueError exception
def test_transform_pd_na():
    """Check behavior when a transformer's output contains pandas.NA.

    It should emit a warning unless the output config is set to 'pandas'.
    """
    pd = pytest.importorskip("pandas")
    if not hasattr(pd, "Float64Dtype"):
        pytest.skip(
            "The issue with pd.NA tested here does not happen in old versions that do"
            " not have the extension dtypes"
        )
    frame = pd.DataFrame({"a": [1.5, None]})
    column_transformer = make_column_transformer(("passthrough", ["a"]))

    # Non-extension dtype with np.nan: any warning is turned into an error.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        column_transformer.fit_transform(frame)

    frame = frame.convert_dtypes()

    # Extension dtype with pd.NA: a FutureWarning is expected.
    with pytest.warns(FutureWarning, match=r"set_output\(transform='pandas'\)"):
        column_transformer.fit_transform(frame)

    # Pandas output: no warning is expected.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        column_transformer.set_output(transform="pandas")
        column_transformer.fit_transform(frame)

    column_transformer.set_output(transform="default")

    # Default output without any pd.NA left: no warning is expected.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        column_transformer.fit_transform(frame.fillna(-1.0))
def test_dataframe_different_dataframe_libraries():
    """Check fitting on one dataframe library and transforming on another.

    Covers pandas -> polars and polars -> pandas with the same transformer.
    """
    pd = pytest.importorskip("pandas")
    pl = pytest.importorskip("polars")
    train_values = np.array([[0, 1], [2, 4], [4, 5]])
    test_values = np.array([[1, 2], [1, 3], [2, 3]])
    column_transformer = make_column_transformer((Trans(), [0, 1]))

    # Fit on pandas and transform on polars
    pandas_train = pd.DataFrame(train_values, columns=["a", "b"])
    polars_test = pl.DataFrame(test_values, schema=["a", "b"])
    column_transformer.fit(pandas_train)
    assert_array_equal(column_transformer.transform(polars_test), test_values)

    # Fit on polars and transform on pandas
    polars_train = pl.DataFrame(train_values, schema=["a", "b"])
    pandas_test = pd.DataFrame(test_values, columns=["a", "b"])
    column_transformer.fit(polars_train)
    assert_array_equal(column_transformer.transform(pandas_test), test_values)
def test_column_transformer__getitem__():
    """Check `ColumnTransformer.__getitem__` before and after fitting."""
    data = np.array([[0, 1, 2], [3, 4, 5]])
    column_transformer = ColumnTransformer(
        [("t1", Trans(), [0, 1]), ("t2", Trans(), [1, 2])]
    )

    # Subscripting an unfitted instance must raise a TypeError.
    with pytest.raises(
        TypeError, match="ColumnTransformer is subscriptable after it is fitted"
    ):
        column_transformer["t1"]

    # Once fitted, subscripting returns the objects in `named_transformers_`.
    column_transformer.fit(data)
    for name in ("t1", "t2"):
        assert column_transformer[name] is column_transformer.named_transformers_[name]

    # An unknown name must raise a KeyError.
    with pytest.raises(
        KeyError, match="'does_not_exist' is not a valid transformer name"
    ):
        column_transformer["does_not_exist"]
@pytest.mark.parametrize("transform_output", ["default", "pandas"])
def test_column_transformer_remainder_passthrough_naming_consistency(transform_output):
    """Check that when `remainder="passthrough"`, inconsistent naming is handled
    correctly by the underlying `FunctionTransformer`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28232
    """
    pd = pytest.importorskip("pandas")
    # Use a seeded local generator so the input is reproducible and NumPy's
    # global random state is left untouched.
    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.randn(10, 4))

    preprocessor = ColumnTransformer(
        transformers=[("scaler", StandardScaler(), [0, 1])],
        remainder="passthrough",
    ).set_output(transform=transform_output)
    X_trans = preprocessor.fit_transform(X)
    assert X_trans.shape == X.shape

    expected_column_names = [
        "scaler__x0",
        "scaler__x1",
        "remainder__x2",
        "remainder__x3",
    ]
    # Column labels can only be inspected when the output carries them
    # (e.g. a DataFrame).
    if hasattr(X_trans, "columns"):
        assert X_trans.columns.tolist() == expected_column_names
    assert preprocessor.get_feature_names_out().tolist() == expected_column_names
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_column_transformer_column_renaming(dataframe_lib):
    """Check column renaming when transformers select overlapping columns.

    With verbose feature names, each output column must be prefixed by the
    name of the transformer that produced it.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28260
    """
    lib = pytest.importorskip(dataframe_lib)
    frame = lib.DataFrame(
        {"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]}
    )
    steps = [
        ("A", "passthrough", ["x1", "x2", "x3"]),
        ("B", FunctionTransformer(), ["x1", "x2"]),
        ("C", StandardScaler(), ["x1", "x3"]),
        # special case of empty transformer
        ("D", FunctionTransformer(lambda x: x[[]]), ["x1", "x2", "x3"]),
    ]
    column_transformer = ColumnTransformer(
        transformers=steps, verbose_feature_names_out=True
    )
    column_transformer.set_output(transform=dataframe_lib)

    transformed = column_transformer.fit_transform(frame)
    expected_columns = [
        "A__x1",
        "A__x2",
        "A__x3",
        "B__x1",
        "B__x2",
        "C__x1",
        "C__x3",
    ]
    assert list(transformed.columns) == expected_columns
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_column_transformer_error_with_duplicated_columns(dataframe_lib):
    """Check the error raised when output column names are duplicated.

    Without verbose feature names, transformers selecting overlapping columns
    produce identical names, and `fit_transform` must raise a `ValueError`
    listing the conflicting columns of each transformer.
    """
    lib = pytest.importorskip(dataframe_lib)
    frame = lib.DataFrame(
        {"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]}
    )
    steps = [
        ("A", "passthrough", ["x1", "x2", "x3"]),
        ("B", FunctionTransformer(), ["x1", "x2"]),
        ("C", StandardScaler(), ["x1", "x3"]),
        # special case of empty transformer
        ("D", FunctionTransformer(lambda x: x[[]]), ["x1", "x2", "x3"]),
    ]
    column_transformer = ColumnTransformer(
        transformers=steps, verbose_feature_names_out=False
    )
    column_transformer.set_output(transform=dataframe_lib)

    # The message is escaped because it contains regex metacharacters.
    expected_pattern = re.escape(
        "Duplicated feature names found before concatenating the outputs of the "
        "transformers: ['x1', 'x2', 'x3'].\n"
        "Transformer A has conflicting columns names: ['x1', 'x2', 'x3'].\n"
        "Transformer B has conflicting columns names: ['x1', 'x2'].\n"
        "Transformer C has conflicting columns names: ['x1', 'x3'].\n"
    )
    with pytest.raises(ValueError, match=expected_pattern):
        column_transformer.fit_transform(frame)
@pytest.mark.skipif(
    parse_version(joblib.__version__) < parse_version("1.3"),
    reason="requires joblib >= 1.3",
)
def test_column_transformer_auto_memmap():
    """Check that ColumnTransformer works in parallel with joblib's auto-memmapping.

    non-regression test for issue #28781
    """
    rng = np.random.RandomState(0)
    X = rng.uniform(size=(3, 4))
    column_transformer = ColumnTransformer(
        transformers=[("scaler", StandardScaler(copy=False), [0])],
        n_jobs=2,
    )

    with joblib.parallel_backend("loky", max_nbytes=1):
        parallel_result = column_transformer.fit_transform(X)

    # Reference result: a fresh scaler applied to the same single column.
    reference = StandardScaler().fit_transform(X[:, [0]])
    assert_allclose(parallel_result, reference)
# Metadata Routing Tests
# ======================
@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"])
def test_routing_passed_metadata_not_supported(method):
    """Check the error raised when metadata is passed without routing enabled.

    Passing metadata while `enable_metadata_routing=False` must raise a
    `ValueError` for each tested method.
    """
    features = np.array([[0, 1, 2], [2, 4, 6]]).T
    target = [1, 2, 3]

    column_transformer = ColumnTransformer([("trans", Trans(), [0])])
    column_transformer = column_transformer.fit(features, target)

    with pytest.raises(
        ValueError, match="is only supported if enable_metadata_routing=True"
    ):
        getattr(column_transformer, method)([[1]], sample_weight=[1], prop="a")
@pytest.mark.usefixtures("enable_slep006")
@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"])
def test_metadata_routing_for_column_transformer(method):
    """Test that metadata is routed correctly for column transformer."""
    X = np.array([[0, 1, 2], [2, 4, 6]]).T
    y = [1, 2, 3]
    registry = _Registry()
    sample_weight = [1]
    metadata = "a"

    consumer = (
        ConsumingTransformer(registry=registry)
        .set_fit_request(sample_weight=True, metadata=True)
        .set_transform_request(sample_weight=True, metadata=True)
    )
    column_transformer = ColumnTransformer([("trans", consumer, [0])])

    if method != "transform":
        getattr(column_transformer, method)(
            X, y, sample_weight=sample_weight, metadata=metadata
        )
    else:
        # `transform` needs a fitted instance; the fit call gets no metadata.
        column_transformer.fit(X, y)
        column_transformer.transform(
            X, sample_weight=sample_weight, metadata=metadata
        )

    # At least one estimator must be registered, and each registered one
    # must have recorded the metadata for the tested method.
    assert len(registry)
    for recorded in registry:
        check_recorded_metadata(
            obj=recorded,
            method=method,
            sample_weight=sample_weight,
            metadata=metadata,
        )
@pytest.mark.usefixtures("enable_slep006")
def test_metadata_routing_no_fit_transform():
    """Test metadata routing when the sub-estimator doesn't implement
    ``fit_transform``."""

    class NoFitTransform(BaseEstimator):
        """Estimator with `fit` and `transform` only; both require metadata."""

        def fit(self, X, y=None, sample_weight=None, metadata=None):
            # Both values must have been routed down to this call.
            assert sample_weight
            assert metadata
            return self

        def transform(self, X, sample_weight=None, metadata=None):
            # Both values must have been routed down to this call.
            assert sample_weight
            assert metadata
            return X

    features = np.array([[0, 1, 2], [2, 4, 6]]).T
    target = [1, 2, 3]
    sample_weight = [1]
    metadata = "a"

    estimator = (
        NoFitTransform()
        .set_fit_request(sample_weight=True, metadata=True)
        .set_transform_request(sample_weight=True, metadata=True)
    )
    column_transformer = ColumnTransformer([("trans", estimator, [0])])

    column_transformer.fit(
        features, target, sample_weight=sample_weight, metadata=metadata
    )
    column_transformer.fit_transform(
        features, target, sample_weight=sample_weight, metadata=metadata
    )
@pytest.mark.usefixtures("enable_slep006")
@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"])
def test_metadata_routing_error_for_column_transformer(method):
    """Test that the right error is raised when metadata is not requested."""
    features = np.array([[0, 1, 2], [2, 4, 6]]).T
    target = [1, 2, 3]
    sample_weight = [1]
    metadata = "a"
    column_transformer = ColumnTransformer([("trans", ConsumingTransformer(), [0])])

    # The message is escaped below because it contains square brackets.
    error_message = (
        "[sample_weight, metadata] are passed but are not explicitly set as requested"
        f" or not requested for ConsumingTransformer.{method}"
    )
    with pytest.raises(ValueError, match=re.escape(error_message)):
        if method != "transform":
            getattr(column_transformer, method)(
                features, target, sample_weight=sample_weight, metadata=metadata
            )
        else:
            column_transformer.fit(features, target)
            column_transformer.transform(
                features, sample_weight=sample_weight, metadata=metadata
            )
@pytest.mark.usefixtures("enable_slep006")
def test_get_metadata_routing_works_without_fit():
    """Check that `get_metadata_routing` can be called before `fit`.

    Regression test for https://github.com/scikit-learn/scikit-learn/issues/28186
    """
    unfitted = ColumnTransformer([("trans", ConsumingTransformer(), [0])])
    # The call only has to complete without raising.
    unfitted.get_metadata_routing()
@pytest.mark.usefixtures("enable_slep006")
def test_remainder_request_always_present():
    """Test that the remainder's metadata request is always present.

    The remainder estimator requests `metadata` for `fit` and `transform`;
    the router returned by `get_metadata_routing` must report `metadata` as
    consumed by `fit`.
    """
    remainder = (
        ConsumingTransformer()
        .set_fit_request(metadata=True)
        .set_transform_request(metadata=True)
    )
    ct = ColumnTransformer([("trans", StandardScaler(), [0])], remainder=remainder)
    router = ct.get_metadata_routing()
    # Set literal instead of `set([...])`: same value, no throwaway list.
    assert router.consumes("fit", ["metadata"]) == {"metadata"}
@pytest.mark.usefixtures("enable_slep006")
def test_unused_transformer_request_present():
    """Test that an unused transformer's metadata request is still present.

    The transformer's column selector is a callable returning an empty list,
    so no column is selected for it; the router must nevertheless report
    `metadata` as consumed by `fit`.
    """
    consumer = (
        ConsumingTransformer()
        .set_fit_request(metadata=True)
        .set_transform_request(metadata=True)
    )
    ct = ColumnTransformer([("trans", consumer, lambda X: [])])
    router = ct.get_metadata_routing()
    # Set literal instead of `set([...])`: same value, no throwaway list.
    assert router.consumes("fit", ["metadata"]) == {"metadata"}
# End of Metadata Routing Tests
# =============================