2744 lines
91 KiB
Python
2744 lines
91 KiB
Python
"""
|
|
Test the ColumnTransformer.
|
|
"""
|
|
|
|
import pickle
|
|
import re
|
|
import warnings
|
|
from unittest.mock import Mock
|
|
|
|
import joblib
|
|
import numpy as np
|
|
import pytest
|
|
from numpy.testing import assert_allclose
|
|
from scipy import sparse
|
|
|
|
from sklearn.base import BaseEstimator, TransformerMixin
|
|
from sklearn.compose import (
|
|
ColumnTransformer,
|
|
make_column_selector,
|
|
make_column_transformer,
|
|
)
|
|
from sklearn.compose._column_transformer import _RemainderColsList
|
|
from sklearn.exceptions import NotFittedError
|
|
from sklearn.feature_selection import VarianceThreshold
|
|
from sklearn.preprocessing import (
|
|
FunctionTransformer,
|
|
Normalizer,
|
|
OneHotEncoder,
|
|
StandardScaler,
|
|
)
|
|
from sklearn.tests.metadata_routing_common import (
|
|
ConsumingTransformer,
|
|
_Registry,
|
|
check_recorded_metadata,
|
|
)
|
|
from sklearn.utils._testing import (
|
|
_convert_container,
|
|
assert_allclose_dense_sparse,
|
|
assert_almost_equal,
|
|
assert_array_equal,
|
|
)
|
|
from sklearn.utils.fixes import CSR_CONTAINERS, parse_version
|
|
|
|
|
|
class Trans(TransformerMixin, BaseEstimator):
    """Pass-through transformer that promotes 1D inputs to a 2D column."""

    def fit(self, X, y=None):
        """Learn nothing; return the estimator itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X``, converted to 2D when it is one-dimensional."""
        if hasattr(X, "to_frame"):
            # 1D Series -> 2D DataFrame
            result = X.to_frame()
        elif getattr(X, "ndim", 2) == 1:
            # 1D array -> 2D array
            result = np.atleast_2d(X).T
        else:
            result = X
        return result
|
|
|
|
|
|
class DoubleTrans(BaseEstimator):
    """Stateless transformer that multiplies its input by two."""

    def fit(self, X, y=None):
        """Learn nothing; return the estimator itself."""
        return self

    def transform(self, X):
        """Return ``2 * X``."""
        doubled = 2 * X
        return doubled
|
|
|
|
|
|
class SparseMatrixTrans(BaseEstimator):
    """Transformer returning a sparse identity sized by the number of samples."""

    def __init__(self, csr_container):
        # Callable used to wrap the identity built in ``transform``.
        self.csr_container = csr_container

    def fit(self, X, y=None):
        """Learn nothing; return the estimator itself."""
        return self

    def transform(self, X, y=None):
        """Return ``csr_container(eye(len(X), len(X)))``; values of ``X`` are ignored."""
        size = len(X)
        identity = sparse.eye(size, size)
        return self.csr_container(identity)
|
|
|
|
|
|
class TransNo2D(BaseEstimator):
    """Transformer that hands its input back untouched (no 1D -> 2D promotion)."""

    def fit(self, X, y=None):
        """Learn nothing; return the estimator itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` exactly as received."""
        unchanged = X
        return unchanged
|
|
|
|
|
|
class TransRaise(BaseEstimator):
    """Transformer whose ``fit`` and ``transform`` always raise ``ValueError``."""

    def fit(self, X, y=None):
        """Always raise ``ValueError("specific message")``."""
        raise ValueError("specific message")

    def transform(self, X, y=None):
        """Always raise ``ValueError("specific message")``."""
        raise ValueError("specific message")
|
|
|
|
|
|
def test_column_transformer():
    """Exercise the supported column specifiers on a plain NumPy array.

    Covers scalar, list, array, slice and boolean-mask selections (given
    directly and through a callable), two transformers side by side, and
    ``transformer_weights``.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    first_1d = np.array([0, 1, 2])
    second_1d = np.array([2, 4, 6])
    first_col = first_1d.reshape(-1, 1)
    both_cols = X_array

    def check(transformer, expected):
        # ``fit_transform`` and ``fit`` + ``transform`` must both match.
        assert_array_equal(transformer.fit_transform(X_array), expected)
        assert_array_equal(transformer.fit(X_array).transform(X_array), expected)

    # single column 1D / 2D
    cases = [(0, first_col), ([0], first_col)]
    # list-like
    cases += [([0, 1], both_cols), (np.array([0, 1]), both_cols)]
    # slice
    cases += [(slice(0, 1), first_col), (slice(0, 2), both_cols)]
    # boolean mask
    cases += [
        (np.array([True, False]), first_col),
        ([True, False], first_col),
        (np.array([True, True]), both_cols),
        ([True, True], both_cols),
    ]

    for selection, expected in cases:
        check(
            ColumnTransformer([("trans", Trans(), selection)], remainder="drop"),
            expected,
        )

        # callable that returns any of the allowed specifiers
        check(
            ColumnTransformer(
                [("trans", Trans(), lambda x: selection)], remainder="drop"
            ),
            expected,
        )

    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
    check(ct, both_cols)
    assert len(ct.transformers_) == 2

    # test with transformer_weights
    weights = {"trans1": 0.1, "trans2": 10}
    weighted = ColumnTransformer(
        [("trans1", Trans(), [0]), ("trans2", Trans(), [1])],
        transformer_weights=weights,
    )
    scaled_rows = [weights["trans1"] * first_1d, weights["trans2"] * second_1d]
    expected_weighted = np.vstack(scaled_rows).T
    check(weighted, expected_weighted)
    assert len(weighted.transformers_) == 2

    weighted = ColumnTransformer(
        [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1}
    )
    assert_array_equal(weighted.fit_transform(X_array), 0.1 * both_cols)
    assert_array_equal(weighted.fit(X_array).transform(X_array), 0.1 * both_cols)
    assert len(weighted.transformers_) == 1
|
|
|
|
|
|
def test_column_transformer_tuple_transformers_parameter():
    """Check that ``transformers`` given as a tuple behaves like a list."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    steps = [("trans1", Trans(), [0]), ("trans2", Trans(), [1])]

    from_list = ColumnTransformer(steps)
    from_tuple = ColumnTransformer(tuple(steps))

    # One-shot path.
    list_out = from_list.fit_transform(X_array)
    tuple_out = from_tuple.fit_transform(X_array)
    assert_array_equal(list_out, tuple_out)

    # Two-step path.
    list_out = from_list.fit(X_array).transform(X_array)
    tuple_out = from_tuple.fit(X_array).transform(X_array)
    assert_array_equal(list_out, tuple_out)
|
|
|
|
|
|
@pytest.mark.parametrize("constructor_name", ["dataframe", "polars"])
def test_column_transformer_dataframe(constructor_name):
    """Exercise label-based and positional column specifiers on dataframes.

    Runs for the "dataframe" and "polars" containers; the scalar / Series
    specifiers and the integer-column-name checks only run for "dataframe".
    """
    library_name = "pandas" if constructor_name == "dataframe" else constructor_name
    dataframe_lib = pytest.importorskip(library_name)

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = _convert_container(
        X_array, constructor_name, columns_name=["first", "second"]
    )

    first_col = np.array([0, 1, 2]).reshape(-1, 1)
    both_cols = X_array

    def check(transformer, expected):
        # ``fit_transform`` and ``fit`` + ``transform`` must both match.
        assert_array_equal(transformer.fit_transform(X_df), expected)
        assert_array_equal(transformer.fit(X_df).transform(X_df), expected)

    def check_no_remainder(transformer, n_transformers):
        assert len(transformer.transformers_) == n_transformers
        assert transformer.transformers_[-1][0] != "remainder"

    # String keys: label based
    # list
    cases = [(["first"], first_col), (["first", "second"], both_cols)]
    # slice
    cases.append((slice("first", "second"), both_cols))
    # int keys: positional
    # list
    cases += [
        ([0], first_col),
        ([0, 1], both_cols),
        (np.array([0, 1]), both_cols),
    ]
    # slice
    cases += [(slice(0, 1), first_col), (slice(0, 2), both_cols)]
    # boolean mask
    cases += [(np.array([True, False]), first_col), ([True, False], first_col)]

    if constructor_name == "dataframe":
        # Scalars are only supported for pandas dataframes.
        # scalar
        cases.append((0, first_col))
        cases.append(("first", first_col))
        cases.append(
            (
                dataframe_lib.Series([True, False], index=["first", "second"]),
                first_col,
            )
        )

    for selection, expected in cases:
        check(
            ColumnTransformer([("trans", Trans(), selection)], remainder="drop"),
            expected,
        )

        # callable that returns any of the allowed specifiers
        check(
            ColumnTransformer(
                [("trans", Trans(), lambda X: selection)], remainder="drop"
            ),
            expected,
        )

    ct = ColumnTransformer(
        [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])]
    )
    check(ct, both_cols)
    check_no_remainder(ct, 2)

    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
    check(ct, both_cols)
    check_no_remainder(ct, 2)

    # test with transformer_weights
    weights = {"trans1": 0.1, "trans2": 10}
    weighted = ColumnTransformer(
        [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])],
        transformer_weights=weights,
    )
    expected_weighted = np.vstack(
        [
            weights["trans1"] * X_df["first"],
            weights["trans2"] * X_df["second"],
        ]
    ).T
    check(weighted, expected_weighted)
    check_no_remainder(weighted, 2)

    # test multiple columns
    weighted = ColumnTransformer(
        [("trans", Trans(), ["first", "second"])], transformer_weights={"trans": 0.1}
    )
    assert_array_equal(weighted.fit_transform(X_df), 0.1 * both_cols)
    assert_array_equal(weighted.fit(X_df).transform(X_df), 0.1 * both_cols)
    check_no_remainder(weighted, 1)

    weighted = ColumnTransformer(
        [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1}
    )
    assert_array_equal(weighted.fit_transform(X_df), 0.1 * both_cols)
    assert_array_equal(weighted.fit(X_df).transform(X_df), 0.1 * both_cols)
    check_no_remainder(weighted, 1)

    # ensure pandas object is passed through

    class TransAssert(BaseEstimator):
        """Transformer asserting the container type it receives in ``transform``."""

        def __init__(self, expected_type_transform):
            self.expected_type_transform = expected_type_transform

        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert isinstance(X, self.expected_type_transform)
            # A Series is converted to a one-column frame before returning.
            if isinstance(X, dataframe_lib.Series):
                X = X.to_frame()
            return X

    frame_checker = TransAssert(expected_type_transform=dataframe_lib.DataFrame)
    ct = ColumnTransformer([("trans", frame_checker, ["first", "second"])])
    ct.fit_transform(X_df)

    if constructor_name == "dataframe":
        # DataFrame protocol does not have 1d columns, so we only test on Pandas
        # dataframes.
        series_checker = TransAssert(expected_type_transform=dataframe_lib.Series)
        ct = ColumnTransformer(
            [("trans", series_checker, "first")],
            remainder="drop",
        )
        ct.fit_transform(X_df)

        # Only test on pandas because the dataframe protocol requires string column
        # names
        # integer column spec + integer column names -> still use positional
        X_df2 = X_df.copy()
        X_df2.columns = [1, 0]
        ct = ColumnTransformer([("trans", Trans(), 0)], remainder="drop")
        assert_array_equal(ct.fit_transform(X_df2), first_col)
        assert_array_equal(ct.fit(X_df2).transform(X_df2), first_col)

        assert len(ct.transformers_) == 2
        remainder_name, remainder_trans, remainder_cols = ct.transformers_[-1]
        assert remainder_name == "remainder"
        assert remainder_trans == "drop"
        assert_array_equal(remainder_cols, [1])
|
|
|
|
|
|
@pytest.mark.parametrize("pandas", [True, False], ids=["pandas", "numpy"])
@pytest.mark.parametrize(
    "column_selection",
    [[], np.array([False, False]), [False, False]],
    ids=["list", "bool", "bool_int"],
)
@pytest.mark.parametrize("callable_column", [False, True])
def test_column_transformer_empty_columns(pandas, column_selection, callable_column):
    """Check that a transformer selecting no columns is tolerated.

    ``TransRaise`` raises from ``fit`` and ``transform``, so every successful
    call below shows that it was not invoked for the empty selection.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    both_cols = X_array

    X = X_array
    if pandas:
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X_array, columns=["first", "second"])

    # Optionally hide the empty selection behind a callable.
    column = (lambda X: column_selection) if callable_column else column_selection

    def check(transformer, expected, raiser_position):
        assert_array_equal(transformer.fit_transform(X), expected)
        assert_array_equal(transformer.fit(X).transform(X), expected)
        assert len(transformer.transformers_) == 2
        assert isinstance(transformer.transformers_[raiser_position][1], TransRaise)

    # Empty-column transformer listed last.
    check(
        ColumnTransformer(
            [("trans1", Trans(), [0, 1]), ("trans2", TransRaise(), column)]
        ),
        both_cols,
        1,
    )

    # Empty-column transformer listed first.
    check(
        ColumnTransformer(
            [("trans1", TransRaise(), column), ("trans2", Trans(), [0, 1])]
        ),
        both_cols,
        0,
    )

    # Only the remainder contributes output (count includes remainder).
    check(
        ColumnTransformer([("trans", TransRaise(), column)], remainder="passthrough"),
        both_cols,
        0,
    )

    # Nothing contributes output: three rows, zero columns (count includes
    # remainder).
    fixture = np.array([[], [], []])
    check(
        ColumnTransformer([("trans", TransRaise(), column)], remainder="drop"),
        fixture,
        0,
    )
|
|
|
|
|
|
def test_column_transformer_output_indices():
    """Check the ``output_indices_`` attribute on array input."""
    X_array = np.arange(6).reshape(3, 2)

    def same_columns(X_trans, fitted, positions, name):
        # Indexing by explicit positions and by the stored slice must agree.
        by_slice = X_trans[:, fitted.output_indices_[name]]
        assert_array_equal(X_trans[:, positions], by_slice)

    empty = slice(0, 0)

    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
    X_trans = ct.fit_transform(X_array)
    expected_indices = {
        "trans1": slice(0, 1),
        "trans2": slice(1, 2),
        "remainder": empty,
    }
    assert ct.output_indices_ == expected_indices
    same_columns(X_trans, ct, [0], "trans1")
    same_columns(X_trans, ct, [1], "trans2")

    # test with transformer_weights and multiple columns
    ct = ColumnTransformer(
        [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1}
    )
    X_trans = ct.fit_transform(X_array)
    assert ct.output_indices_ == {"trans": slice(0, 2), "remainder": empty}
    same_columns(X_trans, ct, [0, 1], "trans")
    same_columns(X_trans, ct, [], "remainder")

    # test case that ensures that the attribute does also work when
    # a given transformer doesn't have any columns to work on
    ct = ColumnTransformer([("trans1", Trans(), [0, 1]), ("trans2", TransRaise(), [])])
    X_trans = ct.fit_transform(X_array)
    expected_indices = {
        "trans1": slice(0, 2),
        "trans2": empty,
        "remainder": empty,
    }
    assert ct.output_indices_ == expected_indices
    same_columns(X_trans, ct, [0, 1], "trans1")
    same_columns(X_trans, ct, [], "trans2")
    same_columns(X_trans, ct, [], "remainder")

    ct = ColumnTransformer([("trans", TransRaise(), [])], remainder="passthrough")
    X_trans = ct.fit_transform(X_array)
    assert ct.output_indices_ == {"trans": empty, "remainder": slice(0, 2)}
    same_columns(X_trans, ct, [], "trans")
    same_columns(X_trans, ct, [0, 1], "remainder")
|
|
|
|
|
|
def test_column_transformer_output_indices_df():
    """Check the ``output_indices_`` attribute with pandas data frames."""
    pd = pytest.importorskip("pandas")

    X_df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=["first", "second"])

    expected_indices = {
        "trans1": slice(0, 1),
        "trans2": slice(1, 2),
        "remainder": slice(0, 0),
    }

    # Same expectations whether columns are given by label or by position.
    by_label = [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])]
    by_position = [("trans1", Trans(), [0]), ("trans2", Trans(), [1])]

    for steps in (by_label, by_position):
        ct = ColumnTransformer(steps)
        X_trans = ct.fit_transform(X_df)
        assert ct.output_indices_ == expected_indices
        for positions, name in [([0], "trans1"), ([1], "trans2"), ([], "remainder")]:
            by_slice = X_trans[:, ct.output_indices_[name]]
            assert_array_equal(X_trans[:, positions], by_slice)
|
|
|
|
|
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_column_transformer_sparse_array(csr_container):
    """Check column selection when the input is a sparse container."""
    X_sparse = csr_container(sparse.eye(3, 2))

    # no distinction between 1D and 2D
    first_col = X_sparse[:, [0]]
    both_cols = X_sparse

    def check(transformer, expected):
        assert sparse.issparse(transformer.fit_transform(X_sparse))
        assert_allclose_dense_sparse(transformer.fit_transform(X_sparse), expected)
        assert_allclose_dense_sparse(
            transformer.fit(X_sparse).transform(X_sparse), expected
        )

    remainder_cases = [("drop", first_col), ("passthrough", both_cols)]
    for col in [(0,), [0], slice(0, 1)]:
        for remainder, expected in remainder_cases:
            check(
                ColumnTransformer(
                    [("trans", Trans(), col)],
                    remainder=remainder,
                    sparse_threshold=0.8,
                ),
                expected,
            )

    for col in [[0, 1], slice(0, 2)]:
        check(
            ColumnTransformer([("trans", Trans(), col)], sparse_threshold=0.8),
            both_cols,
        )
|
|
|
|
|
|
def test_column_transformer_list():
    """Check that a nested Python list is accepted as input."""
    nan = float("nan")
    X_list = [[1, nan, "a"], [0, 0, "b"]]
    expected_result = np.array([[1, nan, 1, 0], [-1, 0, 0, 1]])

    steps = [
        ("numerical", StandardScaler(), [0, 1]),
        ("categorical", OneHotEncoder(), [2]),
    ]
    ct = ColumnTransformer(steps)

    one_shot = ct.fit_transform(X_list)
    assert_array_equal(one_shot, expected_result)

    two_step = ct.fit(X_list).transform(X_list)
    assert_array_equal(two_step, expected_result)
|
|
|
|
|
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_column_transformer_sparse_stacking(csr_container):
    """Check stacking of a dense and a sparse transformer output.

    The output has one column from ``Trans`` followed by the
    ``n_samples x n_samples`` identity produced by ``SparseMatrixTrans``.
    Whether the stacked result is sparse depends on ``sparse_threshold``.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    # Anchor the expected shape to the input; comparing the output's row count
    # with itself would never detect a wrong number of rows.
    n_samples = X_array.shape[0]
    expected_shape = (n_samples, n_samples + 1)

    col_trans = ColumnTransformer(
        [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)],
        sparse_threshold=0.8,
    )
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert sparse.issparse(X_trans)
    assert X_trans.shape == expected_shape
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(n_samples))
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != "remainder"

    col_trans = ColumnTransformer(
        [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)],
        sparse_threshold=0.1,
    )
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert not sparse.issparse(X_trans)
    assert X_trans.shape == expected_shape
    assert_array_equal(X_trans[:, 1:], np.eye(n_samples))
|
|
|
|
|
|
def test_column_transformer_mixed_cols_sparse():
    """Check sparse output when passthrough columns have mixed object dtypes."""
    df = np.array([["a", 1, True], ["b", 2, False]], dtype="O")

    encoder_step = (OneHotEncoder(), [0])
    ct = make_column_transformer(
        encoder_step, ("passthrough", [1, 2]), sparse_threshold=1.0
    )

    # this shouldn't fail, since boolean can be coerced into a numeric
    # See: https://github.com/scikit-learn/scikit-learn/issues/11912
    X_trans = ct.fit_transform(df)
    assert X_trans.getformat() == "csr"
    expected_dense = np.array([[1, 0, 1, 1], [0, 1, 2, 0]])
    assert_array_equal(X_trans.toarray(), expected_dense)

    ct = make_column_transformer(
        (OneHotEncoder(), [0]), ("passthrough", [0]), sparse_threshold=1.0
    )
    # this fails since strings `a` and `b` cannot be
    # coerced into a numeric.
    with pytest.raises(ValueError, match="For a sparse output, all columns should"):
        ct.fit_transform(df)
|
|
|
|
|
|
def test_column_transformer_sparse_threshold():
    """Check how ``sparse_threshold`` decides between sparse and dense output."""
    X_array = np.array([["a", "b"], ["A", "B"]], dtype=object).T
    # above data has sparsity of 4 / 8 = 0.5

    def fit_two_encoders(first_sparse, second_sparse, threshold):
        # Build, fit and return (transformer, output) for two one-hot encoders.
        transformer = ColumnTransformer(
            [
                ("trans1", OneHotEncoder(sparse_output=first_sparse), [0]),
                ("trans2", OneHotEncoder(sparse_output=second_sparse), [1]),
            ],
            sparse_threshold=threshold,
        )
        return transformer, transformer.fit_transform(X_array)

    # apply threshold even if all sparse
    col_trans = ColumnTransformer(
        [("trans1", OneHotEncoder(), [0]), ("trans2", OneHotEncoder(), [1])],
        sparse_threshold=0.2,
    )
    res = col_trans.fit_transform(X_array)
    assert not sparse.issparse(res)
    assert not col_trans.sparse_output_

    # mixed -> sparsity of (4 + 2) / 8 = 0.75
    for thres in [0.75001, 1]:
        col_trans, res = fit_two_encoders(True, False, thres)
        assert sparse.issparse(res)
        assert col_trans.sparse_output_

    for thres in [0.75, 0]:
        col_trans, res = fit_two_encoders(True, False, thres)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_

    # if nothing is sparse -> no sparse
    for thres in [0.33, 0, 1]:
        col_trans, res = fit_two_encoders(False, False, thres)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_
|
|
|
|
|
|
def test_column_transformer_error_msg_1D():
    """Check the errors raised when a scalar column spec yields 1D data."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T

    # StandardScaler receives 1D data through the scalar column spec.
    col_trans = ColumnTransformer([("trans", StandardScaler(), 0)])
    msg = "1D data passed to a transformer"
    for method in (col_trans.fit, col_trans.fit_transform):
        with pytest.raises(ValueError, match=msg):
            method(X_array)

    # The transformer's own error message must surface unchanged.
    col_trans = ColumnTransformer([("trans", TransRaise(), 0)])
    with pytest.raises(ValueError, match="specific message"):
        col_trans.fit(X_array)
    with pytest.raises(ValueError, match="specific message"):
        col_trans.fit_transform(X_array)
|
|
|
|
|
|
def test_2D_transformer_output():
    """Check the error raised when a transformer returns non-2D output."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # if one transformer is dropped, test that name is still correct
    steps = [("trans1", "drop", 0), ("trans2", TransNo2D(), 1)]
    ct = ColumnTransformer(steps)

    msg = "the 'trans2' transformer should be 2D"
    # because fit is also doing transform, this raises already on fit
    for method in (ct.fit_transform, ct.fit):
        with pytest.raises(ValueError, match=msg):
            method(X_array)
|
|
|
|
|
|
def test_2D_transformer_output_pandas():
    """Check the non-2D output error when a pandas column is picked by label."""
    pd = pytest.importorskip("pandas")

    X_df = pd.DataFrame(
        np.array([[0, 1, 2], [2, 4, 6]]).T, columns=["col1", "col2"]
    )

    # A scalar label selects a single column that TransNo2D returns as-is.
    ct = ColumnTransformer([("trans1", TransNo2D(), "col1")])
    msg = "the 'trans1' transformer should be 2D"
    # because fit is also doing transform, this raises already on fit
    for method in (ct.fit_transform, ct.fit):
        with pytest.raises(ValueError, match=msg):
            method(X_df)
|
|
|
|
|
|
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
def test_column_transformer_invalid_columns(remainder):
    """Check the errors raised for invalid column specifications.

    Covers specifiers that are never valid, string-based specifiers used on a
    NumPy array, and a feature-count mismatch between ``fit`` and
    ``transform``.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # general invalid
    for col in [1.5, ["string", 1], slice(1, "s"), np.array([1.0])]:
        ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder)
        with pytest.raises(ValueError, match="No valid specification"):
            ct.fit(X_array)

    # invalid for arrays
    for col in ["string", ["string", "other"], slice("a", "b")]:
        ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder)
        with pytest.raises(ValueError, match="Specifying the columns"):
            ct.fit(X_array)

    # transformed n_features does not match fitted n_features
    col = [0, 1]
    ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder)
    ct.fit(X_array)

    # ``match`` is a regular expression: escape the messages so that the
    # trailing "." is matched literally rather than as a wildcard.
    X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
    msg = re.escape(
        "X has 3 features, but ColumnTransformer is expecting 2 features as input."
    )
    with pytest.raises(ValueError, match=msg):
        ct.transform(X_array_more)

    X_array_fewer = np.array(
        [
            [0, 1, 2],
        ]
    ).T
    err_msg = re.escape(
        "X has 1 features, but ColumnTransformer is expecting 2 features as input."
    )
    with pytest.raises(ValueError, match=err_msg):
        ct.transform(X_array_fewer)
|
|
|
|
|
|
def test_column_transformer_invalid_transformer():
    """Check that an estimator without ``transform`` is rejected on ``fit``."""

    class NoTrans(BaseEstimator):
        """Estimator exposing ``fit`` and ``predict`` but no ``transform``."""

        def fit(self, X, y=None):
            return self

        def predict(self, X):
            return X

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    invalid_ct = ColumnTransformer([("trans", NoTrans(), [0])])
    expected_message = "All estimators should implement fit and transform"
    with pytest.raises(TypeError, match=expected_message):
        invalid_ct.fit(X_array)
|
|
|
|
|
|
def test_make_column_transformer():
    """Check the names, estimators and columns built by the helper."""
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer((scaler, "first"), (norm, ["second"]))

    # Split the (name, transformer, columns) triples into parallel tuples.
    names = tuple(step[0] for step in ct.transformers)
    transformers = tuple(step[1] for step in ct.transformers)
    columns = tuple(step[2] for step in ct.transformers)

    assert names == ("standardscaler", "normalizer")
    assert transformers == (scaler, norm)
    assert columns == ("first", ["second"])
|
|
|
|
|
|
def test_make_column_transformer_pandas():
    """Check that the helper accepts a pandas ``columns`` index as selection."""
    pd = pytest.importorskip("pandas")
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=["first", "second"])
    norm = Normalizer()

    explicit = ColumnTransformer([("norm", Normalizer(), X_df.columns)])
    via_helper = make_column_transformer((norm, X_df.columns))

    explicit_out = explicit.fit_transform(X_df)
    helper_out = via_helper.fit_transform(X_df)
    assert_almost_equal(explicit_out, helper_out)
|
|
|
|
|
|
def test_make_column_transformer_kwargs():
    """Check that keyword arguments are forwarded and unknown ones rejected."""
    scaler = StandardScaler()
    norm = Normalizer()
    scaler_step = (scaler, "first")
    norm_step = (norm, ["second"])

    ct = make_column_transformer(
        scaler_step,
        norm_step,
        n_jobs=3,
        remainder="drop",
        sparse_threshold=0.5,
    )
    reference = make_column_transformer((scaler, "first"), (norm, ["second"]))
    assert ct.transformers == reference.transformers
    assert ct.n_jobs == 3
    assert ct.remainder == "drop"
    assert ct.sparse_threshold == 0.5

    # invalid keyword parameters should raise an error message
    msg = re.escape(
        "make_column_transformer() got an unexpected "
        "keyword argument 'transformer_weights'"
    )
    with pytest.raises(TypeError, match=msg):
        make_column_transformer(
            (scaler, "first"),
            (norm, ["second"]),
            transformer_weights={"pca": 10, "Transf": 1},
        )
|
|
|
|
|
|
def test_make_column_transformer_remainder_transformer():
    """Check that an estimator passed as ``remainder`` is stored on the result."""
    scaler = StandardScaler()
    norm = Normalizer()
    remainder = StandardScaler()

    steps = [(scaler, "first"), (norm, ["second"])]
    ct = make_column_transformer(*steps, remainder=remainder)

    assert ct.remainder == remainder
|
|
|
|
|
|
def test_column_transformer_get_set_params():
    """Check ``get_params`` / ``set_params``, including nested parameters."""
    ct = ColumnTransformer(
        [("trans1", StandardScaler(), [0]), ("trans2", StandardScaler(), [1])]
    )

    def top_level_params():
        # Read ``ct.transformers`` at call time so that changes made through
        # ``set_params`` are reflected in the expectation.
        return {
            "n_jobs": None,
            "remainder": "drop",
            "sparse_threshold": 0.3,
            "transformers": ct.transformers,
            "transformer_weights": None,
            "verbose_feature_names_out": True,
            "verbose": False,
            "force_int_remainder_cols": True,
        }

    def scaler_params(name, scaler):
        # Entries contributed by one default StandardScaler step.
        return {
            name: scaler,
            name + "__copy": True,
            name + "__with_mean": True,
            name + "__with_std": True,
        }

    exp = top_level_params()
    exp.update(scaler_params("trans1", ct.transformers[0][1]))
    exp.update(scaler_params("trans2", ct.transformers[1][1]))

    assert ct.get_params() == exp

    ct.set_params(trans1__with_mean=False)
    assert not ct.get_params()["trans1__with_mean"]

    # Replacing a step by "passthrough" removes its nested parameters.
    ct.set_params(trans1="passthrough")
    exp = top_level_params()
    exp["trans1"] = "passthrough"
    exp.update(scaler_params("trans2", ct.transformers[1][1]))

    assert ct.get_params() == exp
|
|
|
|
|
|
def test_column_transformer_named_estimators():
    """Check item and attribute access on ``named_transformers_`` after fit."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T
    steps = [
        ("trans1", StandardScaler(), [0]),
        ("trans2", StandardScaler(with_std=False), [1]),
    ]
    ct = ColumnTransformer(steps)

    # ``transformers_`` only exists once the estimator is fitted.
    assert not hasattr(ct, "transformers_")
    ct.fit(X_array)
    assert hasattr(ct, "transformers_")

    named = ct.named_transformers_
    assert isinstance(named["trans1"], StandardScaler)
    assert isinstance(named.trans1, StandardScaler)
    assert isinstance(named["trans2"], StandardScaler)
    assert isinstance(named.trans2, StandardScaler)
    assert not named.trans2.with_std
    # check that these are fitted transformers
    assert named.trans1.mean_ == 1.0
|
|
|
|
|
|
def test_column_transformer_cloning():
    """Check that fitting leaves the transformers passed at construction unfitted."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T

    # Same expectation for ``fit`` and for ``fit_transform``.
    for method_name in ("fit", "fit_transform"):
        ct = ColumnTransformer([("trans", StandardScaler(), [0])])
        getattr(ct, method_name)(X_array)

        original_scaler = ct.transformers[0][1]
        fitted_scaler = ct.transformers_[0][1]
        assert not hasattr(original_scaler, "mean_")
        assert hasattr(fitted_scaler, "mean_")
|
|
|
|
|
|
def test_column_transformer_get_feature_names():
    """Check the errors raised by ``get_feature_names_out``."""
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T
    ct = ColumnTransformer([("trans", Trans(), [0, 1])])

    # raise correct error when not fitted
    with pytest.raises(NotFittedError):
        ct.get_feature_names_out()

    # raise correct error when no feature names are available
    ct.fit(X_array)
    expected_message = re.escape(
        "Transformer trans (type Trans) does not provide get_feature_names_out"
    )
    with pytest.raises(AttributeError, match=expected_message):
        ct.get_feature_names_out()
|
|
|
|
|
|
def test_column_transformer_special_strings():
    """Check the ``"drop"`` and ``"passthrough"`` transformer strings."""

    def check_no_remainder(fitted):
        assert len(fitted.transformers_) == 2
        assert fitted.transformers_[-1][0] != "remainder"

    # one 'drop' -> ignore
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", "drop", [1])])
    exp = np.array([[0.0], [1.0], [2.0]])
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)
    check_no_remainder(ct)

    # all 'drop' -> return shape 0 array
    ct = ColumnTransformer([("trans1", "drop", [0]), ("trans2", "drop", [1])])
    empty_shape = (3, 0)
    assert_array_equal(ct.fit(X_array).transform(X_array).shape, empty_shape)
    assert_array_equal(ct.fit_transform(X_array).shape, empty_shape)
    check_no_remainder(ct)

    # 'passthrough'
    X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T
    ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", "passthrough", [1])])
    exp = X_array
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)
    check_no_remainder(ct)
|
|
|
|
|
|
def test_column_transformer_remainder():
    """Check the ``remainder`` parameter for ``"drop"`` and ``"passthrough"``."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    first_col = np.array([0, 1, 2]).reshape(-1, 1)
    second_col = np.array([2, 4, 6]).reshape(-1, 1)
    both_cols = X_array

    def check_output(transformer, expected):
        assert_array_equal(transformer.fit_transform(X_array), expected)
        assert_array_equal(transformer.fit(X_array).transform(X_array), expected)

    def check_passthrough_remainder(fitted, expected_cols):
        # The remainder is stored as the last fitted step.
        assert len(fitted.transformers_) == 2
        name, trans, cols = fitted.transformers_[-1]
        assert name == "remainder"
        assert isinstance(trans, FunctionTransformer)
        assert_array_equal(cols, expected_cols)

    # default drop
    ct = ColumnTransformer([("trans1", Trans(), [0])])
    check_output(ct, first_col)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == "remainder"
    assert ct.transformers_[-1][1] == "drop"
    assert_array_equal(ct.transformers_[-1][2], [1])

    # specify passthrough
    ct = ColumnTransformer([("trans", Trans(), [0])], remainder="passthrough")
    check_output(ct, both_cols)
    check_passthrough_remainder(ct, [1])

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([("trans1", Trans(), [1])], remainder="passthrough")
    check_output(ct, both_cols[:, ::-1])
    check_passthrough_remainder(ct, [0])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([("trans1", "drop", [0])], remainder="passthrough")
    check_output(ct, second_col)
    check_passthrough_remainder(ct, [1])

    # check default for make_column_transformer
    ct = make_column_transformer((Trans(), [0]))
    assert ct.remainder == "drop"
|
|
|
|
|
|
# TODO(1.7): check for deprecated force_int_remainder_cols
|
|
# TODO(1.9): remove force_int but keep the test
|
|
@pytest.mark.parametrize(
    "cols1, cols2",
    [
        ([0], [False, True, False]),  # mix types
        ([0], [1]),  # ints
        (lambda x: [0], lambda x: [1]),  # callables
    ],
)
@pytest.mark.parametrize("force_int", [False, True])
def test_column_transformer_remainder_dtypes_ints(force_int, cols1, cols2):
    """Remainder columns are stored as integer indices whenever the other
    column specifiers are not all column names or all masks, whatever the
    value of `force_int_remainder_cols`.
    """
    data = np.ones((1, 3))
    transformer = make_column_transformer(
        (Trans(), cols1),
        (Trans(), cols2),
        remainder="passthrough",
        force_int_remainder_cols=force_int,
    )

    # Turn every warning into an error so that neither fitting nor reading
    # the stored remainder columns can warn silently.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        transformer.fit_transform(data)
        assert transformer.transformers_[-1][-1][0] == 2
|
|
|
|
|
|
# TODO(1.7): check for deprecated force_int_remainder_cols
|
|
# TODO(1.9): remove force_int but keep the test
|
|
@pytest.mark.parametrize(
    "force_int, cols1, cols2, expected_cols",
    [
        (True, ["A"], ["B"], [2]),
        (False, ["A"], ["B"], ["C"]),
        (True, [True, False, False], [False, True, False], [2]),
        (False, [True, False, False], [False, True, False], [False, False, True]),
    ],
)
def test_column_transformer_remainder_dtypes(force_int, cols1, cols2, expected_cols):
    """The remainder columns use the same format as the other columns when
    those are all strings or all masks, unless `force_int = True`.
    """
    data = np.ones((1, 3))

    # Column names only make sense on a dataframe input.
    if isinstance(cols1[0], str):
        pd = pytest.importorskip("pandas")
        data = pd.DataFrame(data, columns=["A", "B", "C"])

    # Named inputs keep named remainder columns unless
    # force_int_remainder_cols is True.
    transformer = make_column_transformer(
        (Trans(), cols1),
        (Trans(), cols2),
        remainder="passthrough",
        force_int_remainder_cols=force_int,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        transformer.fit_transform(data)

    if not force_int:
        # Without forcing ints, reading the remainder columns must be silent.
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            cols = transformer.transformers_[-1][-1]
            cols[0]
    else:
        # With forced ints, accessing the remainder columns shows a warning.
        match = "The format of the columns of the 'remainder' transformer"
        cols = transformer.transformers_[-1][-1]
        with pytest.warns(FutureWarning, match=match):
            cols[0]

    assert cols == expected_cols
|
|
|
|
|
|
def test_remainder_list_repr():
    """`_RemainderColsList` renders like the plain list it wraps."""
    expected_text = "[0, 1]"
    remainder_cols = _RemainderColsList([0, 1], warning_enabled=False)

    assert str(remainder_cols) == expected_text
    assert repr(remainder_cols) == expected_text

    # The pretty-printer hook must emit exactly the same text, once.
    printer = Mock()
    remainder_cols._repr_pretty_(printer, False)
    printer.text.assert_called_once_with(expected_text)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "key, expected_cols",
    [
        ([0], [1]),
        (np.array([0]), [1]),
        (slice(0, 1), [1]),
        (np.array([True, False]), [False, True]),
    ],
)
def test_column_transformer_remainder_numpy(key, expected_cols):
    """Passthrough remainder with the various column specifiers accepted for
    a numpy input."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    # First column goes through Trans, second is passed through: the output
    # equals the input.
    expected_output = X_array

    ct = ColumnTransformer(
        [("trans1", Trans(), key)],
        remainder="passthrough",
        force_int_remainder_cols=False,
    )
    assert_array_equal(ct.fit_transform(X_array), expected_output)
    assert_array_equal(ct.fit(X_array).transform(X_array), expected_output)

    assert len(ct.transformers_) == 2
    remainder_entry = ct.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert isinstance(remainder_entry[1], FunctionTransformer)
    assert remainder_entry[2] == expected_cols
|
|
|
|
|
|
@pytest.mark.parametrize(
    "key, expected_cols",
    [
        ([0], [1]),
        (slice(0, 1), [1]),
        (np.array([True, False]), [False, True]),
        (["first"], ["second"]),
        ("pd-index", ["second"]),
        (np.array(["first"]), ["second"]),
        (np.array(["first"], dtype=object), ["second"]),
        (slice(None, "first"), ["second"]),
        (slice("first", "first"), ["second"]),
    ],
)
def test_column_transformer_remainder_pandas(key, expected_cols):
    """Passthrough remainder with the various column specifiers accepted for
    a pandas input."""
    pd = pytest.importorskip("pandas")
    # "pd-index" is a placeholder: the real pandas Index can only be built
    # once pandas has been imported.
    if isinstance(key, str) and key == "pd-index":
        key = pd.Index(["first"])

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=["first", "second"])
    expected_output = X_array

    ct = ColumnTransformer(
        [("trans1", Trans(), key)],
        remainder="passthrough",
        force_int_remainder_cols=False,
    )
    assert_array_equal(ct.fit_transform(X_df), expected_output)
    assert_array_equal(ct.fit(X_df).transform(X_df), expected_output)

    assert len(ct.transformers_) == 2
    remainder_entry = ct.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert isinstance(remainder_entry[1], FunctionTransformer)
    assert remainder_entry[2] == expected_cols
|
|
|
|
|
|
@pytest.mark.parametrize(
    "key, expected_cols",
    [
        ([0], [1, 2]),
        (np.array([0]), [1, 2]),
        (slice(0, 1), [1, 2]),
        (np.array([True, False, False]), [False, True, True]),
    ],
)
def test_column_transformer_remainder_transformer(key, expected_cols):
    """An estimator given as `remainder` is applied to the leftover columns."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    # remainder = DoubleTrans doubles the second and third columns
    expected_output = X_array.copy()
    expected_output[:, 1:3] *= 2

    ct = ColumnTransformer(
        [("trans1", Trans(), key)],
        remainder=DoubleTrans(),
        force_int_remainder_cols=False,
    )

    assert_array_equal(ct.fit_transform(X_array), expected_output)
    assert_array_equal(ct.fit(X_array).transform(X_array), expected_output)

    assert len(ct.transformers_) == 2
    remainder_entry = ct.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert isinstance(remainder_entry[1], DoubleTrans)
    assert remainder_entry[2] == expected_cols
|
|
|
|
|
|
def test_column_transformer_no_remaining_remainder_transformer():
    """No remainder entry is recorded when every column is already selected."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    every_column = [0, 1, 2]
    ct = ColumnTransformer(
        [("trans1", Trans(), every_column)], remainder=DoubleTrans()
    )

    # Trans leaves the data untouched and DoubleTrans has nothing to process.
    assert_array_equal(ct.fit_transform(X_array), X_array)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_array)
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] != "remainder"
|
|
|
|
|
|
def test_column_transformer_drops_all_remainder_transformer():
    """Only the remainder estimator contributes when the sole explicit
    transformer is "drop"."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    # remainder = DoubleTrans doubles the two columns that are not dropped
    expected_output = 2 * X_array.copy()[:, 1:3]

    ct = ColumnTransformer([("trans1", "drop", [0])], remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), expected_output)
    assert_array_equal(ct.fit(X_array).transform(X_array), expected_output)

    assert len(ct.transformers_) == 2
    remainder_entry = ct.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert isinstance(remainder_entry[1], DoubleTrans)
    assert_array_equal(remainder_entry[2], [1, 2])
|
|
|
|
|
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_column_transformer_sparse_remainder_transformer(csr_container):
    """A sparse-producing remainder estimator is stacked next to a dense
    transformer output."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    ct = ColumnTransformer(
        [("trans1", Trans(), [0])],
        remainder=SparseMatrixTrans(csr_container),
        sparse_threshold=0.8,
    )

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)
    # SparseMatrixTrans returns an (n_samples, n_samples) identity, i.e. 3
    # features here, and ``transformers`` contributes one column, thus:
    assert X_trans.shape == (3, 3 + 1)

    first_column = X_array[:, 0].reshape(-1, 1)
    expected_dense = np.hstack((first_column, np.eye(3)))
    assert_array_equal(X_trans.toarray(), expected_dense)

    assert len(ct.transformers_) == 2
    remainder_entry = ct.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert isinstance(remainder_entry[1], SparseMatrixTrans)
    assert_array_equal(remainder_entry[2], [1, 2])
|
|
|
|
|
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_column_transformer_drop_all_sparse_remainder_transformer(csr_container):
    """The output is the sparse remainder alone when the only explicit
    transformer is "drop"."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    ct = ColumnTransformer(
        [("trans1", "drop", [0])],
        remainder=SparseMatrixTrans(csr_container),
        sparse_threshold=0.8,
    )

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)

    # SparseMatrixTrans returns an (n_samples, n_samples) identity, thus:
    assert X_trans.shape == (3, 3)
    assert_array_equal(X_trans.toarray(), np.eye(3))

    assert len(ct.transformers_) == 2
    remainder_entry = ct.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert isinstance(remainder_entry[1], SparseMatrixTrans)
    assert_array_equal(remainder_entry[2], [1, 2])
|
|
|
|
|
|
def test_column_transformer_get_set_params_with_remainder():
    """`get_params` / `set_params` expose the remainder estimator's nested
    parameters under the ``remainder__`` prefix."""
    ct = ColumnTransformer(
        [("trans1", StandardScaler(), [0])], remainder=StandardScaler()
    )

    expected_params = {
        "n_jobs": None,
        "remainder": ct.remainder,
        "remainder__copy": True,
        "remainder__with_mean": True,
        "remainder__with_std": True,
        "sparse_threshold": 0.3,
        "trans1": ct.transformers[0][1],
        "trans1__copy": True,
        "trans1__with_mean": True,
        "trans1__with_std": True,
        "transformers": ct.transformers,
        "transformer_weights": None,
        "verbose_feature_names_out": True,
        "verbose": False,
        "force_int_remainder_cols": True,
    }

    assert ct.get_params() == expected_params

    # A nested remainder parameter can be changed through set_params.
    ct.set_params(remainder__with_std=False)
    assert not ct.get_params()["remainder__with_std"]

    # Replacing trans1 by a string removes its nested ``trans1__*`` entries.
    ct.set_params(trans1="passthrough")
    expected_params = {
        "n_jobs": None,
        "remainder": ct.remainder,
        "remainder__copy": True,
        "remainder__with_mean": True,
        "remainder__with_std": False,
        "sparse_threshold": 0.3,
        "trans1": "passthrough",
        "transformers": ct.transformers,
        "transformer_weights": None,
        "verbose_feature_names_out": True,
        "verbose": False,
        "force_int_remainder_cols": True,
    }
    assert ct.get_params() == expected_params
|
|
|
|
|
|
def test_column_transformer_no_estimators():
    """With an empty transformer list, the remainder estimator receives
    every column."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).astype("float").T
    ct = ColumnTransformer([], remainder=StandardScaler())

    # Nested remainder parameters are available before fitting.
    assert ct.get_params()["remainder__with_mean"]

    X_trans = ct.fit_transform(X_array)
    assert X_trans.shape == X_array.shape
    assert len(ct.transformers_) == 1

    remainder_entry = ct.transformers_[-1]
    assert remainder_entry[0] == "remainder"
    assert remainder_entry[2] == [0, 1, 2]
|
|
|
|
|
|
@pytest.mark.parametrize(
    ["est", "pattern"],
    [
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", Trans(), [1])],
                remainder=DoubleTrans(),
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n"
                r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", Trans(), [1])],
                remainder="passthrough",
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n"
                r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", "drop", [1])],
                remainder="passthrough",
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 2\) Processing remainder.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", "passthrough", [1])],
                remainder="passthrough",
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n"
                r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer([("trans1", Trans(), [0])], remainder="passthrough"),
            (
                r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 2\) Processing remainder.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer(
                [("trans1", Trans(), [0]), ("trans2", Trans(), [1])], remainder="drop"
            ),
            (
                r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n"
                r"\[ColumnTransformer\].*\(2 of 2\) Processing trans2.* total=.*\n$"
            ),
        ),
        (
            ColumnTransformer([("trans1", Trans(), [0])], remainder="drop"),
            r"\[ColumnTransformer\].*\(1 of 1\) Processing trans1.* total=.*\n$",
        ),
    ],
)
@pytest.mark.parametrize("method", ["fit", "fit_transform"])
def test_column_transformer_verbose(est, pattern, method, capsys):
    """`verbose=True` prints one progress line per processed transformer and
    `verbose=False` prints nothing."""
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    func = getattr(est, method)

    # Silent mode: nothing may reach stdout.
    est.set_params(verbose=False)
    func(X_array)
    assert not capsys.readouterr().out, "Got output for verbose=False"

    # Verbose mode: stdout must match the expected progress lines.
    est.set_params(verbose=True)
    func(X_array)
    captured_stdout = capsys.readouterr()[0]
    assert re.match(pattern, captured_stdout)
|
|
|
|
|
|
def test_column_transformer_no_estimators_set_params():
    """`set_params` works on a ColumnTransformer with no transformers."""
    empty_ct = ColumnTransformer([])
    # set_params returns the estimator, whose attribute is then inspected.
    updated = empty_ct.set_params(n_jobs=2)
    assert updated.n_jobs == 2
|
|
|
|
|
|
def test_column_transformer_callable_specifier():
    """A callable column specifier is called with the full input array."""
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    expected_first = np.array([[0, 1, 2]]).T

    def select_first(X):
        # The selector must see the whole array, not a slice of it.
        assert_array_equal(X, X_array)
        return [0]

    ct = ColumnTransformer([("trans", Trans(), select_first)], remainder="drop")
    assert_array_equal(ct.fit_transform(X_array), expected_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), expected_first)

    # The constructor argument stays a callable; the fitted copy holds the
    # columns it returned.
    assert callable(ct.transformers[0][2])
    assert ct.transformers_[0][2] == [0]
|
|
|
|
|
|
def test_column_transformer_callable_specifier_dataframe():
    """A callable column specifier is called with the full dataframe."""
    pd = pytest.importorskip("pandas")
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    expected_first = np.array([[0, 1, 2]]).T

    X_df = pd.DataFrame(X_array, columns=["first", "second"])

    def select_first(X):
        # The selector must see every column and every value of the frame.
        assert_array_equal(X.columns, X_df.columns)
        assert_array_equal(X.values, X_df.values)
        return ["first"]

    ct = ColumnTransformer([("trans", Trans(), select_first)], remainder="drop")
    assert_array_equal(ct.fit_transform(X_df), expected_first)
    assert_array_equal(ct.fit(X_df).transform(X_df), expected_first)

    # The constructor argument stays a callable; the fitted copy holds the
    # column names it returned.
    assert callable(ct.transformers[0][2])
    assert ct.transformers_[0][2] == ["first"]
|
|
|
|
|
|
def test_column_transformer_negative_column_indexes():
    """A negative column index selects the same column as its positive
    equivalent: index -1 and index 2 must give identical outputs on a
    three-column input.
    """
    # Use a seeded, local generator rather than the global numpy random
    # state so the fixture is identical on every run and does not depend on
    # (or disturb) other tests.
    rng = np.random.RandomState(0)
    X = rng.randn(2, 2)
    X_categories = np.array([[1], [2]])
    # The categorical column is appended last, i.e. at index 2 == -1.
    X = np.concatenate([X, X_categories], axis=1)

    ohe = OneHotEncoder()

    tf_1 = ColumnTransformer([("ohe", ohe, [-1])], remainder="passthrough")
    tf_2 = ColumnTransformer([("ohe", ohe, [2])], remainder="passthrough")
    assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
|
|
|
|
|
|
@pytest.mark.parametrize("array_type", [np.asarray, *CSR_CONTAINERS])
def test_column_transformer_mask_indexing(array_type):
    """Regression test for #14510: a boolean array-like does not behave as a
    boolean array with sparse matrices."""
    dense = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]])
    X = array_type(dense)

    column_mask = [False, True, False, True]
    column_transformer = ColumnTransformer(
        [("identity", FunctionTransformer(), column_mask)]
    )
    X_trans = column_transformer.fit_transform(X)
    # Two of the four columns are selected by the mask.
    assert X_trans.shape == (3, 2)
|
|
|
|
|
|
def test_n_features_in():
    """`n_features_in_` is the number of columns passed to the column
    transformer and only exists after fitting."""
    X = [[1, 2], [3, 4], [5, 6]]
    steps = [("a", DoubleTrans(), [0]), ("b", DoubleTrans(), [1])]
    ct = ColumnTransformer(steps)

    assert not hasattr(ct, "n_features_in_")
    ct.fit(X)
    assert ct.n_features_in_ == 2
|
|
|
|
|
|
@pytest.mark.parametrize(
    "cols, pattern, include, exclude",
    [
        (["col_int", "col_float"], None, np.number, None),
        (["col_int", "col_float"], None, None, object),
        (["col_int", "col_float"], None, [int, float], None),
        (["col_str"], None, [object], None),
        (["col_str"], None, object, None),
        (["col_float"], None, float, None),
        (["col_float"], "at$", [np.number], None),
        (["col_int"], None, [int], None),
        (["col_int"], "^col_int", [np.number], None),
        (["col_float", "col_str"], "float|str", None, None),
        (["col_str"], "^col_s", None, [int]),
        ([], "str$", float, None),
        (["col_int", "col_float", "col_str"], None, [np.number, object], None),
    ],
)
def test_make_column_selector_with_select_dtypes(cols, pattern, include, exclude):
    """`make_column_selector` returns the expected column names for
    combinations of name pattern and dtype include/exclude filters."""
    pd = pytest.importorskip("pandas")

    frame_data = {
        "col_int": np.array([0, 1, 2], dtype=int),
        "col_float": np.array([0.0, 1.0, 2.0], dtype=float),
        "col_str": ["one", "two", "three"],
    }
    X_df = pd.DataFrame(frame_data, columns=["col_int", "col_float", "col_str"])

    selector = make_column_selector(
        dtype_include=include, dtype_exclude=exclude, pattern=pattern
    )

    selected = selector(X_df)
    assert_array_equal(selected, cols)
|
|
|
|
|
|
def test_column_transformer_with_make_column_selector():
    """Functional test: a column transformer driven by column selectors
    gives the same output as one given explicit column names."""
    pd = pytest.importorskip("pandas")
    frame_data = {
        "col_int": np.array([0, 1, 2], dtype=int),
        "col_float": np.array([0.0, 1.0, 2.0], dtype=float),
        "col_cat": ["one", "two", "one"],
        "col_str": ["low", "middle", "high"],
    }
    X_df = pd.DataFrame(
        frame_data, columns=["col_int", "col_float", "col_cat", "col_str"]
    )
    X_df["col_str"] = X_df["col_str"].astype("category")

    cat_selector = make_column_selector(dtype_include=["category", object])
    num_selector = make_column_selector(dtype_include=np.number)

    ohe = OneHotEncoder()
    scaler = StandardScaler()

    # Same estimators, once with selectors and once with explicit names.
    ct_selector = make_column_transformer((ohe, cat_selector), (scaler, num_selector))
    ct_direct = make_column_transformer(
        (ohe, ["col_cat", "col_str"]), (scaler, ["col_float", "col_int"])
    )

    X_selector = ct_selector.fit_transform(X_df)
    X_direct = ct_direct.fit_transform(X_df)

    assert_allclose(X_selector, X_direct)
|
|
|
|
|
|
def test_make_column_selector_error():
    """Calling a column selector on a numpy array raises a ValueError."""
    not_a_dataframe = np.array([[0.1, 0.2]])
    selector = make_column_selector(dtype_include=np.number)

    msg = "make_column_selector can only be applied to pandas dataframes"
    with pytest.raises(ValueError, match=msg):
        selector(not_a_dataframe)
|
|
|
|
|
|
def test_make_column_selector_pickle():
    """A column selector selects the same columns after a pickle round trip."""
    pd = pytest.importorskip("pandas")

    frame_data = {
        "col_int": np.array([0, 1, 2], dtype=int),
        "col_float": np.array([0.0, 1.0, 2.0], dtype=float),
        "col_str": ["one", "two", "three"],
    }
    X_df = pd.DataFrame(frame_data, columns=["col_int", "col_float", "col_str"])

    selector = make_column_selector(dtype_include=[object])
    serialized = pickle.dumps(selector)
    selector_pickled = pickle.loads(serialized)

    assert_array_equal(selector(X_df), selector_pickled(X_df))
|
|
|
|
|
|
@pytest.mark.parametrize(
    "empty_col",
    [[], np.array([], dtype=int), lambda x: []],
    ids=["list", "array", "callable"],
)
def test_feature_names_empty_columns(empty_col):
    """A transformer given an empty column selection contributes no output
    feature names."""
    pd = pytest.importorskip("pandas")

    df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]})

    ct = ColumnTransformer(
        transformers=[
            ("ohe", OneHotEncoder(), ["col1", "col2"]),
            ("empty_features", OneHotEncoder(), empty_col),
        ],
    )
    ct.fit(df)

    # Only the "ohe" step shows up in the names.
    expected_names = ["ohe__col1_a", "ohe__col1_b", "ohe__col2_z"]
    assert_array_equal(ct.get_feature_names_out(), expected_names)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "selector",
    [
        [1],
        lambda x: [1],
        ["col2"],
        lambda x: ["col2"],
        [False, True],
        lambda x: [False, True],
    ],
)
def test_feature_names_out_pandas(selector):
    """Output feature name when only the second dataframe column is
    selected, for each supported selector kind."""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]})

    ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)])
    ct.fit(df)

    names_out = ct.get_feature_names_out()
    assert_array_equal(names_out, ["ohe__col2_z"])
|
|
|
|
|
|
@pytest.mark.parametrize(
    "selector", [[1], lambda x: [1], [False, True], lambda x: [False, True]]
)
def test_feature_names_out_non_pandas(selector):
    """Output feature name when the second column of a non-pandas input is
    selected."""
    X = [["a", "z"], ["a", "z"], ["b", "z"]]

    ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)])
    ct.fit(X)

    names_out = ct.get_feature_names_out()
    assert_array_equal(names_out, ["ohe__x1_z"])
|
|
|
|
|
|
@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()])
def test_sk_visual_block_remainder(remainder):
    """remainder='passthrough' or an estimator is listed in the visual block
    used by repr_html."""
    ohe = OneHotEncoder()
    selected = ["col1", "col2"]
    ct = ColumnTransformer(
        transformers=[("ohe", ohe, selected)], remainder=remainder
    )

    visual_block = ct._sk_visual_block_()
    assert visual_block.names == ("ohe", "remainder")
    # Before fitting, the remainder has no column details yet.
    assert visual_block.name_details == (["col1", "col2"], "")
    assert visual_block.estimators == (ohe, remainder)
|
|
|
|
|
|
def test_sk_visual_block_remainder_drop():
    """The default remainder ('drop') is not listed in the visual block used
    by repr_html."""
    ohe = OneHotEncoder()
    selected = ["col1", "col2"]
    ct = ColumnTransformer(transformers=[("ohe", ohe, selected)])

    visual_block = ct._sk_visual_block_()
    assert visual_block.names == ("ohe",)
    assert visual_block.name_details == (["col1", "col2"],)
    assert visual_block.estimators == (ohe,)
|
|
|
|
|
|
@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()])
def test_sk_visual_block_remainder_fitted_pandas(remainder):
    """After fitting on a dataframe, the visual block lists the remainder's
    column names."""
    pd = pytest.importorskip("pandas")
    ohe = OneHotEncoder()
    ct = ColumnTransformer(
        transformers=[("ohe", ohe, ["col1", "col2"])],
        remainder=remainder,
        force_int_remainder_cols=False,
    )
    frame_data = {
        "col1": ["a", "b", "c"],
        "col2": ["z", "z", "z"],
        "col3": [1, 2, 3],
        "col4": [3, 4, 5],
    }
    df = pd.DataFrame(frame_data)
    ct.fit(df)

    visual_block = ct._sk_visual_block_()
    assert visual_block.names == ("ohe", "remainder")
    assert visual_block.name_details == (["col1", "col2"], ["col3", "col4"])
    assert visual_block.estimators == (ohe, remainder)
|
|
|
|
|
|
@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()])
def test_sk_visual_block_remainder_fitted_numpy(remainder):
    """After fitting on a numpy array, the visual block lists the
    remainder's column indices."""
    X = np.array([[1, 2, 3], [4, 5, 6]], dtype=float)
    scaler = StandardScaler()
    scaled_columns = [0, 2]
    ct = ColumnTransformer(
        transformers=[("scale", scaler, scaled_columns)], remainder=remainder
    )
    ct.fit(X)

    visual_block = ct._sk_visual_block_()
    assert visual_block.names == ("scale", "remainder")
    assert visual_block.name_details == ([0, 2], [1])
    assert visual_block.estimators == (scaler, remainder)
|
|
|
|
|
|
@pytest.mark.parametrize("explicit_colname", ["first", "second", 0, 1])
@pytest.mark.parametrize("remainder", [Trans(), "passthrough", "drop"])
def test_column_transformer_reordered_column_names_remainder(
    explicit_colname, remainder
):
    """Interaction between the remainder and dataframes whose columns are
    reordered or extended at transform time."""
    pd = pytest.importorskip("pandas")

    X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_fit_df = pd.DataFrame(X_fit_array, columns=["first", "second"])

    # Same data as the fit frame, with the two columns swapped.
    X_trans_array = np.array([[2, 4, 6], [0, 1, 2]]).T
    X_trans_df = pd.DataFrame(X_trans_array, columns=["second", "first"])

    tf = ColumnTransformer([("bycol", Trans(), explicit_colname)], remainder=remainder)

    tf.fit(X_fit_df)
    X_fit_trans = tf.transform(X_fit_df)

    # Changing the order still works
    reordered_result = tf.transform(X_trans_df)
    assert_allclose(reordered_result, X_fit_trans)

    # extra columns are ignored
    X_extended_df = X_fit_df.copy()
    X_extended_df["third"] = [3, 6, 9]
    extended_result = tf.transform(X_extended_df)
    assert_allclose(extended_result, X_fit_trans)

    if isinstance(explicit_colname, str):
        # Raise error if columns are specified by names but input only allows
        # to specify by position, e.g. numpy array instead of a pandas df.
        X_array = X_fit_array.copy()
        err_msg = "Specifying the columns"
        with pytest.raises(ValueError, match=err_msg):
            tf.transform(X_array)
|
|
|
|
|
|
def test_feature_name_validation_missing_columns_drop_passthough():
    """Interaction between {'drop', 'passthrough'} and column names that are
    missing at transform time."""
    pd = pytest.importorskip("pandas")

    df = pd.DataFrame(np.ones(shape=(3, 4)), columns=["a", "b", "c", "d"])
    df_dropped = df.drop("c", axis=1)

    # with remainder='passthrough', all columns seen during `fit` must be
    # present
    tf = ColumnTransformer([("bycol", Trans(), [1])], remainder="passthrough")
    tf.fit(df)
    msg = r"columns are missing: {'c'}"
    with pytest.raises(ValueError, match=msg):
        tf.transform(df_dropped)

    # with remainder='drop', it is allowed to have column 'c' missing
    tf = ColumnTransformer([("bycol", Trans(), [1])], remainder="drop")
    tf.fit(df)

    trans_without_c = tf.transform(df_dropped)
    trans_full = tf.transform(df)
    assert_allclose(trans_without_c, trans_full)

    # bycol drops 'c', thus it is allowed for 'c' to be missing
    tf = ColumnTransformer([("bycol", "drop", ["c"])], remainder="passthrough")
    tf.fit(df)
    trans_without_c = tf.transform(df_dropped)
    trans_full = tf.transform(df)
    assert_allclose(trans_without_c, trans_full)
|
|
|
|
|
|
def test_feature_names_in_():
    """Feature names are stored in column transformer.

    Column transformer deliberately does not check for column name consistency.
    It only checks that the non-dropped names seen in `fit` are seen
    in `transform`. This behavior is already tested in
    `test_feature_name_validation_missing_columns_drop_passthough`"""

    pd = pytest.importorskip("pandas")

    feature_names = ["a", "c", "d"]
    df = pd.DataFrame([[1, 2, 3]], columns=feature_names)
    ct = ColumnTransformer([("bycol", Trans(), ["a", "d"])], remainder="passthrough")
    ct.fit(df)

    # The names are kept as an object-dtype ndarray matching the input order.
    stored_names = ct.feature_names_in_
    assert_array_equal(stored_names, feature_names)
    assert isinstance(stored_names, np.ndarray)
    assert stored_names.dtype == object
|
|
|
|
|
|
class TransWithNames(Trans):
    """`Trans` variant whose output feature names can be fixed up front."""

    def __init__(self, feature_names_out=None):
        # Names to report from get_feature_names_out; None means "echo the
        # input feature names".
        self.feature_names_out = feature_names_out

    def get_feature_names_out(self, input_features=None):
        """Return the configured names as an object array, or
        `input_features` unchanged when no names were configured."""
        if self.feature_names_out is None:
            return input_features
        return np.asarray(self.feature_names_out, dtype=object)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "transformers, remainder, expected_names",
    [
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "passthrough",
            ["bycol1__d", "bycol1__c", "bycol2__d", "remainder__a", "remainder__b"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "drop",
            ["bycol1__d", "bycol1__c", "bycol2__d"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", ["d"]),
            ],
            "passthrough",
            ["bycol1__b", "remainder__a", "remainder__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"]),
            ],
            "passthrough",
            ["bycol1__pca1", "bycol1__pca2", "remainder__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["d"]),
                ("bycol2", "passthrough", ["b"]),
            ],
            "drop",
            ["bycol1__a", "bycol1__b", "bycol2__b"],
        ),
        (
            [
                ("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]),
                ("bycol2", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]),
            ],
            "passthrough",
            [
                "bycol1__pca0",
                "bycol1__pca1",
                "bycol2__pca0",
                "bycol2__pca1",
                "remainder__a",
                "remainder__c",
                "remainder__d",
            ],
        ),
        (
            [
                ("bycol1", "drop", ["d"]),
            ],
            "drop",
            [],
        ),
        (
            [
                ("bycol1", TransWithNames(), slice(1, 3)),
            ],
            "drop",
            ["bycol1__b", "bycol1__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", slice(3, 4)),
            ],
            "passthrough",
            ["bycol1__b", "remainder__a", "remainder__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice(3, 4)),
            ],
            "passthrough",
            ["bycol1__d", "bycol1__c", "bycol2__d", "remainder__a", "remainder__b"],
        ),
        (
            [
                ("bycol1", TransWithNames(), slice("b", "c")),
            ],
            "drop",
            ["bycol1__b", "bycol1__c"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "drop", slice("c", "d")),
            ],
            "passthrough",
            ["bycol1__b", "remainder__a"],
        ),
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice("c", "d")),
            ],
            "passthrough",
            [
                "bycol1__d",
                "bycol1__c",
                "bycol2__c",
                "bycol2__d",
                "remainder__a",
                "remainder__b",
            ],
        ),
    ],
)
def test_verbose_feature_names_out_true(transformers, remainder, expected_names):
    """Names returned by get_feature_names_out with the default
    verbose_feature_names_out=True are prefixed with the transformer name."""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"])

    ct = ColumnTransformer(
        transformers,
        remainder=remainder,
    )
    ct.fit(df)

    # The result must be an object-dtype ndarray holding the expected names.
    names = ct.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected_names)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"transformers, remainder, expected_names",
|
|
[
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(), ["d", "c"]),
|
|
("bycol2", "passthrough", ["a"]),
|
|
],
|
|
"passthrough",
|
|
["d", "c", "a", "b"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(["a"]), ["d", "c"]),
|
|
("bycol2", "passthrough", ["d"]),
|
|
],
|
|
"drop",
|
|
["a", "d"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(), ["b"]),
|
|
("bycol2", "drop", ["d"]),
|
|
],
|
|
"passthrough",
|
|
["b", "a", "c"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"]),
|
|
],
|
|
"passthrough",
|
|
["pca1", "pca2", "c"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(["a", "c"]), ["d"]),
|
|
("bycol2", "passthrough", ["d"]),
|
|
],
|
|
"drop",
|
|
["a", "c", "d"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]),
|
|
("bycol2", TransWithNames([f"kpca{i}" for i in range(2)]), ["b"]),
|
|
],
|
|
"passthrough",
|
|
["pca0", "pca1", "kpca0", "kpca1", "a", "c", "d"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", "drop", ["d"]),
|
|
],
|
|
"drop",
|
|
[],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(), slice(1, 2)),
|
|
("bycol2", "drop", ["d"]),
|
|
],
|
|
"passthrough",
|
|
["b", "a", "c"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(), ["b"]),
|
|
("bycol2", "drop", slice(3, 4)),
|
|
],
|
|
"passthrough",
|
|
["b", "a", "c"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(), ["d", "c"]),
|
|
("bycol2", "passthrough", slice(0, 2)),
|
|
],
|
|
"drop",
|
|
["d", "c", "a", "b"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(), slice("a", "b")),
|
|
("bycol2", "drop", ["d"]),
|
|
],
|
|
"passthrough",
|
|
["a", "b", "c"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(), ["b"]),
|
|
("bycol2", "drop", slice("c", "d")),
|
|
],
|
|
"passthrough",
|
|
["b", "a"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(), ["d", "c"]),
|
|
("bycol2", "passthrough", slice("a", "b")),
|
|
],
|
|
"drop",
|
|
["d", "c", "a", "b"],
|
|
),
|
|
(
|
|
[
|
|
("bycol1", TransWithNames(), ["d", "c"]),
|
|
("bycol2", "passthrough", slice("b", "b")),
|
|
],
|
|
"drop",
|
|
["d", "c", "b"],
|
|
),
|
|
],
|
|
)
|
|
def test_verbose_feature_names_out_false(transformers, remainder, expected_names):
    """Check the names returned when ``verbose_feature_names_out=False``.

    A ``ColumnTransformer`` is fitted on a one-row DataFrame with columns
    ``a``-``d``; ``get_feature_names_out`` must return an object-dtype
    ndarray equal to ``expected_names``.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"])

    column_transformer = ColumnTransformer(
        transformers, remainder=remainder, verbose_feature_names_out=False
    )
    column_transformer.fit(frame)
    feature_names = column_transformer.get_feature_names_out()

    # The container type and dtype are part of the checked contract, not only
    # the values.
    assert isinstance(feature_names, np.ndarray)
    assert feature_names.dtype == object
    assert_array_equal(feature_names, expected_names)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "transformers, remainder, colliding_columns",
    [
        # Columns selected by name.
        (
            [
                ("bycol1", TransWithNames(), ["b"]),
                ("bycol2", "passthrough", ["b"]),
            ],
            "drop",
            "['b']",
        ),
        (
            [
                ("bycol1", TransWithNames(["c", "d"]), ["c"]),
                ("bycol2", "passthrough", ["c"]),
            ],
            "drop",
            "['c']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a"]), ["b"]),
                ("bycol2", "passthrough", ["b"]),
            ],
            "passthrough",
            "['a']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a"]), ["b"]),
                ("bycol2", "drop", ["b"]),
            ],
            "passthrough",
            "['a']",
        ),
        (
            [
                ("bycol1", TransWithNames(["c", "b"]), ["b"]),
                ("bycol2", "passthrough", ["c", "b"]),
            ],
            "drop",
            "['b', 'c']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a"]), ["b"]),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["a"]), ["b"]),
            ],
            "passthrough",
            "['a']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["b"]),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        # More than five colliding names: the expected message is truncated.
        (
            [
                ("bycol1", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]),
                ("bycol2", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]),
            ],
            "passthrough",
            "['pca0', 'pca1', 'pca2', 'pca3', 'pca4', ...]",
        ),
        # Columns selected with integer slices.
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), slice(1, 2)),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["b"]),
                ("bycol2", "passthrough", slice(0, 1)),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        # Columns selected with string (label) slices.
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), slice("b", "c")),
                ("bycol2", "passthrough", ["a"]),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
        (
            [
                ("bycol1", TransWithNames(["a", "b"]), ["b"]),
                ("bycol2", "passthrough", slice("a", "a")),
                ("bycol3", TransWithNames(["b"]), ["c"]),
            ],
            "passthrough",
            "['a', 'b']",
        ),
    ],
)
def test_verbose_feature_names_out_false_errors(
    transformers, remainder, colliding_columns
):
    """Check the error raised for duplicated names with non-verbose output.

    With ``verbose_feature_names_out=False``, ``get_feature_names_out`` must
    raise a ``ValueError`` whose message lists ``colliding_columns``.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"])

    column_transformer = ColumnTransformer(
        transformers, remainder=remainder, verbose_feature_names_out=False
    )
    column_transformer.fit(frame)

    # The message contains brackets and dots, so it is escaped before being
    # used as a regular expression.
    expected_error = re.escape(
        f"Output feature names: {colliding_columns} are not unique. Please set "
        "verbose_feature_names_out=True to add prefixes to feature names"
    )
    with pytest.raises(ValueError, match=expected_error):
        column_transformer.get_feature_names_out()
|
|
|
|
|
|
@pytest.mark.parametrize("verbose_feature_names_out", [True, False])
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
def test_column_transformer_set_output(verbose_feature_names_out, remainder):
    """Check that ``set_output`` switches a fitted transformer to pandas output.

    Before ``set_output`` the result is an ndarray; afterwards ``transform``
    returns a DataFrame whose columns equal ``get_feature_names_out()`` and
    whose index is the index of the transformed frame.
    """
    pd = pytest.importorskip("pandas")
    train_frame = pd.DataFrame(
        [[1, 2, 3, 4]], columns=["a", "b", "c", "d"], index=[10]
    )
    column_transformer = ColumnTransformer(
        [("first", TransWithNames(), ["a", "c"]), ("second", TransWithNames(), ["d"])],
        remainder=remainder,
        verbose_feature_names_out=verbose_feature_names_out,
    )

    default_output = column_transformer.fit_transform(train_frame)
    assert isinstance(default_output, np.ndarray)

    column_transformer.set_output(transform="pandas")

    # A different index than the training frame, so the index check below
    # cannot pass by accident.
    test_frame = pd.DataFrame(
        [[1, 2, 3, 4]], columns=train_frame.columns, index=[20]
    )
    pandas_output = column_transformer.transform(test_frame)
    assert isinstance(pandas_output, pd.DataFrame)

    assert_array_equal(
        pandas_output.columns, column_transformer.get_feature_names_out()
    )
    assert_array_equal(pandas_output.index, test_frame.index)
|
|
|
|
|
|
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
@pytest.mark.parametrize("fit_transform", [True, False])
def test_column_transform_set_output_mixed(remainder, fit_transform):
    """Check the per-column dtypes of pandas output built from mixed inputs.

    The input frame mixes categorical, object, float, integer and nullable
    integer columns; every column present in the output must have the dtype
    listed in ``expected_dtypes``.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(
        {
            "pet": pd.Series(["dog", "cat", "snake"], dtype="category"),
            "color": pd.Series(["green", "blue", "red"], dtype="object"),
            "age": [1.4, 2.1, 4.4],
            "height": [20, 40, 10],
            "distance": pd.Series([20, pd.NA, 100], dtype="Int32"),
        }
    )
    color_encoder = OneHotEncoder(sparse_output=False, dtype="int8")
    column_transformer = ColumnTransformer(
        [
            ("color_encode", color_encoder, ["color"]),
            ("age", StandardScaler(), ["age"]),
        ],
        remainder=remainder,
        verbose_feature_names_out=False,
    ).set_output(transform="pandas")

    # Exercise both the fused and the two-step code paths.
    transformed = (
        column_transformer.fit_transform(frame)
        if fit_transform
        else column_transformer.fit(frame).transform(frame)
    )

    assert isinstance(transformed, pd.DataFrame)
    assert_array_equal(
        transformed.columns, column_transformer.get_feature_names_out()
    )

    expected_dtypes = {
        "color_blue": "int8",
        "color_green": "int8",
        "color_red": "int8",
        "age": "float64",
        "pet": "category",
        "height": "int64",
        "distance": "Int32",
    }
    # Iterate over the output columns: with remainder="drop" only a subset of
    # the expected keys is present.
    for column_name, actual_dtype in transformed.dtypes.items():
        assert actual_dtype == expected_dtypes[column_name]
|
|
|
|
|
|
@pytest.mark.parametrize("remainder", ["drop", "passthrough"])
def test_column_transform_set_output_after_fitting(remainder):
    """Check dtypes when ``set_output`` is called only after fitting.

    The default output is a float64 ndarray; once pandas output is requested
    on the already fitted transformer, each output column must have the dtype
    listed in ``expected_dtypes``.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(
        {
            "pet": pd.Series(["dog", "cat", "snake"], dtype="category"),
            "age": [1.4, 2.1, 4.4],
            "height": [20, 40, 10],
        }
    )
    pet_encoder = OneHotEncoder(sparse_output=False, dtype="int16")
    column_transformer = ColumnTransformer(
        [
            ("color_encode", pet_encoder, ["pet"]),
            ("age", StandardScaler(), ["age"]),
        ],
        remainder=remainder,
        verbose_feature_names_out=False,
    )

    # Fit before any call to set_output.
    array_output = column_transformer.fit_transform(frame)
    assert isinstance(array_output, np.ndarray)
    assert array_output.dtype == "float64"

    column_transformer.set_output(transform="pandas")
    frame_output = column_transformer.transform(frame)

    expected_dtypes = {
        "pet_cat": "int16",
        "pet_dog": "int16",
        "pet_snake": "int16",
        "height": "int64",
        "age": "float64",
    }
    for column_name, actual_dtype in frame_output.dtypes.items():
        assert actual_dtype == expected_dtypes[column_name]
|
|
|
|
|
|
class PandasOutTransformer(BaseEstimator):
    """Test transformer that requires DataFrame input.

    It does not define ``get_feature_names_out``; ``fit`` and ``transform``
    both assert that ``X`` is a pandas DataFrame.

    Parameters
    ----------
    offset : float, default=1.0
        Value subtracted from ``X`` in ``transform``.
    """

    def __init__(self, offset=1.0):
        self.offset = offset

    def fit(self, X, y=None):
        """Assert that ``X`` is a DataFrame and return ``self``."""
        pd = pytest.importorskip("pandas")
        assert isinstance(X, pd.DataFrame)
        return self

    def transform(self, X, y=None):
        """Assert that ``X`` is a DataFrame and return ``X - offset``."""
        pd = pytest.importorskip("pandas")
        assert isinstance(X, pd.DataFrame)
        shifted = X - self.offset
        return shifted

    def set_output(self, transform=None):
        """Ignore ``transform`` and return ``self``.

        This transformer always outputs a DataFrame, regardless of the
        requested output configuration.
        """
        return self
|
|
|
|
|
|
@pytest.mark.parametrize(
    "trans_1, expected_verbose_names, expected_non_verbose_names",
    [
        (
            PandasOutTransformer(offset=2.0),
            ["trans_0__feat1", "trans_1__feat0"],
            ["feat1", "feat0"],
        ),
        (
            "drop",
            ["trans_0__feat1"],
            ["feat1"],
        ),
        (
            "passthrough",
            ["trans_0__feat1", "trans_1__feat0"],
            ["feat1", "feat0"],
        ),
    ],
)
def test_transformers_with_pandas_out_but_not_feature_names_out(
    trans_1, expected_verbose_names, expected_non_verbose_names
):
    """Check pandas output with transformers lacking ``get_feature_names_out``.

    Specifically, transformers that return a DataFrame but do not define
    ``get_feature_names_out`` must still work once
    ``set_output(transform="pandas")`` is requested.
    """
    pd = pytest.importorskip("pandas")

    frame = pd.DataFrame({"feat0": [1.0, 2.0, 3.0], "feat1": [2.0, 3.0, 4.0]})
    column_transformer = ColumnTransformer(
        [
            ("trans_0", PandasOutTransformer(offset=3.0), ["feat1"]),
            ("trans_1", trans_1, ["feat0"]),
        ]
    )
    array_output = column_transformer.fit_transform(frame)
    assert isinstance(array_output, np.ndarray)

    # `PandasOutTransformer` does not define `get_feature_names_out`, so asking
    # the column transformer for names must fail.
    with pytest.raises(AttributeError, match="not provide get_feature_names_out"):
        column_transformer.get_feature_names_out()

    # verbose_feature_names_out is left at its default here, and the expected
    # column names carry the transformer-name prefix.
    column_transformer.set_output(transform="pandas")
    verbose_frame = column_transformer.fit_transform(frame)
    assert_array_equal(verbose_frame.columns, expected_verbose_names)

    column_transformer.set_params(verbose_feature_names_out=False)
    plain_frame = column_transformer.fit_transform(frame)
    assert_array_equal(plain_frame.columns, expected_non_verbose_names)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "empty_selection",
    [[], np.array([False, False]), [False, False]],
    ids=["list", "bool", "bool_int"],
)
def test_empty_selection_pandas_output(empty_selection):
    """Check pandas output when one transformer selects no column.

    Non-regression test for gh-25487
    """
    pd = pytest.importorskip("pandas")

    frame = pd.DataFrame([[1.0, 2.2], [3.0, 1.0]], columns=["a", "b"])
    column_transformer = ColumnTransformer(
        [
            ("categorical", "passthrough", empty_selection),
            ("numerical", StandardScaler(), ["a", "b"]),
        ],
        verbose_feature_names_out=True,
    )
    column_transformer.set_output(transform="pandas")

    # Only the "numerical" transformer contributes columns, with and without
    # the verbose prefix.
    verbose_frame = column_transformer.fit_transform(frame)
    assert_array_equal(verbose_frame.columns, ["numerical__a", "numerical__b"])

    column_transformer.set_params(verbose_feature_names_out=False)
    plain_frame = column_transformer.fit_transform(frame)
    assert_array_equal(plain_frame.columns, ["a", "b"])
|
|
|
|
|
|
def test_raise_error_if_index_not_aligned():
    """Check that mismatched output indexes raise a ``ValueError``.

    Non-regression test for gh-26210.
    """
    pd = pytest.importorskip("pandas")

    frame = pd.DataFrame(
        [[1.0, 2.2], [3.0, 1.0]], columns=["a", "b"], index=[8, 3]
    )
    # This transformer resets the index of its output, while the passthrough
    # column keeps the original [8, 3] index.
    reset_index_transformer = FunctionTransformer(
        lambda x: x.reset_index(drop=True), feature_names_out="one-to-one"
    )
    column_transformer = ColumnTransformer(
        [
            ("num1", "passthrough", ["a"]),
            ("num2", reset_index_transformer, ["b"]),
        ],
    )
    column_transformer.set_output(transform="pandas")

    expected_error = (
        "Concatenating DataFrames from the transformer's output lead to"
        " an inconsistent number of samples. The output may have Pandas"
        " Indexes that do not match."
    )
    with pytest.raises(ValueError, match=expected_error):
        column_transformer.fit_transform(frame)
|
|
|
|
|
|
def test_remainder_set_output():
    """Check that the output configuration also applies to the remainder.

    Non-regression test for #26306.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"a": [True, False, True], "b": [1, 2, 3]})

    bool_columns = make_column_selector(dtype_include=bool)
    column_transformer = make_column_transformer(
        (VarianceThreshold(), bool_columns),
        remainder=VarianceThreshold(),
        verbose_feature_names_out=False,
    )

    # With pandas output, the result must equal the input frame.
    column_transformer.set_output(transform="pandas")
    pandas_result = column_transformer.fit_transform(frame)
    pd.testing.assert_frame_equal(pandas_result, frame)

    # Switching back to the default output yields an ndarray again.
    column_transformer.set_output(transform="default")
    default_result = column_transformer.fit_transform(frame)
    assert isinstance(default_result, np.ndarray)
|
|
|
|
|
|
# TODO(1.6): replace the warning by a ValueError exception
def test_transform_pd_na():
    """Check behavior when a transformer's output contains pandas.NA.

    It should emit a warning unless the output config is set to 'pandas'.
    """
    pd = pytest.importorskip("pandas")
    if not hasattr(pd, "Float64Dtype"):
        pytest.skip(
            "The issue with pd.NA tested here does not happen in old versions that do"
            " not have the extension dtypes"
        )

    numpy_dtype_frame = pd.DataFrame({"a": [1.5, None]})
    column_transformer = make_column_transformer(("passthrough", ["a"]))

    # Non-extension dtype with a missing value: no warning may be emitted
    # (any warning is turned into an error inside the block).
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        column_transformer.fit_transform(numpy_dtype_frame)

    extension_dtype_frame = numpy_dtype_frame.convert_dtypes()

    # Extension dtype with pd.NA and default output: a FutureWarning pointing
    # to set_output(transform='pandas') is expected.
    with pytest.warns(FutureWarning, match=r"set_output\(transform='pandas'\)"):
        column_transformer.fit_transform(extension_dtype_frame)

    # Pandas output: no warning.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        column_transformer.set_output(transform="pandas")
        column_transformer.fit_transform(extension_dtype_frame)

    column_transformer.set_output(transform="default")

    # Default output but no pd.NA left in the data: no warning.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        column_transformer.fit_transform(extension_dtype_frame.fillna(-1.0))
|
|
|
|
|
|
def test_dataframe_different_dataframe_libraries():
    """Check fitting on one dataframe library and transforming on another.

    Covers pandas -> polars and polars -> pandas.
    """
    pd = pytest.importorskip("pandas")
    pl = pytest.importorskip("polars")

    train_values = np.array([[0, 1], [2, 4], [4, 5]])
    test_values = np.array([[1, 2], [1, 3], [2, 3]])
    column_names = ["a", "b"]

    column_transformer = make_column_transformer((Trans(), [0, 1]))

    # Fit on pandas, transform polars.
    column_transformer.fit(pd.DataFrame(train_values, columns=column_names))
    polars_test = pl.DataFrame(test_values, schema=column_names)
    assert_array_equal(column_transformer.transform(polars_test), test_values)

    # Fit on polars, transform pandas.
    polars_train = pl.DataFrame(train_values, schema=column_names)
    pandas_test = pd.DataFrame(test_values, columns=column_names)
    column_transformer.fit(polars_train)
    assert_array_equal(column_transformer.transform(pandas_test), test_values)
|
|
|
|
|
|
def test_column_transformer__getitem__():
    """Check name-based indexing (``ct[name]``) on ``ColumnTransformer``.

    Indexing raises ``TypeError`` before fitting, returns the entries of
    ``named_transformers_`` after fitting, and raises ``KeyError`` for an
    unknown name.
    """
    data = np.array([[0, 1, 2], [3, 4, 5]])
    column_transformer = ColumnTransformer(
        [("t1", Trans(), [0, 1]), ("t2", Trans(), [1, 2])]
    )

    with pytest.raises(
        TypeError, match="ColumnTransformer is subscriptable after it is fitted"
    ):
        column_transformer["t1"]

    column_transformer.fit(data)
    fitted_by_name = column_transformer.named_transformers_
    assert column_transformer["t1"] is fitted_by_name["t1"]
    assert column_transformer["t2"] is fitted_by_name["t2"]

    with pytest.raises(
        KeyError, match="'does_not_exist' is not a valid transformer name"
    ):
        column_transformer["does_not_exist"]
|
|
|
|
|
|
@pytest.mark.parametrize("transform_output", ["default", "pandas"])
def test_column_transformer_remainder_passthrough_naming_consistency(transform_output):
    """Check that when `remainder="passthrough"`, inconsistent naming is handled
    correctly by the underlying `FunctionTransformer`.

    The input DataFrame has integer column labels; the output feature names
    are expected to be the prefixed ``x0``..``x3`` names for both the scaled
    and the remainder columns.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28232
    """
    pd = pytest.importorskip("pandas")
    # Use a seeded, local RNG instead of the global `np.random` state so the
    # data is reproducible and the test neither depends on nor perturbs the
    # global random state shared with other tests.
    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.randn(10, 4))

    preprocessor = ColumnTransformer(
        transformers=[("scaler", StandardScaler(), [0, 1])],
        remainder="passthrough",
    ).set_output(transform=transform_output)
    X_trans = preprocessor.fit_transform(X)
    assert X_trans.shape == X.shape

    expected_column_names = [
        "scaler__x0",
        "scaler__x1",
        "remainder__x2",
        "remainder__x3",
    ]
    # Only the pandas output carries column labels; the ndarray output is
    # covered by the `get_feature_names_out` check below.
    if hasattr(X_trans, "columns"):
        assert X_trans.columns.tolist() == expected_column_names
    assert preprocessor.get_feature_names_out().tolist() == expected_column_names
|
|
|
|
|
|
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_column_transformer_column_renaming(dataframe_lib):
    """Check that output columns are prefixed when transformers share inputs.

    Several transformers select overlapping columns; with verbose feature
    names each output column must carry its transformer's name as a prefix.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28260
    """
    lib = pytest.importorskip(dataframe_lib)

    frame = lib.DataFrame(
        {"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]}
    )
    every_column = ["x1", "x2", "x3"]

    column_transformer = ColumnTransformer(
        transformers=[
            ("A", "passthrough", every_column),
            ("B", FunctionTransformer(), ["x1", "x2"]),
            ("C", StandardScaler(), ["x1", "x3"]),
            # special case of empty transformer
            ("D", FunctionTransformer(lambda x: x[[]]), every_column),
        ],
        verbose_feature_names_out=True,
    ).set_output(transform=dataframe_lib)

    transformed = column_transformer.fit_transform(frame)

    # Transformer "D" selects no column and contributes no name.
    expected_columns = [
        "A__x1",
        "A__x2",
        "A__x3",
        "B__x1",
        "B__x2",
        "C__x1",
        "C__x3",
    ]
    assert list(transformed.columns) == expected_columns
|
|
|
|
|
|
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_column_transformer_error_with_duplicated_columns(dataframe_lib):
    """Check the error raised when output column names are duplicated.

    Same setup as the column-renaming test, but with
    ``verbose_feature_names_out=False``, so the overlapping selections
    produce clashing names and ``fit_transform`` must raise a ``ValueError``.
    """
    lib = pytest.importorskip(dataframe_lib)

    frame = lib.DataFrame(
        {"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]}
    )
    every_column = ["x1", "x2", "x3"]

    column_transformer = ColumnTransformer(
        transformers=[
            ("A", "passthrough", every_column),
            ("B", FunctionTransformer(), ["x1", "x2"]),
            ("C", StandardScaler(), ["x1", "x3"]),
            # special case of empty transformer
            ("D", FunctionTransformer(lambda x: x[[]]), every_column),
        ],
        verbose_feature_names_out=False,
    ).set_output(transform=dataframe_lib)

    # The expected message lists the duplicated names, then one line per
    # transformer with conflicting columns.
    expected_error = re.escape(
        "Duplicated feature names found before concatenating the outputs of the "
        "transformers: ['x1', 'x2', 'x3'].\n"
        "Transformer A has conflicting columns names: ['x1', 'x2', 'x3'].\n"
        "Transformer B has conflicting columns names: ['x1', 'x2'].\n"
        "Transformer C has conflicting columns names: ['x1', 'x3'].\n"
    )
    with pytest.raises(ValueError, match=expected_error):
        column_transformer.fit_transform(frame)
|
|
|
|
|
|
@pytest.mark.skipif(
    parse_version(joblib.__version__) < parse_version("1.3"),
    reason="requires joblib >= 1.3",
)
def test_column_transformer_auto_memmap():
    """Check that ColumnTransformer works in parallel with joblib's auto-memmapping.

    non-regression test for issue #28781
    """
    data = np.random.RandomState(0).uniform(size=(3, 4))

    in_place_scaler = StandardScaler(copy=False)
    column_transformer = ColumnTransformer(
        transformers=[("scaler", in_place_scaler, [0])],
        n_jobs=2,
    )

    # Run under the loky backend with max_nbytes=1.
    with joblib.parallel_backend("loky", max_nbytes=1):
        parallel_result = column_transformer.fit_transform(data)

    reference_result = StandardScaler().fit_transform(data[:, [0]])
    assert_allclose(parallel_result, reference_result)
|
|
|
|
|
|
# Metadata Routing Tests
|
|
# ======================
|
|
|
|
|
|
@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"])
def test_routing_passed_metadata_not_supported(method):
    """Check the error raised when metadata is passed with routing disabled.

    Passing metadata to ``method`` must raise a ``ValueError`` stating that
    this is only supported if ``enable_metadata_routing=True``.
    """
    features = np.array([[0, 1, 2], [2, 4, 6]]).T
    targets = [1, 2, 3]
    fitted = ColumnTransformer([("trans", Trans(), [0])]).fit(features, targets)

    expected_error = "is only supported if enable_metadata_routing=True"
    with pytest.raises(ValueError, match=expected_error):
        method_under_test = getattr(fitted, method)
        method_under_test([[1]], sample_weight=[1], prop="a")
|
|
|
|
|
|
@pytest.mark.usefixtures("enable_slep006")
@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"])
def test_metadata_routing_for_column_transformer(method):
    """Check that metadata passed to ``method`` reaches the sub-transformer.

    The consuming transformer requests ``sample_weight`` and ``metadata`` for
    both fit and transform; each object recorded in the registry is then
    checked with ``check_recorded_metadata``.
    """
    features = np.array([[0, 1, 2], [2, 4, 6]]).T
    targets = [1, 2, 3]
    registry = _Registry()
    sample_weight, metadata = [1], "a"

    consumer = (
        ConsumingTransformer(registry=registry)
        .set_fit_request(sample_weight=True, metadata=True)
        .set_transform_request(sample_weight=True, metadata=True)
    )
    column_transformer = ColumnTransformer([("trans", consumer, [0])])

    if method != "transform":
        getattr(column_transformer, method)(
            features, targets, sample_weight=sample_weight, metadata=metadata
        )
    else:
        # `transform` needs a fitted estimator; the metadata is only passed to
        # the `transform` call itself.
        column_transformer.fit(features, targets)
        column_transformer.transform(
            features, sample_weight=sample_weight, metadata=metadata
        )

    # Guard against a vacuous loop below.
    assert len(registry)
    for recorded in registry:
        check_recorded_metadata(
            obj=recorded,
            method=method,
            sample_weight=sample_weight,
            metadata=metadata,
        )
|
|
|
|
|
|
@pytest.mark.usefixtures("enable_slep006")
def test_metadata_routing_no_fit_transform():
    """Check metadata routing for a sub-estimator without ``fit_transform``.

    The helper estimator only defines ``fit`` and ``transform``, and each of
    them asserts that it received truthy ``sample_weight`` and ``metadata``.
    """

    class NoFitTransform(BaseEstimator):
        """Estimator with ``fit`` and ``transform`` but no ``fit_transform``."""

        def fit(self, X, y=None, sample_weight=None, metadata=None):
            assert sample_weight
            assert metadata
            return self

        def transform(self, X, sample_weight=None, metadata=None):
            assert sample_weight
            assert metadata
            return X

    features = np.array([[0, 1, 2], [2, 4, 6]]).T
    targets = [1, 2, 3]
    sample_weight, metadata = [1], "a"

    estimator = (
        NoFitTransform()
        .set_fit_request(sample_weight=True, metadata=True)
        .set_transform_request(sample_weight=True, metadata=True)
    )
    column_transformer = ColumnTransformer([("trans", estimator, [0])])

    # Both entry points are called with the same metadata.
    column_transformer.fit(
        features, targets, sample_weight=sample_weight, metadata=metadata
    )
    column_transformer.fit_transform(
        features, targets, sample_weight=sample_weight, metadata=metadata
    )
|
|
|
|
|
|
@pytest.mark.usefixtures("enable_slep006")
@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"])
def test_metadata_routing_error_for_column_transformer(method):
    """Check the error raised when passed metadata was never requested.

    The consuming transformer sets no request, so passing ``sample_weight``
    and ``metadata`` must raise a ``ValueError`` naming ``method``.
    """
    features = np.array([[0, 1, 2], [2, 4, 6]]).T
    targets = [1, 2, 3]
    sample_weight, metadata = [1], "a"
    column_transformer = ColumnTransformer([("trans", ConsumingTransformer(), [0])])

    error_message = (
        "[sample_weight, metadata] are passed but are not explicitly set as requested"
        f" or not requested for ConsumingTransformer.{method}"
    )
    # The message contains brackets and dots, so it is escaped before being
    # used as a regular expression.
    with pytest.raises(ValueError, match=re.escape(error_message)):
        if method != "transform":
            getattr(column_transformer, method)(
                features, targets, sample_weight=sample_weight, metadata=metadata
            )
        else:
            column_transformer.fit(features, targets)
            column_transformer.transform(
                features, sample_weight=sample_weight, metadata=metadata
            )
|
|
|
|
|
|
@pytest.mark.usefixtures("enable_slep006")
def test_get_metadata_routing_works_without_fit():
    """Check that ``get_metadata_routing`` can be called before ``fit``.

    Regression test for
    https://github.com/scikit-learn/scikit-learn/issues/28186
    """
    unfitted = ColumnTransformer([("trans", ConsumingTransformer(), [0])])
    # The call only has to complete without raising.
    unfitted.get_metadata_routing()
|
|
|
|
|
|
@pytest.mark.usefixtures("enable_slep006")
def test_remainder_request_always_present():
    """Check that the remainder's metadata request is part of the router."""
    remainder_estimator = (
        ConsumingTransformer()
        .set_fit_request(metadata=True)
        .set_transform_request(metadata=True)
    )
    column_transformer = ColumnTransformer(
        [("trans", StandardScaler(), [0])],
        remainder=remainder_estimator,
    )
    router = column_transformer.get_metadata_routing()
    assert router.consumes("fit", ["metadata"]) == {"metadata"}
|
|
|
|
|
|
@pytest.mark.usefixtures("enable_slep006")
def test_unused_transformer_request_present():
    """Check that an unused transformer's request is part of the router.

    The transformer's column selector is a callable returning an empty list,
    so no column is selected for it.
    """
    unused_transformer = (
        ConsumingTransformer()
        .set_fit_request(metadata=True)
        .set_transform_request(metadata=True)
    )
    column_transformer = ColumnTransformer(
        [("trans", unused_transformer, lambda X: [])]
    )
    router = column_transformer.get_metadata_routing()
    assert router.consumes("fit", ["metadata"]) == {"metadata"}
|
|
|
|
|
|
# End of Metadata Routing Tests
|
|
# =============================
|