projektAI/venv/Lib/site-packages/sklearn/utils/tests/test_utils.py

696 lines
24 KiB
Python
Raw Normal View History

2021-06-06 22:13:05 +02:00
from copy import copy
from itertools import chain
import warnings
import string
import timeit
import pytest
import numpy as np
import scipy.sparse as sp
from sklearn.utils._testing import (assert_array_equal,
assert_allclose_dense_sparse,
assert_warns_message,
assert_no_warnings,
_convert_container)
from sklearn.utils import check_random_state
from sklearn.utils import _determine_key_type
from sklearn.utils import deprecated
from sklearn.utils import gen_batches
from sklearn.utils import _get_column_indices
from sklearn.utils import resample
from sklearn.utils import safe_mask
from sklearn.utils import column_or_1d
from sklearn.utils import _safe_indexing
from sklearn.utils import shuffle
from sklearn.utils import gen_even_slices
from sklearn.utils import _message_with_time, _print_elapsed_time
from sklearn.utils import get_chunk_n_rows
from sklearn.utils import is_scalar_nan
from sklearn.utils import _to_object_array
from sklearn.utils._mocking import MockDataFrame
from sklearn import config_context
# Small 3x3 integer fixture (values 0..8, row-major) shared by several tests.
X_toy = np.arange(9).reshape(3, 3)
def test_make_rng():
    """Exercise ``check_random_state`` with each kind of seed."""
    global_rng = np.random.mtrand._rand

    # ``None`` and the ``np.random`` module both resolve to numpy's global
    # RandomState singleton.
    for seed in (None, np.random):
        assert check_random_state(seed) is global_rng

    # An integer seed gives the same first draw as RandomState(seed).
    reference = np.random.RandomState(42)
    assert check_random_state(42).randint(100) == reference.randint(100)

    # An existing RandomState instance is handed back unchanged.
    instance = np.random.RandomState(42)
    assert check_random_state(instance) is instance

    # A different integer seed gives a different first draw.
    reference = np.random.RandomState(42)
    assert check_random_state(43).randint(100) != reference.randint(100)

    # Any other kind of seed is rejected.
    with pytest.raises(ValueError):
        check_random_state("some invalid seed")
def test_gen_batches():
    """Check ``gen_batches`` output and its errors on invalid batch_size."""
    expected_slices = [slice(0, 2, None), slice(2, 4, None)]
    assert_array_equal(list(gen_batches(4, 2)), expected_slices)

    # A zero batch size is a ValueError ...
    zero_message = "gen_batches got batch_size=0, must be positive"
    zero_batches = gen_batches(4, 0)
    with pytest.raises(ValueError, match=zero_message):
        next(zero_batches)

    # ... while a non-integer batch size is a TypeError.
    float_message = "gen_batches got batch_size=0.5, must be an integer"
    float_batches = gen_batches(4, 0.5)
    with pytest.raises(TypeError, match=float_message):
        next(float_batches)
def test_deprecated():
    """Check that ``deprecated`` warns for both functions and classes.

    Adapted from the example in
    https://docs.python.org/library/warnings.html
    """
    def _assert_single_deprecation(records):
        # exactly one FutureWarning whose text mentions the deprecation
        assert len(records) == 1
        only = records[0]
        assert issubclass(only.category, FutureWarning)
        assert "deprecated" in str(only.message).lower()

    # Case 1: a decorated function.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")

        @deprecated()
        def ham():
            return "spam"

        # the function must remain usable
        assert ham() == "spam"
        _assert_single_deprecation(caught)

    # Case 2: a decorated class.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")

        @deprecated("don't use this")
        class Ham:
            SPAM = 1

        instance = Ham()
        assert hasattr(instance, "SPAM")
        _assert_single_deprecation(caught)
def test_resample():
    """Cover the edge cases and argument validation of ``resample``."""
    # Border case not worth mentioning in doctests: no input gives None.
    assert resample() is None

    # Invalid argument combinations must raise ValueError.
    invalid_calls = [
        (([0], [0, 1]), {}),
        (([0, 1], [0, 1]), {"replace": False, "n_samples": 3}),
    ]
    for args, kwargs in invalid_calls:
        with pytest.raises(ValueError):
            resample(*args, **kwargs)

    # Issue:6581, n_samples can exceed the input length when replace is left
    # at its default.
    oversampled = resample([1, 2], n_samples=5)
    assert len(oversampled) == 5
def test_resample_stratified():
    """Check that ``stratify`` changes the class mix of the resampled y."""
    rng = np.random.RandomState(0)
    n_samples, p = 100, .9

    X = rng.normal(size=(n_samples, 1))
    y = rng.binomial(1, p, size=n_samples)
    shared_kwargs = dict(n_samples=10, random_state=0)

    # Without stratification this seed draws only the majority class.
    _, y_not_stratified = resample(X, y, stratify=None, **shared_kwargs)
    assert np.all(y_not_stratified == 1)

    # With stratification the draw contains nine 1s and a single 0.
    _, y_stratified = resample(X, y, stratify=y, **shared_kwargs)
    assert not np.all(y_stratified == 1)
    assert np.sum(y_stratified) == 9
def test_resample_stratified_replace():
    """Check that stratified resampling honours the ``replace`` parameter."""
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=n_samples)

    def n_distinct(values):
        return np.unique(values).shape[0]

    # The same RandomState instance is reused across calls, so their order
    # matters.
    half = dict(n_samples=50, random_state=rng, stratify=y)
    X_replace, _ = resample(X, y, replace=True, **half)
    X_no_replace, _ = resample(X, y, replace=False, **half)
    assert n_distinct(X_replace) < 50
    assert n_distinct(X_no_replace) == 50

    # n_samples may exceed X.shape[0] when sampling with replacement
    X_replace, _ = resample(X, y, replace=True, n_samples=1000,
                            random_state=rng, stratify=y)
    assert X_replace.shape[0] == 1000
    assert n_distinct(X_replace) == 100
def test_resample_stratify_2dy():
    """Check that a 2D ``y`` can be used for stratification."""
    n_samples = 100
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=(n_samples, 2))

    X_sub, y_sub = resample(
        X, y, n_samples=50, random_state=rng, stratify=y
    )

    # the resampled target keeps its two dimensions
    assert y_sub.ndim == 2
def test_resample_stratify_sparse_error():
    """Check that a sparse ``stratify`` argument raises ``TypeError``."""
    n_samples = 100
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, 2))
    y = rng.randint(0, 2, size=n_samples)

    # a sparse matrix is not accepted for ``stratify``
    sparse_stratify = sp.csr_matrix(y)
    expected_message = 'A sparse matrix was passed'
    with pytest.raises(TypeError, match=expected_message):
        resample(X, y, n_samples=50, random_state=rng,
                 stratify=sparse_stratify)
def test_safe_mask():
    """Check that ``safe_mask`` output indexes dense and CSR inputs alike."""
    rng = check_random_state(0)
    X_dense = rng.rand(5, 4)
    X_sparse = sp.csr_matrix(X_dense)
    raw_mask = [False, False, True, True, True]

    # three entries are selected, so three rows must come back
    dense_mask = safe_mask(X_dense, raw_mask)
    assert X_dense[dense_mask].shape[0] == 3

    # the mask prepared for the dense input is passed on to the sparse call
    sparse_mask = safe_mask(X_sparse, dense_mask)
    assert X_sparse[sparse_mask].shape[0] == 3
def test_column_or_1d():
    """Check which target layouts ``column_or_1d`` accepts or rejects.

    Targets labelled binary, multiclass or continuous must come back equal to
    ``np.ravel(y)``; every other labelled layout must raise ``ValueError``.
    """
    # (label, y) pairs; the label only selects which branch below is checked.
    EXAMPLES = [
        ("binary", ["spam", "egg", "spam"]),
        ("binary", [0, 1, 0, 1]),
        ("continuous", np.arange(10) / 20.),
        ("multiclass", [1, 2, 3]),
        ("multiclass", [0, 1, 2, 2, 0]),
        ("multiclass", [[1], [2], [3]]),
        ("multilabel-indicator", [[0, 1, 0], [0, 0, 1]]),
        ("multiclass-multioutput", [[1, 2, 3]]),
        ("multiclass-multioutput", [[1, 1], [2, 2], [3, 1]]),
        ("multiclass-multioutput", [[5, 1], [4, 2], [3, 1]]),
        ("continuous-multioutput", np.arange(30).reshape((-1, 3))),
    ]
    ravelable_types = ("binary", "multiclass", "continuous")

    for y_type, y in EXAMPLES:
        if y_type in ravelable_types:
            assert_array_equal(column_or_1d(y), np.ravel(y))
        else:
            with pytest.raises(ValueError):
                column_or_1d(y)
@pytest.mark.parametrize(
    "key, dtype",
    [
        (0, 'int'),
        ('0', 'str'),
        (True, 'bool'),
        (np.bool_(True), 'bool'),
        ([0, 1, 2], 'int'),
        (['0', '1', '2'], 'str'),
        ((0, 1, 2), 'int'),
        (('0', '1', '2'), 'str'),
        (slice(None, None), None),
        (slice(0, 2), 'int'),
        (np.array([0, 1, 2], dtype=np.int32), 'int'),
        (np.array([0, 1, 2], dtype=np.int64), 'int'),
        (np.array([0, 1, 2], dtype=np.uint8), 'int'),
        ([True, False], 'bool'),
        ((True, False), 'bool'),
        (np.array([True, False]), 'bool'),
        ('col_0', 'str'),
        (['col_0', 'col_1', 'col_2'], 'str'),
        (('col_0', 'col_1', 'col_2'), 'str'),
        (slice('begin', 'end'), 'str'),
        (np.array(['col_0', 'col_1', 'col_2']), 'str'),
        (np.array(['col_0', 'col_1', 'col_2'], dtype=object), 'str'),
    ]
)
def test_determine_key_type(key, dtype):
    """Check the key kind reported by ``_determine_key_type``."""
    inferred = _determine_key_type(key)
    assert inferred == dtype
def test_determine_key_type_error():
    """A float key must be rejected with a ``ValueError``."""
    expected_message = "No valid specification of the"
    with pytest.raises(ValueError, match=expected_message):
        _determine_key_type(1.0)
def test_determine_key_type_slice_error():
    """A slice key must raise ``TypeError`` when ``accept_slice=False``."""
    expected_message = "Only array-like or scalar are"
    key = slice(0, 2, 1)
    with pytest.raises(TypeError, match=expected_message):
        _determine_key_type(key, accept_slice=False)
@pytest.mark.parametrize(
    "array_type", ["list", "array", "sparse", "dataframe"]
)
@pytest.mark.parametrize(
    "indices_type", ["list", "tuple", "array", "series", "slice"]
)
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
    """Select the last two rows of a 3x3 container along axis 0."""
    # For the 'slice' case the second bound is bumped by one (presumably
    # because it is used as an exclusive stop -- confirm in
    # ``_convert_container``).
    last = 3 if indices_type == 'slice' else 2
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

    array = _convert_container(rows, array_type)
    indices = _convert_container([1, last], indices_type)

    subset = _safe_indexing(array, indices, axis=0)
    expected = _convert_container(rows[1:], array_type)
    assert_allclose_dense_sparse(subset, expected)
@pytest.mark.parametrize("array_type", ["list", "array", "series"])
@pytest.mark.parametrize(
    "indices_type", ["list", "tuple", "array", "series", "slice"]
)
def test_safe_indexing_1d_container(array_type, indices_type):
    """Select the second and third items of a 1D container."""
    # For the 'slice' case the second bound is bumped by one (presumably
    # because it is used as an exclusive stop -- confirm in
    # ``_convert_container``).
    last = 3 if indices_type == 'slice' else 2

    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = _convert_container([1, last], indices_type)

    subset = _safe_indexing(array, indices, axis=0)
    expected = _convert_container([2, 3], array_type)
    assert_allclose_dense_sparse(subset, expected)
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
@pytest.mark.parametrize(
    "indices_type", ["list", "tuple", "array", "series", "slice"]
)
@pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]])
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
    """Select the last two columns by position or by column name."""
    # ``indices`` is a mutable object shared between parametrized runs, so
    # only a private copy is ever modified.
    selection = list(indices)
    if indices_type == 'slice' and isinstance(indices[1], int):
        selection[1] += 1
    selection = None if selection is None else selection

    columns_name = ['col_0', 'col_1', 'col_2']
    array = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
    )
    selection = _convert_container(selection, indices_type)

    string_keys_unsupported = (
        isinstance(indices[0], str) and array_type != 'dataframe'
    )
    if string_keys_unsupported:
        # string keys are expected to fail for anything but a dataframe
        err_msg = ("Specifying the columns using strings is only supported "
                   "for pandas DataFrames")
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, selection, axis=1)
        return

    subset = _safe_indexing(array, selection, axis=1)
    expected = _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
    assert_allclose_dense_sparse(subset, expected)
@pytest.mark.parametrize("array_read_only", [True, False])
@pytest.mark.parametrize("indices_read_only", [True, False])
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
@pytest.mark.parametrize("indices_type", ["array", "series"])
@pytest.mark.parametrize(
    "axis, expected_array",
    [
        (0, [[4, 5, 6], [7, 8, 9]]),
        (1, [[2, 3], [5, 6], [8, 9]]),
    ]
)
def test_safe_indexing_2d_read_only_axis_1(array_read_only, indices_read_only,
                                           array_type, indices_type, axis,
                                           expected_array):
    """Index with read-only data and/or read-only indices.

    Despite the name, the parametrization covers both axis 0 and axis 1.
    """
    data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    if array_read_only:
        data.flags.writeable = False
    array = _convert_container(data, array_type)

    positions = np.array([1, 2])
    if indices_read_only:
        positions.flags.writeable = False
    indices = _convert_container(positions, indices_type)

    subset = _safe_indexing(array, indices, axis=axis)
    expected = _convert_container(expected_array, array_type)
    assert_allclose_dense_sparse(subset, expected)
@pytest.mark.parametrize("array_type", ["list", "array", "series"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
def test_safe_indexing_1d_container_mask(array_type, indices_type):
    """Select the second and third items of a 1D container with a mask."""
    # boolean mask of length 9 that is True only at positions 1 and 2
    mask = [position in (1, 2) for position in range(9)]

    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = _convert_container(mask, indices_type)

    subset = _safe_indexing(array, indices, axis=0)
    expected = _convert_container([2, 3], array_type)
    assert_allclose_dense_sparse(subset, expected)
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
@pytest.mark.parametrize(
    "axis, expected_subset",
    [
        (0, [[4, 5, 6], [7, 8, 9]]),
        (1, [[2, 3], [5, 6], [8, 9]]),
    ]
)
def test_safe_indexing_2d_mask(array_type, indices_type, axis,
                               expected_subset):
    """Select the last two rows or columns of a 2D container with a mask."""
    columns_name = ['col_0', 'col_1', 'col_2']
    array = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
    )
    mask = _convert_container([False, True, True], indices_type)

    subset = _safe_indexing(array, mask, axis=axis)
    expected = _convert_container(expected_subset, array_type)
    assert_allclose_dense_sparse(subset, expected)
@pytest.mark.parametrize(
    "array_type, expected_output_type",
    [
        ("list", "list"),
        ("array", "array"),
        ("sparse", "sparse"),
        ("dataframe", "series"),
    ]
)
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
    """Select the last row of a 2D container with a scalar index."""
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    array = _convert_container(rows, array_type)

    subset = _safe_indexing(array, 2, axis=0)

    expected_array = _convert_container([7, 8, 9], expected_output_type)
    assert_allclose_dense_sparse(subset, expected_array)
@pytest.mark.parametrize("array_type", ["list", "array", "series"])
def test_safe_indexing_1d_scalar(array_type):
    """Select a single item of a 1D container with a scalar index."""
    values = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    array = _convert_container(values, array_type)

    # position 2 holds the value 3
    picked = _safe_indexing(array, 2, axis=0)
    assert picked == 3
@pytest.mark.parametrize(
    "array_type, expected_output_type",
    [
        ("array", "array"),
        ("sparse", "sparse"),
        ("dataframe", "series"),
    ]
)
@pytest.mark.parametrize("indices", [2, "col_2"])
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type,
                                        indices):
    """Select the last column with a scalar position or column name."""
    columns_name = ['col_0', 'col_1', 'col_2']
    array = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
    )

    if isinstance(indices, str) and array_type != 'dataframe':
        # a string key is expected to fail for anything but a dataframe
        err_msg = ("Specifying the columns using strings is only supported "
                   "for pandas DataFrames")
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, indices, axis=1)
        return

    subset = _safe_indexing(array, indices, axis=1)

    # for the sparse case the expected result keeps a 2D column shape
    if expected_output_type == 'sparse':
        expected_output = [[3], [6], [9]]
    else:
        expected_output = [3, 6, 9]
    expected_array = _convert_container(
        expected_output, expected_output_type
    )
    assert_allclose_dense_sparse(subset, expected_array)
@pytest.mark.parametrize("array_type", ["list", "array", "sparse"])
def test_safe_indexing_None_axis_0(array_type):
    """Indexing with ``None`` along axis 0 must give back equal content."""
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    X = _convert_container(rows, array_type)

    X_subset = _safe_indexing(X, None, axis=0)

    assert_allclose_dense_sparse(X_subset, X)
def test_safe_indexing_pandas_no_matching_cols_error():
    """A float column key on a DataFrame must raise ``ValueError``."""
    pd = pytest.importorskip('pandas')
    frame = pd.DataFrame(X_toy)
    float_key = [1.0]

    err_msg = "No valid specification of the columns."
    with pytest.raises(ValueError, match=err_msg):
        _safe_indexing(frame, float_key, axis=1)
@pytest.mark.parametrize("axis", [None, 3])
def test_safe_indexing_error_axis(axis):
    """An ``axis`` of ``None`` or 3 must raise ``ValueError``."""
    expected_message = "'axis' should be either 0"
    with pytest.raises(ValueError, match=expected_message):
        _safe_indexing(X_toy, [0, 1], axis=axis)
@pytest.mark.parametrize("X_constructor", ['array', 'series'])
def test_safe_indexing_1d_array_error(X_constructor):
    """Indexing a 1D container along axis 1 must raise ``ValueError``."""
    values = list(range(5))

    # Build the 1D container named by the parameter; any other value is
    # passed through untouched.
    container = X_constructor
    if X_constructor == 'array':
        container = np.asarray(values)
    elif X_constructor == 'series':
        pd = pytest.importorskip("pandas")
        container = pd.Series(values)

    err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or pandas"
    with pytest.raises(ValueError, match=err_msg):
        _safe_indexing(container, [0, 1], axis=1)
def test_safe_indexing_container_axis_0_unsupported_type():
    """String keys along axis 0 must raise ``ValueError``."""
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    string_keys = ["col_1", "col_2"]

    err_msg = "String indexing is not supported with 'axis=0'"
    with pytest.raises(ValueError, match=err_msg):
        _safe_indexing(rows, string_keys, axis=0)
@pytest.mark.parametrize(
    "key, err_msg",
    [
        (10, r"all features must be in \[0, 2\]"),
        ('whatever', 'A given column is not a column of the dataframe'),
    ]
)
def test_get_column_indices_error(key, err_msg):
    """An out-of-range position or unknown name must raise ``ValueError``."""
    pd = pytest.importorskip("pandas")
    column_names = ['col_0', 'col_1', 'col_2']
    X_df = pd.DataFrame(X_toy, columns=column_names)

    with pytest.raises(ValueError, match=err_msg):
        _get_column_indices(X_df, key)
@pytest.mark.parametrize(
    "key",
    [
        ['col1'],
        ['col2'],
        ['col1', 'col2'],
        ['col1', 'col3'],
        ['col2', 'col3'],
    ]
)
def test_get_column_indices_pandas_nonunique_columns_error(key):
    """Selecting a duplicated column name must raise ``ValueError``."""
    pd = pytest.importorskip('pandas')

    # 'col1' and 'col2' each appear twice in the header
    X = pd.DataFrame(
        np.zeros((1, 5), dtype=int),
        columns=['col1', 'col1', 'col2', 'col3', 'col2'],
    )

    with pytest.raises(ValueError) as exc_info:
        _get_column_indices(X, key)

    # the message is compared in full, not matched as a pattern
    err_msg = "Selected columns, {}, are not unique in dataframe".format(key)
    assert str(exc_info.value) == err_msg
def test_shuffle_on_ndim_equals_three():
    """Check that ``shuffle`` accepts a 3D array and keeps its sub-arrays.

    The assertions are made on the array returned by ``shuffle``; checking
    the input instead would pass trivially if the input is left untouched.
    """
    def to_tuple(A):    # to make the inner arrays hashable
        return tuple(tuple(tuple(C) for C in B) for B in A)

    A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # A.shape = (2,2,2)
    S = set(to_tuple(A))

    A_shuffled = shuffle(A)  # shouldn't raise a ValueError for dim = 3

    # shuffling along the first axis may reorder the 2x2 blocks but must not
    # add, drop or alter any of them
    assert A_shuffled.shape == A.shape
    assert set(to_tuple(A_shuffled)) == S
def test_shuffle_dont_convert_to_array():
    """Check that ``shuffle`` keeps the container type of each input.

    Lists stay lists, object arrays keep their dtype, ``MockDataFrame`` stays
    a ``MockDataFrame`` and the sparse input is still sparse: nothing is
    converted to a float numpy array and any indexable data structure is
    passed through.
    """
    a = ['a', 'b', 'c']
    b = np.array(['a', 'b', 'c'], dtype=object)
    c = [1, 2, 3]
    d = MockDataFrame(np.array([['a', 0],
                                ['b', 1],
                                ['c', 2]],
                               dtype=object))
    e = sp.csc_matrix(np.arange(6).reshape(3, 2))
    a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)

    # With random_state=0 the three rows come back in reversed order.
    assert a_s == ['c', 'b', 'a']
    # exact type wanted here (not a subclass), hence ``is`` over isinstance
    assert type(a_s) is list

    assert_array_equal(b_s, ['c', 'b', 'a'])
    assert b_s.dtype == object

    assert c_s == [3, 2, 1]
    assert type(c_s) is list

    assert_array_equal(d_s, np.array([['c', 2],
                                      ['b', 1],
                                      ['a', 0]],
                                     dtype=object))
    assert type(d_s) is MockDataFrame

    assert_array_equal(e_s.toarray(), np.array([[4, 5],
                                                [2, 3],
                                                [0, 1]]))
def test_gen_even_slices():
    """Check ``gen_even_slices`` coverage and its error on bad ``n_packs``."""
    # the slices for 10 samples in 3 packs must cover every sample, in order
    some_range = range(10)
    joined_range = list(chain.from_iterable(
        some_range[chunk] for chunk in gen_even_slices(10, 3)
    ))
    assert_array_equal(some_range, joined_range)

    # A negative n_packs raises, but only once the generator is advanced.
    slices = gen_even_slices(10, -1)
    with pytest.raises(ValueError, match="gen_even_slices got n_packs=-1,"
                                         " must be >=1"):
        next(slices)
@pytest.mark.parametrize(
    ('row_bytes', 'max_n_rows', 'working_memory', 'expected', 'warning'),
    [
        (1024, None, 1, 1024, None),
        (1024, None, 0.99999999, 1023, None),
        (1023, None, 1, 1025, None),
        (1025, None, 1, 1023, None),
        (1024, None, 2, 2048, None),
        (1024, 7, 1, 7, None),
        (1024 * 1024, None, 1, 1, None),
        (1024 * 1024 + 1, None, 1, 1,
         'Could not adhere to working_memory config. '
         'Currently 1MiB, 2MiB required.'),
    ]
)
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory,
                          expected, warning):
    """Check ``get_chunk_n_rows`` results and the warning it may emit.

    ``working_memory`` is supplied first as an explicit argument and then
    only through ``config_context``; both calls must give ``expected``.
    """
    # Pick the call wrapper: no warning allowed, or exactly ``warning``.
    if warning is None:
        check_warning = assert_no_warnings
    else:
        def check_warning(*args, **kw):
            return assert_warns_message(UserWarning, warning, *args, **kw)

    def _check(n_rows):
        # value and exact type must both match
        assert n_rows == expected
        assert type(n_rows) is type(expected)

    _check(check_warning(get_chunk_n_rows,
                         row_bytes=row_bytes,
                         max_n_rows=max_n_rows,
                         working_memory=working_memory))

    with config_context(working_memory=working_memory):
        _check(check_warning(get_chunk_n_rows,
                             row_bytes=row_bytes,
                             max_n_rows=max_n_rows))
@pytest.mark.parametrize(
    ['source', 'message', 'is_long'],
    [
        ('ABC', string.ascii_lowercase, False),
        ('ABCDEF', string.ascii_lowercase, False),
        ('ABC', string.ascii_lowercase * 3, True),
        ('ABC' * 10, string.ascii_lowercase, True),
        ('ABC', string.ascii_lowercase + '\u1048', False),
    ]
)
@pytest.mark.parametrize(
    ['time', 'time_str'],
    [
        (0.2, '   0.2s'),
        (20, '  20.0s'),
        (2000, '33.3min'),
        (20000, '333.3min'),
    ]
)
def test_message_with_time(source, message, is_long, time, time_str):
    """Check the layout of the line built by ``_message_with_time``.

    The line is peeled apart piece by piece: the ``[source] `` prefix, then
    from the right the time, ``, total=``, the message and one space. What
    is left must be empty for long lines and only dots otherwise.
    """
    out = _message_with_time(source, message, time)

    # short lines are exactly 70 characters wide, long ones are wider
    if is_long:
        assert len(out) > 70
    else:
        assert len(out) == 70

    prefix = '[' + source + '] '
    assert out.startswith(prefix)
    out = out[len(prefix):]

    # strip the trailing pieces from right to left
    for suffix in (time_str, ', total=', message, ' '):
        assert out.endswith(suffix)
        out = out[:-len(suffix)]

    if is_long:
        assert not out
    else:
        assert set(out) == {'.'}
@pytest.mark.parametrize(
    ['message', 'expected'],
    [
        ('hello', _message_with_time('ABC', 'hello', 0.1) + '\n'),
        ('', _message_with_time('ABC', '', 0.1) + '\n'),
        (None, ''),
    ]
)
def test_print_elapsed_time(message, expected, capsys, monkeypatch):
    """Check what ``_print_elapsed_time`` writes to stdout."""
    def _freeze_timer(value):
        # make ``timeit.default_timer`` report a fixed instant
        monkeypatch.setattr(timeit, 'default_timer', lambda: value)

    # the timer reads 0 on entry and 0.1 by the time the block exits
    _freeze_timer(0)
    with _print_elapsed_time('ABC', message):
        _freeze_timer(0.1)

    captured = capsys.readouterr()
    assert captured.out == expected
@pytest.mark.parametrize(
    "value, result",
    [
        (float("nan"), True),
        (np.nan, True),
        (float(np.nan), True),
        (np.float32(np.nan), True),
        (np.float64(np.nan), True),
        (0, False),
        (0., False),
        (None, False),
        ("", False),
        ("nan", False),
        ([np.nan], False),
    ]
)
def test_is_scalar_nan(value, result):
    """Check ``is_scalar_nan`` returns exactly ``True`` or ``False``."""
    outcome = is_scalar_nan(value)
    # identity check: the result must be a real bool, not merely truthy
    assert outcome is result
def dummy_func():
    """Do nothing and return ``None``."""
    return None
def test_deprecation_joblib_api(tmpdir):
    """Check that the joblib helpers re-exported by sklearn do not warn.

    Only parallel_backend and register_parallel_backend are not deprecated in
    sklearn.utils.
    """
    from sklearn.utils import parallel_backend, register_parallel_backend
    from sklearn.utils._joblib import joblib

    try:
        assert_no_warnings(parallel_backend, 'loky', None)
        assert_no_warnings(register_parallel_backend, 'failing', None)
        # the registration above must have added the entry removed below
        assert 'failing' in joblib.parallel.BACKENDS
    finally:
        # Always drop the dummy backend so a failing assertion cannot leave
        # it behind in joblib's module-level registry for later tests.
        joblib.parallel.BACKENDS.pop('failing', None)
@pytest.mark.parametrize(
    "sequence",
    [
        [np.array(1), np.array(2)],
        [[1, 2], [3, 4]],
    ]
)
def test_to_object_array(sequence):
    """Check ``_to_object_array`` returns a 1D object-dtype ndarray."""
    converted = _to_object_array(sequence)

    assert isinstance(converted, np.ndarray)
    assert converted.dtype.kind == 'O'
    assert converted.ndim == 1