209 lines
7.2 KiB
Python
209 lines
7.2 KiB
Python
![]() |
import pickle
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
from numpy.testing import assert_array_equal
|
||
|
|
||
|
from sklearn.utils._encode import _unique
|
||
|
from sklearn.utils._encode import _encode
|
||
|
from sklearn.utils._encode import _check_unknown
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "values, expected",
    [
        (
            np.array([2, 1, 3, 1, 3], dtype='int64'),
            np.array([1, 2, 3], dtype='int64'),
        ),
        (
            np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
            np.array(['a', 'b', 'c'], dtype=object),
        ),
        (
            np.array(['b', 'a', 'c', 'a', 'c']),
            np.array(['a', 'b', 'c']),
        ),
    ],
    ids=['int64', 'object', 'str'],
)
def test_encode_util(values, expected):
    """Check ``_unique`` then ``_encode`` on int64, object and str arrays.

    ``_unique(values)`` must equal ``expected``; encoding ``values`` against
    those uniques must give the same integer codes in every case.
    """
    # All three parametrized inputs share this expected encoding.
    expected_codes = np.array([1, 0, 2, 0, 2])

    found_uniques = _unique(values)
    assert_array_equal(found_uniques, expected)

    assert_array_equal(_encode(values, uniques=found_uniques), expected_codes)
|
||
|
|
||
|
|
||
|
def test_encode_with_check_unknown():
    """Exercise the ``check_unknown`` parameter of ``_encode``."""
    unseen_labels_msg = 'y contains previously unseen labels'

    # Numeric input: the value 4 is absent from the known uniques.
    numeric_uniques = np.array([1, 2, 3])
    numeric_values = np.array([1, 2, 3, 4])

    # Default is True, raise error
    with pytest.raises(ValueError, match=unseen_labels_msg):
        _encode(numeric_values, uniques=numeric_uniques, check_unknown=True)

    # With check_unknown=False the same call must not raise.
    _encode(numeric_values, uniques=numeric_uniques, check_unknown=False)

    # Object dtype: the parameter is ignored, so the unseen label 'd' still
    # triggers the error even with check_unknown=False.
    object_uniques = np.array(['a', 'b', 'c'], dtype=object)
    object_values = np.array(['a', 'b', 'c', 'd'], dtype=object)
    with pytest.raises(ValueError, match=unseen_labels_msg):
        _encode(object_values, uniques=object_uniques, check_unknown=False)
|
||
|
|
||
|
|
||
|
def _assert_check_unknown(values, uniques, expected_diff, expected_mask):
    """Assert the output of ``_check_unknown`` with and without the mask.

    Parameters
    ----------
    values : array-like
        Values passed to ``_check_unknown``.
    uniques : array-like
        Known values passed to ``_check_unknown``.
    expected_diff : array-like
        Expected diff, compared against both calls.
    expected_mask : array-like
        Expected mask, compared against the ``return_mask=True`` call.
    """
    # Plain call: only the diff comes back.
    assert_array_equal(_check_unknown(values, uniques), expected_diff)

    # return_mask=True: a (diff, mask) pair comes back.
    masked_diff, mask = _check_unknown(values, uniques, return_mask=True)
    assert_array_equal(masked_diff, expected_diff)
    assert_array_equal(mask, expected_mask)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "values, uniques, expected_diff, expected_mask",
    [
        # numeric inputs without missing values
        (
            np.array([1, 2, 3, 4]),
            np.array([1, 2, 3]),
            [4],
            [True, True, True, False],
        ),
        (
            np.array([2, 1, 4, 5]),
            np.array([2, 5, 1]),
            [4],
            [True, True, False, True],
        ),
        # numeric inputs with nan in values and/or uniques
        (
            np.array([2, 1, np.nan]),
            np.array([2, 5, 1]),
            [np.nan],
            [True, True, False],
        ),
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1]),
            [4, np.nan],
            [True, True, False, False],
        ),
        (
            np.array([2, 1, 4, 5]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        # object dtype strings
        (
            np.array(['a', 'b', 'c', 'd'], dtype=object),
            np.array(['a', 'b', 'c'], dtype=object),
            np.array(['d'], dtype=object),
            [True, True, True, False],
        ),
        (
            np.array(['d', 'c', 'a', 'b'], dtype=object),
            np.array(['a', 'c', 'b'], dtype=object),
            np.array(['d'], dtype=object),
            [False, True, True, True],
        ),
        # numpy str dtype
        (
            np.array(['a', 'b', 'c', 'd']),
            np.array(['a', 'b', 'c']),
            np.array(['d']),
            [True, True, True, False],
        ),
        (
            np.array(['d', 'c', 'a', 'b']),
            np.array(['a', 'c', 'b']),
            np.array(['d']),
            [False, True, True, True],
        ),
    ],
)
def test_check_unknown(values, uniques, expected_diff, expected_mask):
    """Run ``_check_unknown`` on numeric, object and str arrays.

    Each case is delegated to ``_assert_check_unknown``, which compares both
    the diff and the mask against the expectations.
    """
    _assert_check_unknown(
        values=values,
        uniques=uniques,
        expected_diff=expected_diff,
        expected_mask=expected_mask,
    )
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("missing_value", [None, np.nan, float('nan')])
@pytest.mark.parametrize('pickle_uniques', [True, False])
def test_check_unknown_missing_values(missing_value, pickle_uniques):
    """Check ``_check_unknown`` with missing values in object-dtype arrays.

    Three scenarios are run in order; when ``pickle_uniques`` is true the
    uniques are round-tripped through pickle before the check.
    """
    # Each entry: (raw values, raw uniques, expected diff, expected mask).
    scenarios = [
        # missing value present in both values and uniques
        (
            ['d', 'c', 'a', 'b', missing_value],
            ['c', 'a', 'b', missing_value],
            ['d'],
            [False, True, True, True, True],
        ),
        # missing value only in values, alongside an unknown label
        (
            ['d', 'c', 'a', 'b', missing_value],
            ['c', 'a', 'b'],
            ['d', missing_value],
            [False, True, True, True, False],
        ),
        # missing value is the only unknown entry
        (
            ['a', missing_value],
            ['a', 'b', 'z'],
            [missing_value],
            [True, False],
        ),
    ]

    for raw_values, raw_uniques, expected_diff, expected_mask in scenarios:
        values = np.array(raw_values, dtype=object)
        uniques = np.array(raw_uniques, dtype=object)
        if pickle_uniques:
            uniques = pickle.loads(pickle.dumps(uniques))

        _assert_check_unknown(values, uniques, expected_diff, expected_mask)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize('missing_value', [np.nan, None, float('nan')])
@pytest.mark.parametrize('pickle_uniques', [True, False])
def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
    """Check ``_unique`` and ``_encode`` with a missing value, object dtype.

    When ``pickle_uniques`` is true the uniques are round-tripped through
    pickle before being handed to ``_encode``.
    """
    values = np.array(['a', 'c', 'c', missing_value, 'b'], dtype=object)
    expected_uniques = np.array(['a', 'b', 'c', missing_value], dtype=object)
    expected_codes = np.array([0, 2, 2, 3, 1])

    found = _unique(values)

    if missing_value is not None:
        # nan case: compare all but the last entry, then check that the last
        # entry is nan with np.isnan.
        assert_array_equal(found[:-1], expected_uniques[:-1])
        assert np.isnan(found[-1])
    else:
        assert_array_equal(found, expected_uniques)

    if pickle_uniques:
        found = pickle.loads(pickle.dumps(found))

    assert_array_equal(_encode(values, uniques=found), expected_codes)
|
||
|
|
||
|
|
||
|
def test_unique_util_missing_values_numeric():
    """Check ``_unique`` and ``_encode`` on a float array containing nan."""
    values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
    want_uniques = np.array([1, 3, 5, np.nan], dtype=float)
    want_inverse = np.array([1, 0, 3, 2, 1, 3])

    # Plain call returns only the uniques.
    assert_array_equal(_unique(values), want_uniques)

    # With return_inverse=True the inverse indices come back as well.
    got_uniques, got_inverse = _unique(values, return_inverse=True)
    assert_array_equal(got_uniques, want_uniques)
    assert_array_equal(got_inverse, want_inverse)

    # Encoding against the uniques must give the same indices as the inverse.
    assert_array_equal(_encode(values, uniques=got_uniques), want_inverse)
|
||
|
|
||
|
|
||
|
def test_unique_util_with_all_missing_values():
    """Check ``_unique`` on an object array with None, np.nan and float nan."""
    mixed_missing = [np.nan, 'a', 'c', 'c', None, float('nan'), None]
    values = np.array(mixed_missing, dtype=object)
    expected_inverse = [3, 0, 1, 1, 2, 3, 2]

    found = _unique(values)
    # Every entry except the last is compared directly; the last one is
    # checked with np.isnan.
    assert_array_equal(found[:-1], ['a', 'c', None])
    assert np.isnan(found[-1])

    _, found_inverse = _unique(values, return_inverse=True)
    assert_array_equal(found_inverse, expected_inverse)
|
||
|
|
||
|
|
||
|
def test_check_unknown_with_both_missing_values():
    """Check ``_check_unknown`` on object values holding None and nan."""
    known_labels = ['a', 'c']
    values = np.array(
        [np.nan, 'a', 'c', 'c', None, np.nan, None], dtype=object
    )
    expected_mask = [False, True, True, True, False, False, False]

    # Without the mask: first diff entry is None, second is nan.
    unknown = _check_unknown(
        values, known_values=np.array(known_labels, dtype=object)
    )
    assert unknown[0] is None
    assert np.isnan(unknown[1])

    # With the mask: same diff expectations, plus the mask itself.
    unknown, mask = _check_unknown(
        values,
        known_values=np.array(known_labels, dtype=object),
        return_mask=True,
    )
    assert unknown[0] is None
    assert np.isnan(unknown[1])
    assert_array_equal(mask, expected_mask)
|