Inzynierka/Lib/site-packages/sklearn/feature_selection/tests/test_variance_threshold.py

import numpy as np
import pytest

from scipy.sparse import bsr_matrix, csc_matrix, csr_matrix

from sklearn.feature_selection import VarianceThreshold
from sklearn.utils._testing import assert_array_equal

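# Column 2 is constant (all 2s), so it has zero variance and should be dropped.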
data = [[0, 1, 2, 3, 4], [0, 2, 2, 3, 5], [1, 1, 2, 4, 0]]
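
# 10 identical single-feature samples; floating point rounding can make
# np.var(data2) slightly nonzero even though the feature is constant.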
data2 = [[-0.13725701]] * 10

def test_zero_variance():
    # Test VarianceThreshold with default setting, zero variance.
    for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]:
        sel = VarianceThreshold().fit(X)
        assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True))

    with pytest.raises(ValueError):
        VarianceThreshold().fit([[0, 1, 2, 3]])
    with pytest.raises(ValueError):
        VarianceThreshold().fit([[0, 1], [0, 1]])

def test_variance_threshold():
    # Test VarianceThreshold with custom variance.
    for X in [data, csr_matrix(data)]:
        X = VarianceThreshold(threshold=0.4).fit_transform(X)
        assert (len(data), 1) == X.shape

@pytest.mark.skipif(
    np.var(data2) == 0,
    reason=(
        "This test is not valid for this platform, "
        "as it relies on numerical instabilities."
    ),
)
def test_zero_variance_floating_point_error():
    # Test that VarianceThreshold(0.0).fit eliminates features that have
    # the same value in every sample, even when floating point errors
    # cause np.var not to be 0 for the feature.
    # See #13691
    for X in [data2, csr_matrix(data2), csc_matrix(data2), bsr_matrix(data2)]:
        msg = "No feature in X meets the variance threshold 0.00000"
        with pytest.raises(ValueError, match=msg):
            VarianceThreshold().fit(X)

def test_variance_nan():
    arr = np.array(data, dtype=np.float64)
    # add single NaN and feature should still be included
    arr[0, 0] = np.NaN
    # make all values in feature NaN and feature should be rejected
    arr[:, 1] = np.NaN
    for X in [arr, csr_matrix(arr), csc_matrix(arr), bsr_matrix(arr)]:
        sel = VarianceThreshold().fit(X)
        assert_array_equal([0, 3, 4], sel.get_support(indices=True))