projektAI/venv/Lib/site-packages/sklearn/impute/tests/test_knn.py

642 lines
17 KiB
Python
Raw Normal View History

2021-06-06 22:13:05 +02:00
import numpy as np
import pytest
from sklearn import config_context
from sklearn.impute import KNNImputer
from sklearn.metrics.pairwise import nan_euclidean_distances
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils._testing import assert_allclose
@pytest.mark.parametrize("weights", ["uniform", "distance"])
@pytest.mark.parametrize("n_neighbors", range(1, 6))
def test_knn_imputer_shape(weights, n_neighbors):
# Verify the shapes of the imputed matrix for different weights and
# number of neighbors.
n_rows = 10
n_cols = 2
X = np.random.rand(n_rows, n_cols)
X[0, 0] = np.nan
imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights)
X_imputed = imputer.fit_transform(X)
assert X_imputed.shape == (n_rows, n_cols)
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_default_with_invalid_input(na):
# Test imputation with default values and invalid input
# Test with inf present
X = np.array([
[np.inf, 1, 1, 2, na],
[2, 1, 2, 2, 3],
[3, 2, 3, 3, 8],
[na, 6, 0, 5, 13],
[na, 7, 0, 7, 8],
[6, 6, 2, 5, 7],
])
with pytest.raises(ValueError, match="Input contains (infinity|NaN)"):
KNNImputer(missing_values=na).fit(X)
# Test with inf present in matrix passed in transform()
X = np.array([
[np.inf, 1, 1, 2, na],
[2, 1, 2, 2, 3],
[3, 2, 3, 3, 8],
[na, 6, 0, 5, 13],
[na, 7, 0, 7, 8],
[6, 6, 2, 5, 7],
])
X_fit = np.array([
[0, 1, 1, 2, na],
[2, 1, 2, 2, 3],
[3, 2, 3, 3, 8],
[na, 6, 0, 5, 13],
[na, 7, 0, 7, 8],
[6, 6, 2, 5, 7],
])
imputer = KNNImputer(missing_values=na).fit(X_fit)
with pytest.raises(ValueError, match="Input contains (infinity|NaN)"):
imputer.transform(X)
# negative n_neighbors
with pytest.raises(ValueError, match="Expected n_neighbors > 0"):
KNNImputer(missing_values=na, n_neighbors=0).fit(X_fit)
# Test with missing_values=0 when NaN present
imputer = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform")
X = np.array([
[np.nan, 0, 0, 0, 5],
[np.nan, 1, 0, np.nan, 3],
[np.nan, 2, 0, 0, 0],
[np.nan, 6, 0, 5, 13],
])
msg = (r"Input contains NaN, infinity or a value too large for "
r"dtype\('float64'\)")
with pytest.raises(ValueError, match=msg):
imputer.fit(X)
X = np.array([
[0, 0],
[np.nan, 2],
])
# Test with a metric type without NaN support
imputer = KNNImputer(metric="euclidean")
bad_metric_msg = "The selected metric does not support NaN values"
with pytest.raises(ValueError, match=bad_metric_msg):
imputer.fit(X)
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_removes_all_na_features(na):
X = np.array([
[1, 1, na, 1, 1, 1.],
[2, 3, na, 2, 2, 2],
[3, 4, na, 3, 3, na],
[6, 4, na, na, 6, 6],
])
knn = KNNImputer(missing_values=na, n_neighbors=2).fit(X)
X_transform = knn.transform(X)
assert not np.isnan(X_transform).any()
assert X_transform.shape == (4, 5)
X_test = np.arange(0, 12).reshape(2, 6)
X_transform = knn.transform(X_test)
assert_allclose(X_test[:, [0, 1, 3, 4, 5]], X_transform)
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_zero_nan_imputes_the_same(na):
# Test with an imputable matrix and compare with different missing_values
X_zero = np.array([
[1, 0, 1, 1, 1.],
[2, 2, 2, 2, 2],
[3, 3, 3, 3, 0],
[6, 6, 0, 6, 6],
])
X_nan = np.array([
[1, na, 1, 1, 1.],
[2, 2, 2, 2, 2],
[3, 3, 3, 3, na],
[6, 6, na, 6, 6],
])
X_imputed = np.array([
[1, 2.5, 1, 1, 1.],
[2, 2, 2, 2, 2],
[3, 3, 3, 3, 1.5],
[6, 6, 2.5, 6, 6],
])
imputer_zero = KNNImputer(missing_values=0, n_neighbors=2,
weights="uniform")
imputer_nan = KNNImputer(missing_values=na, n_neighbors=2,
weights="uniform")
assert_allclose(imputer_zero.fit_transform(X_zero), X_imputed)
assert_allclose(imputer_zero.fit_transform(X_zero),
imputer_nan.fit_transform(X_nan))
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_verify(na):
# Test with an imputable matrix
X = np.array([
[1, 0, 0, 1],
[2, 1, 2, na],
[3, 2, 3, na],
[na, 4, 5, 5],
[6, na, 6, 7],
[8, 8, 8, 8],
[16, 15, 18, 19],
])
X_imputed = np.array([
[1, 0, 0, 1],
[2, 1, 2, 8],
[3, 2, 3, 8],
[4, 4, 5, 5],
[6, 3, 6, 7],
[8, 8, 8, 8],
[16, 15, 18, 19],
])
imputer = KNNImputer(missing_values=na)
assert_allclose(imputer.fit_transform(X), X_imputed)
# Test when there is not enough neighbors
X = np.array([
[1, 0, 0, na],
[2, 1, 2, na],
[3, 2, 3, na],
[4, 4, 5, na],
[6, 7, 6, na],
[8, 8, 8, na],
[20, 20, 20, 20],
[22, 22, 22, 22]
])
# Not enough neighbors, use column mean from training
X_impute_value = (20 + 22) / 2
X_imputed = np.array([
[1, 0, 0, X_impute_value],
[2, 1, 2, X_impute_value],
[3, 2, 3, X_impute_value],
[4, 4, 5, X_impute_value],
[6, 7, 6, X_impute_value],
[8, 8, 8, X_impute_value],
[20, 20, 20, 20],
[22, 22, 22, 22]
])
imputer = KNNImputer(missing_values=na)
assert_allclose(imputer.fit_transform(X), X_imputed)
# Test when data in fit() and transform() are different
X = np.array([
[0, 0],
[na, 2],
[4, 3],
[5, 6],
[7, 7],
[9, 8],
[11, 16]
])
X1 = np.array([
[1, 0],
[3, 2],
[4, na]
])
X_2_1 = (0 + 3 + 6 + 7 + 8) / 5
X1_imputed = np.array([
[1, 0],
[3, 2],
[4, X_2_1]
])
imputer = KNNImputer(missing_values=na)
assert_allclose(imputer.fit(X).transform(X1), X1_imputed)
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_one_n_neighbors(na):
X = np.array([
[0, 0],
[na, 2],
[4, 3],
[5, na],
[7, 7],
[na, 8],
[14, 13]
])
X_imputed = np.array([
[0, 0],
[4, 2],
[4, 3],
[5, 3],
[7, 7],
[7, 8],
[14, 13]
])
imputer = KNNImputer(n_neighbors=1, missing_values=na)
assert_allclose(imputer.fit_transform(X), X_imputed)
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_all_samples_are_neighbors(na):
X = np.array([
[0, 0],
[na, 2],
[4, 3],
[5, na],
[7, 7],
[na, 8],
[14, 13]
])
X_imputed = np.array([
[0, 0],
[6, 2],
[4, 3],
[5, 5.5],
[7, 7],
[6, 8],
[14, 13]
])
n_neighbors = X.shape[0] - 1
imputer = KNNImputer(n_neighbors=n_neighbors, missing_values=na)
assert_allclose(imputer.fit_transform(X), X_imputed)
n_neighbors = X.shape[0]
imputer_plus1 = KNNImputer(n_neighbors=n_neighbors, missing_values=na)
assert_allclose(imputer_plus1.fit_transform(X), X_imputed)
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_weight_uniform(na):
X = np.array([
[0, 0],
[na, 2],
[4, 3],
[5, 6],
[7, 7],
[9, 8],
[11, 10]
])
# Test with "uniform" weight (or unweighted)
X_imputed_uniform = np.array([
[0, 0],
[5, 2],
[4, 3],
[5, 6],
[7, 7],
[9, 8],
[11, 10]
])
imputer = KNNImputer(weights="uniform", missing_values=na)
assert_allclose(imputer.fit_transform(X), X_imputed_uniform)
# Test with "callable" weight
def no_weight(dist):
return None
imputer = KNNImputer(weights=no_weight, missing_values=na)
assert_allclose(imputer.fit_transform(X), X_imputed_uniform)
# Test with "callable" uniform weight
def uniform_weight(dist):
return np.ones_like(dist)
imputer = KNNImputer(weights=uniform_weight, missing_values=na)
assert_allclose(imputer.fit_transform(X), X_imputed_uniform)
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_weight_distance(na):
X = np.array([
[0, 0],
[na, 2],
[4, 3],
[5, 6],
[7, 7],
[9, 8],
[11, 10]
])
# Test with "distance" weight
nn = KNeighborsRegressor(metric="euclidean", weights="distance")
X_rows_idx = [0, 2, 3, 4, 5, 6]
nn.fit(X[X_rows_idx, 1:], X[X_rows_idx, 0])
knn_imputed_value = nn.predict(X[1:2, 1:])[0]
# Manual calculation
X_neighbors_idx = [0, 2, 3, 4, 5]
dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na)
weights = 1 / dist[:, X_neighbors_idx].ravel()
manual_imputed_value = np.average(X[X_neighbors_idx, 0], weights=weights)
X_imputed_distance1 = np.array([
[0, 0],
[manual_imputed_value, 2],
[4, 3],
[5, 6],
[7, 7],
[9, 8],
[11, 10]
])
# NearestNeighbor calculation
X_imputed_distance2 = np.array([
[0, 0],
[knn_imputed_value, 2],
[4, 3],
[5, 6],
[7, 7],
[9, 8],
[11, 10]
])
imputer = KNNImputer(weights="distance", missing_values=na)
assert_allclose(imputer.fit_transform(X), X_imputed_distance1)
assert_allclose(imputer.fit_transform(X), X_imputed_distance2)
# Test with weights = "distance" and n_neighbors=2
X = np.array([
[na, 0, 0],
[2, 1, 2],
[3, 2, 3],
[4, 5, 5],
])
# neighbors are rows 1, 2, the nan_euclidean_distances are:
dist_0_1 = np.sqrt((3/2)*((1 - 0)**2 + (2 - 0)**2))
dist_0_2 = np.sqrt((3/2)*((2 - 0)**2 + (3 - 0)**2))
imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2])
X_imputed = np.array([
[imputed_value, 0, 0],
[2, 1, 2],
[3, 2, 3],
[4, 5, 5],
])
imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na)
assert_allclose(imputer.fit_transform(X), X_imputed)
# Test with varying missingness patterns
X = np.array([
[1, 0, 0, 1],
[0, na, 1, na],
[1, 1, 1, na],
[0, 1, 0, 0],
[0, 0, 0, 0],
[1, 0, 1, 1],
[10, 10, 10, 10],
])
# Get weights of donor neighbors
dist = nan_euclidean_distances(X, missing_values=na)
r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]]
r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]]
r1c1_nbor_wt = 1 / r1c1_nbor_dists
r1c3_nbor_wt = 1 / r1c3_nbor_dists
r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]]
r2c3_nbor_wt = 1 / r2c3_nbor_dists
# Collect donor values
col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy()
col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy()
# Final imputed values
r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)
X_imputed = np.array([
[1, 0, 0, 1],
[0, r1c1_imp, 1, r1c3_imp],
[1, 1, 1, r2c3_imp],
[0, 1, 0, 0],
[0, 0, 0, 0],
[1, 0, 1, 1],
[10, 10, 10, 10],
])
imputer = KNNImputer(weights="distance", missing_values=na)
assert_allclose(imputer.fit_transform(X), X_imputed)
X = np.array([
[0, 0, 0, na],
[1, 1, 1, na],
[2, 2, na, 2],
[3, 3, 3, 3],
[4, 4, 4, 4],
[5, 5, 5, 5],
[6, 6, 6, 6],
[na, 7, 7, 7]
])
dist = pairwise_distances(X, metric="nan_euclidean", squared=False,
missing_values=na)
# Calculate weights
r0c3_w = 1.0 / dist[0, 2:-1]
r1c3_w = 1.0 / dist[1, 2:-1]
r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)]
r7c0_w = 1.0 / dist[7, 2:7]
# Calculate weighted averages
r0c3 = np.average(X[2:-1, -1], weights=r0c3_w)
r1c3 = np.average(X[2:-1, -1], weights=r1c3_w)
r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w)
r7c0 = np.average(X[2:7, 0], weights=r7c0_w)
X_imputed = np.array([
[0, 0, 0, r0c3],
[1, 1, 1, r1c3],
[2, 2, r2c2, 2],
[3, 3, 3, 3],
[4, 4, 4, 4],
[5, 5, 5, 5],
[6, 6, 6, 6],
[r7c0, 7, 7, 7]
])
imputer_comp_wt = KNNImputer(missing_values=na, weights="distance")
assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed)
def test_knn_imputer_callable_metric():
# Define callable metric that returns the l1 norm:
def custom_callable(x, y, missing_values=np.nan, squared=False):
x = np.ma.array(x, mask=np.isnan(x))
y = np.ma.array(y, mask=np.isnan(y))
dist = np.nansum(np.abs(x-y))
return dist
X = np.array([
[4, 3, 3, np.nan],
[6, 9, 6, 9],
[4, 8, 6, 9],
[np.nan, 9, 11, 10.]
])
X_0_3 = (9 + 9) / 2
X_3_0 = (6 + 4) / 2
X_imputed = np.array([
[4, 3, 3, X_0_3],
[6, 9, 6, 9],
[4, 8, 6, 9],
[X_3_0, 9, 11, 10.]
])
imputer = KNNImputer(n_neighbors=2, metric=custom_callable)
assert_allclose(imputer.fit_transform(X), X_imputed)
@pytest.mark.parametrize("working_memory", [None, 0])
@pytest.mark.parametrize("na", [-1, np.nan])
# Note that we use working_memory=0 to ensure that chunking is tested, even
# for a small dataset. However, it should raise a UserWarning that we ignore.
@pytest.mark.filterwarnings("ignore:adhere to working_memory")
def test_knn_imputer_with_simple_example(na, working_memory):
X = np.array([
[0, na, 0, na],
[1, 1, 1, na],
[2, 2, na, 2],
[3, 3, 3, 3],
[4, 4, 4, 4],
[5, 5, 5, 5],
[6, 6, 6, 6],
[na, 7, 7, 7]
])
r0c1 = np.mean(X[1:6, 1])
r0c3 = np.mean(X[2:-1, -1])
r1c3 = np.mean(X[2:-1, -1])
r2c2 = np.mean(X[[0, 1, 3, 4, 5], 2])
r7c0 = np.mean(X[2:-1, 0])
X_imputed = np.array([
[0, r0c1, 0, r0c3],
[1, 1, 1, r1c3],
[2, 2, r2c2, 2],
[3, 3, 3, 3],
[4, 4, 4, 4],
[5, 5, 5, 5],
[6, 6, 6, 6],
[r7c0, 7, 7, 7]
])
with config_context(working_memory=working_memory):
imputer_comp = KNNImputer(missing_values=na)
assert_allclose(imputer_comp.fit_transform(X), X_imputed)
@pytest.mark.parametrize("na", [-1, np.nan])
@pytest.mark.parametrize("weights", ['uniform', 'distance'])
def test_knn_imputer_not_enough_valid_distances(na, weights):
# Samples with needed feature has nan distance
X1 = np.array([
[na, 11],
[na, 1],
[3, na]
])
X1_imputed = np.array([
[3, 11],
[3, 1],
[3, 6]
])
knn = KNNImputer(missing_values=na, n_neighbors=1, weights=weights)
assert_allclose(knn.fit_transform(X1), X1_imputed)
X2 = np.array([[4, na]])
X2_imputed = np.array([[4, 6]])
assert_allclose(knn.transform(X2), X2_imputed)
@pytest.mark.parametrize("na", [-1, np.nan])
def test_knn_imputer_drops_all_nan_features(na):
X1 = np.array([
[na, 1],
[na, 2]
])
knn = KNNImputer(missing_values=na, n_neighbors=1)
X1_expected = np.array([[1], [2]])
assert_allclose(knn.fit_transform(X1), X1_expected)
X2 = np.array([
[1, 2],
[3, na]
])
X2_expected = np.array([[2], [1.5]])
assert_allclose(knn.transform(X2), X2_expected)
@pytest.mark.parametrize("working_memory", [None, 0])
@pytest.mark.parametrize("na", [-1, np.nan])
def test_knn_imputer_distance_weighted_not_enough_neighbors(na,
working_memory):
X = np.array([
[3, na],
[2, na],
[na, 4],
[5, 6],
[6, 8],
[na, 5]
])
dist = pairwise_distances(X, metric="nan_euclidean", squared=False,
missing_values=na)
X_01 = np.average(X[3:5, 1], weights=1/dist[0, 3:5])
X_11 = np.average(X[3:5, 1], weights=1/dist[1, 3:5])
X_20 = np.average(X[3:5, 0], weights=1/dist[2, 3:5])
X_50 = np.average(X[3:5, 0], weights=1/dist[5, 3:5])
X_expected = np.array([
[3, X_01],
[2, X_11],
[X_20, 4],
[5, 6],
[6, 8],
[X_50, 5]
])
with config_context(working_memory=working_memory):
knn_3 = KNNImputer(missing_values=na, n_neighbors=3,
weights='distance')
assert_allclose(knn_3.fit_transform(X), X_expected)
knn_4 = KNNImputer(missing_values=na, n_neighbors=4,
weights='distance')
assert_allclose(knn_4.fit_transform(X), X_expected)
@pytest.mark.parametrize("na, allow_nan", [(-1, False), (np.nan, True)])
def test_knn_tags(na, allow_nan):
knn = KNNImputer(missing_values=na)
assert knn._get_tags()["allow_nan"] == allow_nan