# Authors: Ashim Bhattarai <ashimb9@gmail.com>
#          Thomas J Fan <thomasjpfan@gmail.com>
# License: BSD 3 clause

from numbers import Integral

import numpy as np

from ..base import _fit_context
from ..metrics import pairwise_distances_chunked
from ..metrics.pairwise import _NAN_METRICS
from ..neighbors._base import _get_weights
from ..utils._mask import _get_mask
from ..utils._missing import is_scalar_nan
from ..utils._param_validation import Hidden, Interval, StrOptions
from ..utils.validation import FLOAT_DTYPES, _check_feature_names_in, check_is_fitted
from ._base import _BaseImputer


class KNNImputer(_BaseImputer):
    """Imputation for completing missing values using k-Nearest Neighbors.

    Each sample's missing values are imputed using the mean value from
    `n_neighbors` nearest neighbors found in the training set. Two samples are
    close if the features that neither is missing are close.

    Read more in the :ref:`User Guide <knnimpute>`.

    .. versionadded:: 0.22

    Parameters
    ----------
    missing_values : int, float, str, np.nan or None, default=np.nan
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For pandas' dataframes with
        nullable integer dtypes with missing values, `missing_values`
        should be set to np.nan, since `pd.NA` will be converted to np.nan.

    n_neighbors : int, default=5
        Number of neighboring samples to use for imputation.

    weights : {'uniform', 'distance'} or callable, default='uniform'
        Weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood are
          weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors that are farther away.
        - callable : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

    metric : {'nan_euclidean'} or callable, default='nan_euclidean'
        Distance metric for searching neighbors. Possible values:

        - 'nan_euclidean'
        - callable : a user-defined function which conforms to the definition
          of ``_pairwise_callable(X, Y, metric, **kwds)``. The function
          accepts two arrays, X and Y, and a `missing_values` keyword in
          `kwds` and returns a scalar distance value.

    copy : bool, default=True
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible.

    add_indicator : bool, default=False
        If True, a :class:`MissingIndicator` transform will stack onto the
        output of the imputer's transform. This allows a predictive estimator
        to account for missingness despite imputation. If a feature has no
        missing values at fit/train time, the feature won't appear on the
        missing indicator even if there are missing values at transform/test
        time.

    keep_empty_features : bool, default=False
        If True, features that consist exclusively of missing values when
        `fit` is called are returned in results when `transform` is called.
        The imputed value is always `0`.

        .. versionadded:: 1.2

    Attributes
    ----------
    indicator_ : :class:`~sklearn.impute.MissingIndicator`
        Indicator used to add binary indicators for missing values.
        ``None`` if add_indicator is False.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    SimpleImputer : Univariate imputer for completing missing values
        with simple strategies.
    IterativeImputer : Multivariate imputer that estimates values to impute for
        each feature with missing values from all the others.

    References
    ----------
    * `Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
      Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing
      value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17
      no. 6, 2001 Pages 520-525.
      <https://academic.oup.com/bioinformatics/article/17/6/520/272365>`_

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import KNNImputer
    >>> X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
    >>> imputer = KNNImputer(n_neighbors=2)
    >>> imputer.fit_transform(X)
    array([[1. , 2. , 4. ],
           [3. , 4. , 3. ],
           [5.5, 6. , 5. ],
           [8. , 8. , 7. ]])
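
    Inverse-distance weighting can be requested in the same way; a minimal
    sketch of the same call with a different ``weights`` setting (output
    omitted, since the imputed values generally differ from the uniform
    average above):

    >>> weighted_imputer = KNNImputer(n_neighbors=2, weights="distance")
    >>> _ = weighted_imputer.fit_transform(X)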

    For a more detailed example see
    :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`.
    """

    _parameter_constraints: dict = {
        **_BaseImputer._parameter_constraints,
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "weights": [StrOptions({"uniform", "distance"}), callable, Hidden(None)],
        "metric": [StrOptions(set(_NAN_METRICS)), callable],
        "copy": ["boolean"],
    }

    def __init__(
        self,
        *,
        missing_values=np.nan,
        n_neighbors=5,
        weights="uniform",
        metric="nan_euclidean",
        copy=True,
        add_indicator=False,
        keep_empty_features=False,
    ):
        super().__init__(
            missing_values=missing_values,
            add_indicator=add_indicator,
            keep_empty_features=keep_empty_features,
        )
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.copy = copy

    def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col):
        """Helper function to impute a single column.

        Parameters
        ----------
        dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors)
            Distance matrix between the receivers and potential donors from
            training set. There must be at least one non-nan distance between
            a receiver and a potential donor.

        n_neighbors : int
            Number of neighbors to consider.

        fit_X_col : ndarray of shape (n_potential_donors,)
            Column of potential donors from training set.

        mask_fit_X_col : ndarray of shape (n_potential_donors,)
            Missing mask for fit_X_col.

        Returns
        -------
        imputed_values : ndarray of shape (n_receivers,)
            Imputed values for receiver.
        """
        # Get donors
        donors_idx = np.argpartition(dist_pot_donors, n_neighbors - 1, axis=1)[
            :, :n_neighbors
        ]
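        # np.argpartition with kth=n_neighbors - 1 places the indices of the
        # n_neighbors smallest distances (in arbitrary order) in the first
        # n_neighbors positions of each row; e.g. for a row [3., 1., 2.] and
        # n_neighbors=2 the slice keeps columns 1 and 2, without a full sort.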

        # Get weight matrix from distance matrix
        donors_dist = dist_pot_donors[
            np.arange(donors_idx.shape[0])[:, None], donors_idx
        ]

        weight_matrix = _get_weights(donors_dist, self.weights)
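        # _get_weights returns None for uniform weights, in which case
        # np.ma.average below falls back to an unweighted mean of the donors.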

        # fill nans with zeros
        if weight_matrix is not None:
            weight_matrix[np.isnan(weight_matrix)] = 0.0

        # Retrieve donor values and calculate kNN average
        donors = fit_X_col.take(donors_idx)
        donors_mask = mask_fit_X_col.take(donors_idx)
        donors = np.ma.array(donors, mask=donors_mask)

        return np.ma.average(donors, axis=1, weights=weight_matrix).data

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            The fitted `KNNImputer` class instance.
        """
        # Check data integrity and calling arguments
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"

        X = self._validate_data(
            X,
            accept_sparse=False,
            dtype=FLOAT_DTYPES,
            force_all_finite=force_all_finite,
            copy=self.copy,
        )

        self._fit_X = X
        self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)
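        # Flag the columns that had at least one observed value at fit time;
        # entirely-missing columns are dropped by transform unless
        # keep_empty_features=True.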
        self._valid_mask = ~np.all(self._mask_fit_X, axis=0)

        super()._fit_indicator(self._mask_fit_X)

        return self

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        X : array-like of shape (n_samples, n_output_features)
            The imputed dataset. `n_output_features` is the number of features
            that are not always missing during `fit`.
        """
        check_is_fitted(self)
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"
        X = self._validate_data(
            X,
            accept_sparse=False,
            dtype=FLOAT_DTYPES,
            force_all_finite=force_all_finite,
            copy=self.copy,
            reset=False,
        )

        mask = _get_mask(X, self.missing_values)
        mask_fit_X = self._mask_fit_X
        valid_mask = self._valid_mask

        X_indicator = super()._transform_indicator(mask)

        # Removes columns where the training data is all nan
        if not np.any(mask):
            # No missing values in X
            if self.keep_empty_features:
                Xc = X
                Xc[:, ~valid_mask] = 0
            else:
                Xc = X[:, valid_mask]

            # Even if there are no missing values in X, we still concatenate Xc
            # with the missing value indicator matrix, X_indicator.
            # This is to ensure that the output maintains consistency in terms
            # of columns, regardless of whether missing values exist in X or not.
            return super()._concatenate_indicator(Xc, X_indicator)

        row_missing_idx = np.flatnonzero(mask.any(axis=1))

        non_missing_fit_X = np.logical_not(mask_fit_X)

        # Maps from indices from X to indices in dist matrix
        dist_idx_map = np.zeros(X.shape[0], dtype=int)
        dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])
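        # e.g. if only rows 2 and 5 of X contain missing values, then
        # dist_idx_map[2] == 0 and dist_idx_map[5] == 1, i.e. row 5 of X maps
        # to row 1 of the distance matrix computed over X[row_missing_idx].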

        def process_chunk(dist_chunk, start):
            row_missing_chunk = row_missing_idx[start : start + len(dist_chunk)]

            # Find and impute missing by column
            for col in range(X.shape[1]):
                if not valid_mask[col]:
                    # column was all missing during training
                    continue

                col_mask = mask[row_missing_chunk, col]
                if not np.any(col_mask):
                    # column has no missing values
                    continue

                (potential_donors_idx,) = np.nonzero(non_missing_fit_X[:, col])

                # receivers_idx are indices in X
                receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]

                # distances for samples that needed imputation for column
                dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][
                    :, potential_donors_idx
                ]

                # receivers with all nan distances impute with mean
                all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
                all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]

                if all_nan_receivers_idx.size:
                    col_mean = np.ma.array(
                        self._fit_X[:, col], mask=mask_fit_X[:, col]
                    ).mean()
                    X[all_nan_receivers_idx, col] = col_mean

                    if len(all_nan_receivers_idx) == len(receivers_idx):
                        # all receivers imputed with mean
                        continue

                # receivers with at least one defined distance
                receivers_idx = receivers_idx[~all_nan_dist_mask]
                dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][
                    :, potential_donors_idx
                ]

                n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
                value = self._calc_impute(
                    dist_subset,
                    n_neighbors,
                    self._fit_X[potential_donors_idx, col],
                    mask_fit_X[potential_donors_idx, col],
                )
                X[receivers_idx, col] = value

        # process in fixed-memory chunks
        gen = pairwise_distances_chunked(
            X[row_missing_idx, :],
            self._fit_X,
            metric=self.metric,
            missing_values=self.missing_values,
            force_all_finite=force_all_finite,
            reduce_func=process_chunk,
        )
        for chunk in gen:
            # process_chunk modifies X in place. No return value.
            pass

        if self.keep_empty_features:
            Xc = X
            Xc[:, ~valid_mask] = 0
        else:
            Xc = X[:, valid_mask]

        return super()._concatenate_indicator(Xc, X_indicator)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
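        # Keep only the names of features that were not entirely missing
        # during fit (see `_valid_mask` computed in `fit`).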
        names = input_features[self._valid_mask]
        return self._concatenate_indicator_feature_names_out(names, input_features)