Traktor/myenv/Lib/site-packages/sklearn/utils/_random.pyx
2024-05-23 01:57:24 +02:00

356 lines
12 KiB
Cython

# Author: Arnaud Joly
#
# License: BSD 3 clause
"""
Random utility function
=======================
This module complements missing features of ``numpy.random``.
The module contains:
* Several algorithms to sample integers without replacement.
* Fast rand_r alternative based on xor shifts
"""
import numpy as np
from . import check_random_state
from ._typedefs cimport intp_t
cdef uint32_t DEFAULT_SEED = 1
# Compatibility type to always accept the default int type used by NumPy, both
# before and after NumPy 2. On Windows, `long` does not always match `inp_t`.
# See the comments in the `sample_without_replacement` Python function for more
# details.
ctypedef fused default_int:
intp_t
long
cpdef _sample_without_replacement_check_input(default_int n_population,
default_int n_samples):
""" Check that input are consistent for sample_without_replacement"""
if n_population < 0:
raise ValueError('n_population should be greater than 0, got %s.'
% n_population)
if n_samples > n_population:
raise ValueError('n_population should be greater or equal than '
'n_samples, got n_samples > n_population (%s > %s)'
% (n_samples, n_population))
cpdef _sample_without_replacement_with_tracking_selection(
default_int n_population,
default_int n_samples,
random_state=None):
r"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Time complexity:
- Worst-case: unbounded
- Average-case:
O(O(np.random.randint) * \sum_{i=1}^n_samples 1 /
(1 - i / n_population)))
<= O(O(np.random.randint) *
n_population * ln((n_population - 2)
/(n_population - 1 - n_samples)))
<= O(O(np.random.randint) *
n_population * 1 / (1 - n_samples / n_population))
Space complexity of O(n_samples) in a python set.
Parameters
----------
n_population : int
The size of the set to sample from.
n_samples : int
The number of integer to sample.
random_state : int, RandomState instance or None, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
-------
out : ndarray of shape (n_samples,)
The sampled subsets of integer.
"""
_sample_without_replacement_check_input(n_population, n_samples)
cdef default_int i
cdef default_int j
cdef default_int[::1] out = np.empty((n_samples, ), dtype=int)
rng = check_random_state(random_state)
rng_randint = rng.randint
# The following line of code are heavily inspired from python core,
# more precisely of random.sample.
cdef set selected = set()
for i in range(n_samples):
j = rng_randint(n_population)
while j in selected:
j = rng_randint(n_population)
selected.add(j)
out[i] = j
return np.asarray(out)
cpdef _sample_without_replacement_with_pool(default_int n_population,
default_int n_samples,
random_state=None):
"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Time complexity: O(n_population + O(np.random.randint) * n_samples)
Space complexity of O(n_population + n_samples).
Parameters
----------
n_population : int
The size of the set to sample from.
n_samples : int
The number of integer to sample.
random_state : int, RandomState instance or None, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
-------
out : ndarray of shape (n_samples,)
The sampled subsets of integer.
"""
_sample_without_replacement_check_input(n_population, n_samples)
cdef default_int i
cdef default_int j
cdef default_int[::1] out = np.empty((n_samples,), dtype=int)
cdef default_int[::1] pool = np.empty((n_population,), dtype=int)
rng = check_random_state(random_state)
rng_randint = rng.randint
# Initialize the pool
for i in range(n_population):
pool[i] = i
# The following line of code are heavily inspired from python core,
# more precisely of random.sample.
for i in range(n_samples):
j = rng_randint(n_population - i) # invariant: non-selected at [0,n-i)
out[i] = pool[j]
pool[j] = pool[n_population - i - 1] # move non-selected item into vacancy
return np.asarray(out)
cpdef _sample_without_replacement_with_reservoir_sampling(
default_int n_population,
default_int n_samples,
random_state=None
):
"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Time complexity of
O((n_population - n_samples) * O(np.random.randint) + n_samples)
Space complexity of O(n_samples)
Parameters
----------
n_population : int
The size of the set to sample from.
n_samples : int
The number of integer to sample.
random_state : int, RandomState instance or None, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
-------
out : ndarray of shape (n_samples,)
The sampled subsets of integer. The order of the items is not
necessarily random. Use a random permutation of the array if the order
of the items has to be randomized.
"""
_sample_without_replacement_check_input(n_population, n_samples)
cdef default_int i
cdef default_int j
cdef default_int[::1] out = np.empty((n_samples, ), dtype=int)
rng = check_random_state(random_state)
rng_randint = rng.randint
# This cython implementation is based on the one of Robert Kern:
# http://mail.scipy.org/pipermail/numpy-discussion/2010-December/
# 054289.html
#
for i in range(n_samples):
out[i] = i
for i from n_samples <= i < n_population:
j = rng_randint(0, i + 1)
if j < n_samples:
out[j] = i
return np.asarray(out)
cdef _sample_without_replacement(default_int n_population,
default_int n_samples,
method="auto",
random_state=None):
"""Sample integers without replacement.
Private function for the implementation, see sample_without_replacement
documentation for more details.
"""
_sample_without_replacement_check_input(n_population, n_samples)
all_methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")
ratio = <double> n_samples / n_population if n_population != 0.0 else 1.0
# Check ratio and use permutation unless ratio < 0.01 or ratio > 0.99
if method == "auto" and ratio > 0.01 and ratio < 0.99:
rng = check_random_state(random_state)
return rng.permutation(n_population)[:n_samples]
if method == "auto" or method == "tracking_selection":
# TODO the pool based method can also be used.
# however, it requires special benchmark to take into account
# the memory requirement of the array vs the set.
# The value 0.2 has been determined through benchmarking.
if ratio < 0.2:
return _sample_without_replacement_with_tracking_selection(
n_population, n_samples, random_state)
else:
return _sample_without_replacement_with_reservoir_sampling(
n_population, n_samples, random_state)
elif method == "reservoir_sampling":
return _sample_without_replacement_with_reservoir_sampling(
n_population, n_samples, random_state)
elif method == "pool":
return _sample_without_replacement_with_pool(n_population, n_samples,
random_state)
else:
raise ValueError('Expected a method name in %s, got %s. '
% (all_methods, method))
def sample_without_replacement(
object n_population, object n_samples, method="auto", random_state=None):
"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Parameters
----------
n_population : int
The size of the set to sample from.
n_samples : int
The number of integer to sample.
random_state : int, RandomState instance or None, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
method : {"auto", "tracking_selection", "reservoir_sampling", "pool"}, \
default='auto'
If method == "auto", the ratio of n_samples / n_population is used
to determine which algorithm to use:
If ratio is between 0 and 0.01, tracking selection is used.
If ratio is between 0.01 and 0.99, numpy.random.permutation is used.
If ratio is greater than 0.99, reservoir sampling is used.
The order of the selected integers is undefined. If a random order is
desired, the selected subset should be shuffled.
If method =="tracking_selection", a set based implementation is used
which is suitable for `n_samples` <<< `n_population`.
If method == "reservoir_sampling", a reservoir sampling algorithm is
used which is suitable for high memory constraint or when
O(`n_samples`) ~ O(`n_population`).
The order of the selected integers is undefined. If a random order is
desired, the selected subset should be shuffled.
If method == "pool", a pool based algorithm is particularly fast, even
faster than the tracking selection method. However, a vector containing
the entire population has to be initialized.
If n_samples ~ n_population, the reservoir sampling method is faster.
Returns
-------
out : ndarray of shape (n_samples,)
The sampled subsets of integer. The subset of selected integer might
not be randomized, see the method argument.
Examples
--------
>>> from sklearn.utils.random import sample_without_replacement
>>> sample_without_replacement(10, 5, random_state=42)
array([8, 1, 5, 0, 7])
"""
cdef:
intp_t n_pop_intp, n_samples_intp
long n_pop_long, n_samples_long
# On most platforms `np.int_ is np.intp`. However, before NumPy 2 the
# default integer `np.int_` was a long which is 32bit on 64bit windows
# while `intp` is 64bit on 64bit platforms and 32bit on 32bit ones.
if np.int_ is np.intp:
# Branch always taken on NumPy >=2 (or when not on 64bit windows).
# Cython has different rules for conversion of values to integers.
# For NumPy <1.26.2 AND Cython 3, this first branch requires `int()`
# called explicitly to allow e.g. floats.
n_pop_intp = int(n_population)
n_samples_intp = int(n_samples)
return _sample_without_replacement(
n_pop_intp, n_samples_intp, method, random_state)
else:
# Branch taken on 64bit windows with Numpy<2.0 where `long` is 32bit
n_pop_long = n_population
n_samples_long = n_samples
return _sample_without_replacement(
n_pop_long, n_samples_long, method, random_state)
def _our_rand_r_py(seed):
"""Python utils to test the our_rand_r function"""
cdef uint32_t my_seed = seed
return our_rand_r(&my_seed)