# Authors: Gilles Louppe # Peter Prettenhofer # Arnaud Joly # Jacob Schreiber # Nelson Liu # # # License: BSD 3 clause from libc.stdlib cimport free from libc.stdlib cimport realloc from libc.math cimport log as ln from libc.math cimport isnan import numpy as np cimport numpy as cnp cnp.import_array() from ..utils._random cimport our_rand_r # ============================================================================= # Helper functions # ============================================================================= cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil: # sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython # 0.20.1 to crash. cdef size_t nbytes = nelems * sizeof(p[0][0]) if nbytes / sizeof(p[0][0]) != nelems: # Overflow in the multiplication raise MemoryError(f"could not allocate ({nelems} * {sizeof(p[0][0])}) bytes") cdef realloc_ptr tmp = realloc(p[0], nbytes) if tmp == NULL: raise MemoryError(f"could not allocate {nbytes} bytes") p[0] = tmp return 0 def _realloc_test(): # Helper for tests. Tries to allocate (-1) / 2 * sizeof(size_t) # bytes, which will always overflow. cdef intp_t* p = NULL safe_realloc(&p, (-1) / 2) if p != NULL: free(p) assert False cdef inline cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size): """Return copied data as 1D numpy array of intp's.""" cdef cnp.npy_intp shape[1] shape[0] = size return cnp.PyArray_SimpleNewFromData(1, shape, cnp.NPY_INTP, data).copy() cdef inline intp_t rand_int(intp_t low, intp_t high, uint32_t* random_state) noexcept nogil: """Generate a random integer in [low; end).""" return low + our_rand_r(random_state) % (high - low) cdef inline float64_t rand_uniform(float64_t low, float64_t high, uint32_t* random_state) noexcept nogil: """Generate a random float64_t in [low; high).""" return ((high - low) * our_rand_r(random_state) / RAND_R_MAX) + low cdef inline float64_t log(float64_t x) noexcept nogil: return ln(x) / ln(2.0) # ============================================================================= # WeightedPQueue data structure # ============================================================================= cdef class WeightedPQueue: """A priority queue class, always sorted in increasing order. Attributes ---------- capacity : intp_t The capacity of the priority queue. array_ptr : intp_t The water mark of the priority queue; the priority queue grows from left to right in the array ``array_``. ``array_ptr`` is always less than ``capacity``. array_ : WeightedPQueueRecord* The array of priority queue records. The minimum element is on the left at index 0, and the maximum element is on the right at index ``array_ptr-1``. """ def __cinit__(self, intp_t capacity): self.capacity = capacity self.array_ptr = 0 safe_realloc(&self.array_, capacity) def __dealloc__(self): free(self.array_) cdef int reset(self) except -1 nogil: """Reset the WeightedPQueue to its state at construction Return -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ self.array_ptr = 0 # Since safe_realloc can raise MemoryError, use `except -1` safe_realloc(&self.array_, self.capacity) return 0 cdef bint is_empty(self) noexcept nogil: return self.array_ptr <= 0 cdef intp_t size(self) noexcept nogil: return self.array_ptr cdef int push(self, float64_t data, float64_t weight) except -1 nogil: """Push record on the array. Return -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ cdef intp_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = NULL cdef intp_t i # Resize if capacity not sufficient if array_ptr >= self.capacity: self.capacity *= 2 # Since safe_realloc can raise MemoryError, use `except -1` safe_realloc(&self.array_, self.capacity) # Put element as last element of array array = self.array_ array[array_ptr].data = data array[array_ptr].weight = weight # bubble last element up according until it is sorted # in ascending order i = array_ptr while(i != 0 and array[i].data < array[i-1].data): array[i], array[i-1] = array[i-1], array[i] i -= 1 # Increase element count self.array_ptr = array_ptr + 1 return 0 cdef int remove(self, float64_t data, float64_t weight) noexcept nogil: """Remove a specific value/weight record from the array. Returns 0 if successful, -1 if record not found.""" cdef intp_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ cdef intp_t idx_to_remove = -1 cdef intp_t i if array_ptr <= 0: return -1 # find element to remove for i in range(array_ptr): if array[i].data == data and array[i].weight == weight: idx_to_remove = i break if idx_to_remove == -1: return -1 # shift the elements after the removed element # to the left. for i in range(idx_to_remove, array_ptr-1): array[i] = array[i+1] self.array_ptr = array_ptr - 1 return 0 cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil: """Remove the top (minimum) element from array. Returns 0 if successful, -1 if nothing to remove.""" cdef intp_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ cdef intp_t i if array_ptr <= 0: return -1 data[0] = array[0].data weight[0] = array[0].weight # shift the elements after the removed element # to the left. for i in range(0, array_ptr-1): array[i] = array[i+1] self.array_ptr = array_ptr - 1 return 0 cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil: """Write the top element from array to a pointer. Returns 0 if successful, -1 if nothing to write.""" cdef WeightedPQueueRecord* array = self.array_ if self.array_ptr <= 0: return -1 # Take first value data[0] = array[0].data weight[0] = array[0].weight return 0 cdef float64_t get_weight_from_index(self, intp_t index) noexcept nogil: """Given an index between [0,self.current_capacity], access the appropriate heap and return the requested weight""" cdef WeightedPQueueRecord* array = self.array_ # get weight at index return array[index].weight cdef float64_t get_value_from_index(self, intp_t index) noexcept nogil: """Given an index between [0,self.current_capacity], access the appropriate heap and return the requested value""" cdef WeightedPQueueRecord* array = self.array_ # get value at index return array[index].data # ============================================================================= # WeightedMedianCalculator data structure # ============================================================================= cdef class WeightedMedianCalculator: """A class to handle calculation of the weighted median from streams of data. To do so, it maintains a parameter ``k`` such that the sum of the weights in the range [0,k) is greater than or equal to half of the total weight. By minimizing the value of ``k`` that fulfills this constraint, calculating the median is done by either taking the value of the sample at index ``k-1`` of ``samples`` (samples[k-1].data) or the average of the samples at index ``k-1`` and ``k`` of ``samples`` ((samples[k-1] + samples[k]) / 2). Attributes ---------- initial_capacity : intp_t The initial capacity of the WeightedMedianCalculator. samples : WeightedPQueue Holds the samples (consisting of values and their weights) used in the weighted median calculation. total_weight : float64_t The sum of the weights of items in ``samples``. Represents the total weight of all samples used in the median calculation. k : intp_t Index used to calculate the median. sum_w_0_k : float64_t The sum of the weights from samples[0:k]. Used in the weighted median calculation; minimizing the value of ``k`` such that ``sum_w_0_k`` >= ``total_weight / 2`` provides a mechanism for calculating the median in constant time. """ def __cinit__(self, intp_t initial_capacity): self.initial_capacity = initial_capacity self.samples = WeightedPQueue(initial_capacity) self.total_weight = 0 self.k = 0 self.sum_w_0_k = 0 cdef intp_t size(self) noexcept nogil: """Return the number of samples in the WeightedMedianCalculator""" return self.samples.size() cdef int reset(self) except -1 nogil: """Reset the WeightedMedianCalculator to its state at construction Return -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ # samples.reset (WeightedPQueue.reset) uses safe_realloc, hence # except -1 self.samples.reset() self.total_weight = 0 self.k = 0 self.sum_w_0_k = 0 return 0 cdef int push(self, float64_t data, float64_t weight) except -1 nogil: """Push a value and its associated weight to the WeightedMedianCalculator Return -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ cdef int return_value cdef float64_t original_median = 0.0 if self.size() != 0: original_median = self.get_median() # samples.push (WeightedPQueue.push) uses safe_realloc, hence except -1 return_value = self.samples.push(data, weight) self.update_median_parameters_post_push(data, weight, original_median) return return_value cdef int update_median_parameters_post_push( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after an insertion""" # trivial case of one element. if self.size() == 1: self.k = 1 self.total_weight = weight self.sum_w_0_k = self.total_weight return 0 # get the original weighted median self.total_weight += weight if data < original_median: # inserting below the median, so increment k and # then update self.sum_w_0_k accordingly by adding # the weight that was added. self.k += 1 # update sum_w_0_k by adding the weight added self.sum_w_0_k += weight # minimize k such that sum(W[0:k]) >= total_weight / 2 # minimum value of k is 1 while(self.k > 1 and ((self.sum_w_0_k - self.samples.get_weight_from_index(self.k-1)) >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 if data >= original_median: # inserting above or at the median # minimize k such that sum(W[0:k]) >= total_weight / 2 while(self.k < self.samples.size() and (self.sum_w_0_k < self.total_weight / 2.0)): self.k += 1 self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 cdef int remove(self, float64_t data, float64_t weight) noexcept nogil: """Remove a value from the MedianHeap, removing it from consideration in the median calculation """ cdef int return_value cdef float64_t original_median = 0.0 if self.size() != 0: original_median = self.get_median() return_value = self.samples.remove(data, weight) self.update_median_parameters_post_remove(data, weight, original_median) return return_value cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil: """Pop a value from the MedianHeap, starting from the left and moving to the right. """ cdef int return_value cdef float64_t original_median = 0.0 if self.size() != 0: original_median = self.get_median() # no elements to pop if self.samples.size() == 0: return -1 return_value = self.samples.pop(data, weight) self.update_median_parameters_post_remove(data[0], weight[0], original_median) return return_value cdef int update_median_parameters_post_remove( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after a removal""" # reset parameters because it there are no elements if self.samples.size() == 0: self.k = 0 self.total_weight = 0 self.sum_w_0_k = 0 return 0 # trivial case of one element. if self.samples.size() == 1: self.k = 1 self.total_weight -= weight self.sum_w_0_k = self.total_weight return 0 # get the current weighted median self.total_weight -= weight if data < original_median: # removing below the median, so decrement k and # then update self.sum_w_0_k accordingly by subtracting # the removed weight self.k -= 1 # update sum_w_0_k by removing the weight at index k self.sum_w_0_k -= weight # minimize k such that sum(W[0:k]) >= total_weight / 2 # by incrementing k and updating sum_w_0_k accordingly # until the condition is met. while(self.k < self.samples.size() and (self.sum_w_0_k < self.total_weight / 2.0)): self.k += 1 self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 if data >= original_median: # removing above the median # minimize k such that sum(W[0:k]) >= total_weight / 2 while(self.k > 1 and ((self.sum_w_0_k - self.samples.get_weight_from_index(self.k-1)) >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 cdef float64_t get_median(self) noexcept nogil: """Write the median to a pointer, taking into account sample weights.""" if self.sum_w_0_k == (self.total_weight / 2.0): # split median return (self.samples.get_value_from_index(self.k) + self.samples.get_value_from_index(self.k-1)) / 2.0 if self.sum_w_0_k > (self.total_weight / 2.0): # whole median return self.samples.get_value_from_index(self.k-1) def _any_isnan_axis0(const float32_t[:, :] X): """Same as np.any(np.isnan(X), axis=0)""" cdef: intp_t i, j intp_t n_samples = X.shape[0] intp_t n_features = X.shape[1] unsigned char[::1] isnan_out = np.zeros(X.shape[1], dtype=np.bool_) with nogil: for i in range(n_samples): for j in range(n_features): if isnan_out[j]: continue if isnan(X[i, j]): isnan_out[j] = True break return np.asarray(isnan_out)