111 lines
4.6 KiB
Cython
111 lines
4.6 KiB
Cython
# Authors: Gilles Louppe <g.louppe@gmail.com>
|
|
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
|
|
# Brian Holt <bdholt1@gmail.com>
|
|
# Joel Nothman <joel.nothman@gmail.com>
|
|
# Arnaud Joly <arnaud.v.joly@gmail.com>
|
|
# Jacob Schreiber <jmschreiber91@gmail.com>
|
|
#
|
|
# License: BSD 3 clause
|
|
|
|
# See _splitter.pyx for details.
|
|
from ._criterion cimport Criterion
|
|
from ._tree cimport ParentInfo
|
|
|
|
from ..utils._typedefs cimport float32_t, float64_t, intp_t, int8_t, int32_t, uint32_t
|
|
|
|
|
|
cdef struct SplitRecord:
|
|
# Data to track sample split
|
|
intp_t feature # Which feature to split on.
|
|
intp_t pos # Split samples array at the given position,
|
|
# # i.e. count of samples below threshold for feature.
|
|
# # pos is >= end if the node is a leaf.
|
|
float64_t threshold # Threshold to split at.
|
|
float64_t improvement # Impurity improvement given parent node.
|
|
float64_t impurity_left # Impurity of the left split.
|
|
float64_t impurity_right # Impurity of the right split.
|
|
float64_t lower_bound # Lower bound on value of both children for monotonicity
|
|
float64_t upper_bound # Upper bound on value of both children for monotonicity
|
|
unsigned char missing_go_to_left # Controls if missing values go to the left node.
|
|
intp_t n_missing # Number of missing values for the feature being split on
|
|
|
|
cdef class Splitter:
|
|
# The splitter searches in the input space for a feature and a threshold
|
|
# to split the samples samples[start:end].
|
|
#
|
|
# The impurity computations are delegated to a criterion object.
|
|
|
|
# Internal structures
|
|
cdef public Criterion criterion # Impurity criterion
|
|
cdef public intp_t max_features # Number of features to test
|
|
cdef public intp_t min_samples_leaf # Min samples in a leaf
|
|
cdef public float64_t min_weight_leaf # Minimum weight in a leaf
|
|
|
|
cdef object random_state # Random state
|
|
cdef uint32_t rand_r_state # sklearn_rand_r random number state
|
|
|
|
cdef intp_t[::1] samples # Sample indices in X, y
|
|
cdef intp_t n_samples # X.shape[0]
|
|
cdef float64_t weighted_n_samples # Weighted number of samples
|
|
cdef intp_t[::1] features # Feature indices in X
|
|
cdef intp_t[::1] constant_features # Constant features indices
|
|
cdef intp_t n_features # X.shape[1]
|
|
cdef float32_t[::1] feature_values # temp. array holding feature values
|
|
|
|
cdef intp_t start # Start position for the current node
|
|
cdef intp_t end # End position for the current node
|
|
|
|
cdef const float64_t[:, ::1] y
|
|
# Monotonicity constraints for each feature.
|
|
# The encoding is as follows:
|
|
# -1: monotonic decrease
|
|
# 0: no constraint
|
|
# +1: monotonic increase
|
|
cdef const int8_t[:] monotonic_cst
|
|
cdef bint with_monotonic_cst
|
|
cdef const float64_t[:] sample_weight
|
|
|
|
# The samples vector `samples` is maintained by the Splitter object such
|
|
# that the samples contained in a node are contiguous. With this setting,
|
|
# `node_split` reorganizes the node samples `samples[start:end]` in two
|
|
# subsets `samples[start:pos]` and `samples[pos:end]`.
|
|
|
|
# The 1-d `features` array of size n_features contains the features
|
|
# indices and allows fast sampling without replacement of features.
|
|
|
|
# The 1-d `constant_features` array of size n_features holds in
|
|
# `constant_features[:n_constant_features]` the feature ids with
|
|
# constant values for all the samples that reached a specific node.
|
|
# The value `n_constant_features` is given by the parent node to its
|
|
# child nodes. The content of the range `[n_constant_features:]` is left
|
|
# undefined, but preallocated for performance reasons
|
|
# This allows optimization with depth-based tree building.
|
|
|
|
# Methods
|
|
cdef int init(
|
|
self,
|
|
object X,
|
|
const float64_t[:, ::1] y,
|
|
const float64_t[:] sample_weight,
|
|
const unsigned char[::1] missing_values_in_feature_mask,
|
|
) except -1
|
|
|
|
cdef int node_reset(
|
|
self,
|
|
intp_t start,
|
|
intp_t end,
|
|
float64_t* weighted_n_node_samples
|
|
) except -1 nogil
|
|
|
|
cdef int node_split(
|
|
self,
|
|
ParentInfo* parent,
|
|
SplitRecord* split,
|
|
) except -1 nogil
|
|
|
|
cdef void node_value(self, float64_t* dest) noexcept nogil
|
|
|
|
cdef void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil
|
|
|
|
cdef float64_t node_impurity(self) noexcept nogil
|