Inzynierka/Lib/site-packages/sklearn/tree/_splitter.pxd

# Authors: Gilles Louppe <g.louppe@gmail.com>
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
#          Brian Holt <bdholt1@gmail.com>
#          Joel Nothman <joel.nothman@gmail.com>
#          Arnaud Joly <arnaud.v.joly@gmail.com>
#          Jacob Schreiber <jmschreiber91@gmail.com>
#
# License: BSD 3 clause

# See _splitter.pyx for details.

from ._criterion cimport Criterion

from ._tree cimport DTYPE_t          # Type of X
from ._tree cimport DOUBLE_t         # Type of y, sample_weight
from ._tree cimport SIZE_t           # Type for indices and counters
from ._tree cimport INT32_t          # Signed 32 bit integer
from ._tree cimport UINT32_t         # Unsigned 32 bit integer

cdef struct SplitRecord:
    # Data to track sample split: which feature/threshold pair is used, where
    # the samples array is cut, and the impurity figures attached to the split.
    # A pointer to this record is what `Splitter.node_split` takes as `split`.
    SIZE_t feature         # Which feature to split on.
    SIZE_t pos             # Split samples array at the given position,
                           # i.e. count of samples below threshold for feature.
                           # pos is >= end if the node is a leaf.
    double threshold       # Threshold to split at.
    double improvement     # Impurity improvement given parent node.
    double impurity_left   # Impurity of the left split.
    double impurity_right  # Impurity of the right split.

cdef class Splitter:
    # The splitter searches in the input space for a feature and a threshold
    # to split the samples samples[start:end].
    #
    # The impurity computations are delegated to a criterion object.
    #
    # This is the declaration only; the implementation lives in _splitter.pyx.
    # Attributes marked `public` are readable and writable from Python; the
    # others are only reachable from Cython code.

    # Internal structures
    cdef public Criterion criterion      # Impurity criterion
    cdef public SIZE_t max_features      # Number of features to test
    cdef public SIZE_t min_samples_leaf  # Min samples in a leaf
    cdef public double min_weight_leaf   # Minimum weight in a leaf

    cdef object random_state             # Random state
    cdef UINT32_t rand_r_state           # sklearn_rand_r random number state

    cdef SIZE_t[::1] samples             # Sample indices in X, y
    cdef SIZE_t n_samples                # X.shape[0]
    cdef double weighted_n_samples       # Weighted number of samples
    cdef SIZE_t[::1] features            # Feature indices in X
    cdef SIZE_t[::1] constant_features   # Constant features indices
    cdef SIZE_t n_features               # X.shape[1]
    cdef DTYPE_t[::1] feature_values     # temp. array holding feature values

    cdef SIZE_t start                    # Start position for the current node
    cdef SIZE_t end                      # End position for the current node

    cdef const DOUBLE_t[:, ::1] y        # Read-only, C-contiguous 2-d view of y
    cdef const DOUBLE_t[:] sample_weight # Read-only 1-d view of the sample weights

    # The samples vector `samples` is maintained by the Splitter object such
    # that the samples contained in a node are contiguous. With this setting,
    # `node_split` reorganizes the node samples `samples[start:end]` in two
    # subsets `samples[start:pos]` and `samples[pos:end]`.

    # The 1-d `features` array of size n_features contains the feature
    # indices and allows fast sampling without replacement of features.

    # The 1-d `constant_features` array of size n_features holds in
    # `constant_features[:n_constant_features]` the feature ids with
    # constant values for all the samples that reached a specific node.
    # The value `n_constant_features` is given by the parent node to its
    # child nodes. The content of the range `[n_constant_features:]` is left
    # undefined, but preallocated for performance reasons.
    # This allows optimization with depth-based tree building.

    # Methods
    #
    # Conventions used in the signatures below:
    #   * `except -1` : a return value of -1 signals that a Python exception
    #                   was raised and must be propagated by the caller.
    #   * `nogil`     : the method may be called without holding the GIL.

    # Takes the input data: X as a generic Python object, plus read-only views
    # of y and sample_weight. Not declared `nogil`, so it is called with the
    # GIL held.
    # NOTE(review): presumably stores these on the instance and sets up the
    # index arrays declared above -- confirm in _splitter.pyx.
    cdef int init(
        self,
        object X,
        const DOUBLE_t[:, ::1] y,
        const DOUBLE_t[:] sample_weight
    ) except -1

    # Takes the [start, end) positions of a node within `samples`.
    # `weighted_n_node_samples` is passed by pointer.
    # NOTE(review): presumably an output receiving the node's weighted sample
    # count -- confirm in _splitter.pyx.
    cdef int node_reset(
        self,
        SIZE_t start,
        SIZE_t end,
        double* weighted_n_node_samples
    ) nogil except -1

    # Splits the current node (see the `samples` comment above for how the
    # node's samples are reorganized). `split` and `n_constant_features` are
    # passed by pointer.
    # NOTE(review): presumably `split` receives the chosen split and
    # `n_constant_features` is read and updated in place so the count can be
    # handed on to child nodes -- confirm in _splitter.pyx.
    cdef int node_split(
        self,
        double impurity,   # Impurity of the node
        SplitRecord* split,
        SIZE_t* n_constant_features
    ) nogil except -1

    # Returns nothing; `dest` is a caller-provided buffer.
    # NOTE(review): presumably the current node's value is written into `dest`;
    # its required size is not visible here -- confirm in _splitter.pyx.
    cdef void node_value(self, double* dest) nogil

    # Returns a double.
    # NOTE(review): presumably the impurity of the current node, obtained from
    # the criterion -- confirm in _splitter.pyx.
    cdef double node_impurity(self) nogil
first commit 2023-06-02 12:51:02 +02:00			`# Authors: Gilles Louppe <g.louppe@gmail.com>`
			`# Peter Prettenhofer <peter.prettenhofer@gmail.com>`
			`# Brian Holt <bdholt1@gmail.com>`
			`# Joel Nothman <joel.nothman@gmail.com>`
			`# Arnaud Joly <arnaud.v.joly@gmail.com>`
			`# Jacob Schreiber <jmschreiber91@gmail.com>`
			`#`
			`# License: BSD 3 clause`

			`# See _splitter.pyx for details.`

			`from ._criterion cimport Criterion`

			`from ._tree cimport DTYPE_t # Type of X`
			`from ._tree cimport DOUBLE_t # Type of y, sample_weight`
			`from ._tree cimport SIZE_t # Type for indices and counters`
			`from ._tree cimport INT32_t # Signed 32 bit integer`
			`from ._tree cimport UINT32_t # Unsigned 32 bit integer`

			`cdef struct SplitRecord:`
			`# Data to track sample split`
			`SIZE_t feature # Which feature to split on.`
			`SIZE_t pos # Split samples array at the given position,`
			`# i.e. count of samples below threshold for feature.`
			`# pos is >= end if the node is a leaf.`
			`double threshold # Threshold to split at.`
			`double improvement # Impurity improvement given parent node.`
			`double impurity_left # Impurity of the left split.`
			`double impurity_right # Impurity of the right split.`

			`cdef class Splitter:`
			`# The splitter searches in the input space for a feature and a threshold`
			`# to split the samples samples[start:end].`
			`#`
			`# The impurity computations are delegated to a criterion object.`

			`# Internal structures`
			`cdef public Criterion criterion # Impurity criterion`
			`cdef public SIZE_t max_features # Number of features to test`
			`cdef public SIZE_t min_samples_leaf # Min samples in a leaf`
			`cdef public double min_weight_leaf # Minimum weight in a leaf`

			`cdef object random_state # Random state`
			`cdef UINT32_t rand_r_state # sklearn_rand_r random number state`

			`cdef SIZE_t[::1] samples # Sample indices in X, y`
			`cdef SIZE_t n_samples # X.shape[0]`
			`cdef double weighted_n_samples # Weighted number of samples`
			`cdef SIZE_t[::1] features # Feature indices in X`
			`cdef SIZE_t[::1] constant_features # Constant features indices`
			`cdef SIZE_t n_features # X.shape[1]`
			`cdef DTYPE_t[::1] feature_values # temp. array holding feature values`

			`cdef SIZE_t start # Start position for the current node`
			`cdef SIZE_t end # End position for the current node`

			`cdef const DOUBLE_t[:, ::1] y`
			`cdef const DOUBLE_t[:] sample_weight`

			# The samples vector `samples` is maintained by the Splitter object such
			`# that the samples contained in a node are contiguous. With this setting,`
			# `node_split` reorganizes the node samples `samples[start:end]` in two
			# subsets `samples[start:pos]` and `samples[pos:end]`.

			# The 1-d `features` array of size n_features contains the features
			`# indices and allows fast sampling without replacement of features.`

			# The 1-d `constant_features` array of size n_features holds in
			# `constant_features[:n_constant_features]` the feature ids with
			`# constant values for all the samples that reached a specific node.`
			# The value `n_constant_features` is given by the parent node to its
			# child nodes. The content of the range `[n_constant_features:]` is left
			`# undefined, but preallocated for performance reasons`
			`# This allows optimization with depth-based tree building.`

			`# Methods`
			`cdef int init(`
			`self,`
			`object X,`
			`const DOUBLE_t[:, ::1] y,`
			`const DOUBLE_t[:] sample_weight`
			`) except -1`

			`cdef int node_reset(`
			`self,`
			`SIZE_t start,`
			`SIZE_t end,`
			`double* weighted_n_node_samples`
			`) nogil except -1`

			`cdef int node_split(`
			`self,`
			`double impurity, # Impurity of the node`
			`SplitRecord* split,`
			`SIZE_t* n_constant_features`
			`) nogil except -1`

			`cdef void node_value(self, double* dest) nogil`

			`cdef double node_impurity(self) nogil`