Intelegentny_Pszczelarz/.venv/Lib/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py

import numpy as np
from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pytest

from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor
from sklearn.ensemble._hist_gradient_boosting.common import (
    G_H_DTYPE,
    PREDICTOR_RECORD_DTYPE,
    ALMOST_INF,
    X_BINNED_DTYPE,
    X_BITSET_INNER_DTYPE,
    X_DTYPE,
)
from sklearn.ensemble._hist_gradient_boosting._bitset import (
    set_bitset_memoryview,
    set_raw_bitset_from_binned_bitset,
)
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

# Thread count handed to every predict / predict_binned call in this module;
# it is computed once, at import time.
# NOTE(review): presumably the number of OpenMP threads scikit-learn would
# effectively use on this machine -- confirm against the helper's docs.
n_threads = _openmp_effective_n_threads()


@pytest.mark.parametrize("n_bins", [200, 256])
def test_regression_dataset(n_bins):
    # End-to-end check: grow one tree on binned training data, turn it into a
    # predictor and verify that predictions on the raw (un-binned) train and
    # test samples reach a minimum R^2 score.
    features, target = make_regression(
        n_samples=500, n_features=10, n_informative=5, random_state=42
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, random_state=42
    )

    bin_mapper = _BinMapper(n_bins=n_bins, random_state=42)
    binned_train = bin_mapper.fit_transform(X_train)

    # Least squares loss initialisation: the gradients are -y and the hessians
    # are supplied as a single-element array of ones.
    init_gradients = -y_train.astype(G_H_DTYPE)
    init_hessians = np.ones(1, dtype=G_H_DTYPE)

    tree_grower = TreeGrower(
        binned_train,
        init_gradients,
        init_hessians,
        min_samples_leaf=10,
        max_leaf_nodes=30,
        n_bins=n_bins,
        n_bins_non_missing=bin_mapper.n_bins_non_missing_,
    )
    tree_grower.grow()
    predictor = tree_grower.make_predictor(
        binning_thresholds=bin_mapper.bin_thresholds_
    )

    # Zero-row bitsets and an empty feature index map: no categorical
    # information is passed to predict.
    empty_known_cats = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    empty_f_idx_map = np.zeros(0, dtype=np.uint32)

    train_score = r2_score(
        y_train,
        predictor.predict(X_train, empty_known_cats, empty_f_idx_map, n_threads),
    )
    assert train_score > 0.82

    test_score = r2_score(
        y_test,
        predictor.predict(X_test, empty_known_cats, empty_f_idx_map, n_threads),
    )
    assert test_score > 0.67


@pytest.mark.parametrize(
    "num_threshold, expected_predictions",
    [
        (-np.inf, [0, 1, 1, 1]),
        (10, [0, 0, 1, 1]),
        (20, [0, 0, 0, 1]),
        (ALMOST_INF, [0, 0, 0, 1]),
        (np.inf, [0, 0, 0, 0]),
    ],
)
def test_infinite_values_and_thresholds(num_threshold, expected_predictions):
    # Infinite feature values and infinite thresholds must be routed
    # correctly. Notably, a +inf sample compared against an ALMOST_INF
    # threshold has to end up in the right child, whereas with a threshold of
    # inf (split on nan) that same +inf sample ends up in the left child.

    samples = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1)

    # Minimal tree: a root splitting on feature 0 plus two leaves. The left
    # leaf predicts 0 and the right leaf predicts 1, so each expected
    # prediction directly tells which side the sample was sent to.
    root, left_leaf, right_leaf = 0, 1, 2
    tree_nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)

    tree_nodes[root]["left"] = left_leaf
    tree_nodes[root]["right"] = right_leaf
    tree_nodes[root]["feature_idx"] = 0
    tree_nodes[root]["num_threshold"] = num_threshold

    for leaf_idx, leaf_value in ((left_leaf, 0), (right_leaf, 1)):
        tree_nodes[leaf_idx]["is_leaf"] = True
        tree_nodes[leaf_idx]["value"] = leaf_value

    # Purely numerical tree: every categorical bitset has zero rows and the
    # feature index map is empty.
    binned_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    raw_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    known_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    feature_idx_map = np.zeros(0, dtype=np.uint32)

    predictor = TreePredictor(tree_nodes, binned_bitsets, raw_bitsets)
    predictions = predictor.predict(
        samples, known_bitsets, feature_idx_map, n_threads
    )

    matches = predictions == expected_predictions
    assert np.all(matches)


@pytest.mark.parametrize(
    "bins_go_left, expected_predictions",
    [
        ([0, 3, 4, 6], [1, 0, 0, 1, 1, 0]),
        ([0, 1, 2, 6], [1, 1, 1, 0, 0, 0]),
        ([3, 5, 6], [0, 0, 0, 1, 0, 1]),
    ],
)
def test_categorical_predictor(bins_go_left, expected_predictions):
    # The predictor must give the right output for a categorical split, both
    # on binned data and on raw category values.

    # Bin index reserved for missing values in the binned representation.
    missing_bin = 6

    binned_samples = np.array([[0, 1, 2, 3, 4, 5]], dtype=X_BINNED_DTYPE).T
    categories = np.array([2, 5, 6, 8, 10, 15], dtype=X_DTYPE)

    left_bins = np.array(bins_go_left, dtype=X_BINNED_DTYPE)

    # Minimal tree: a categorical root on feature 0 that sends missing values
    # to the left, a left leaf predicting 1 and a right leaf predicting 0.
    tree_nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
    for field, setting in (
        ("left", 1),
        ("right", 2),
        ("feature_idx", 0),
        ("is_categorical", True),
        ("missing_go_to_left", True),
    ):
        tree_nodes[0][field] = setting

    for leaf_idx, leaf_value in ((1, 1), (2, 0)):
        tree_nodes[leaf_idx]["is_leaf"] = True
        tree_nodes[leaf_idx]["value"] = leaf_value

    # Flag every left-going bin in the binned bitset, then derive the bitset
    # on raw category values from it.
    binned_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE)
    raw_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE)
    for bin_idx in left_bins:
        set_bitset_memoryview(binned_bitsets[0], bin_idx)

    set_raw_bitset_from_binned_bitset(raw_bitsets[0], binned_bitsets[0], categories)

    predictor = TreePredictor(tree_nodes, binned_bitsets, raw_bitsets)

    # Predictions on binned data
    binned_predictions = predictor.predict_binned(
        binned_samples, missing_values_bin_idx=missing_bin, n_threads=n_threads
    )
    assert_allclose(binned_predictions, expected_predictions)

    # Hand-built bitset of known categories: bit c is set for each category c.
    # All categories are below 32, so only the first word is written.
    known_bitsets = np.zeros((1, 8), dtype=np.uint32)
    known_bitsets[0, 0] = np.sum(2**categories, dtype=np.uint32)
    feature_idx_map = np.array([0], dtype=np.uint32)

    # Predictions on un-binned data
    raw_predictions = predictor.predict(
        categories.reshape(-1, 1), known_bitsets, feature_idx_map, n_threads
    )
    assert_allclose(raw_predictions, expected_predictions)

    # A sample that sits in the missing-values bin is sent to the left leaf
    only_missing_binned = np.array([[missing_bin]], dtype=X_BINNED_DTYPE).T
    missing_predictions = predictor.predict_binned(
        only_missing_binned, missing_values_bin_idx=missing_bin, n_threads=n_threads
    )
    assert_allclose(missing_predictions, [1])

    # On raw data, a missing value (nan) and an unknown category (17) both go
    # left
    missing_and_unknown = np.array([[np.nan, 17]], dtype=X_DTYPE).T
    raw_missing_predictions = predictor.predict(
        missing_and_unknown,
        known_bitsets,
        feature_idx_map,
        n_threads,
    )
    assert_allclose(raw_missing_predictions, [1, 1])
feature: "ANN commit 2" 2023-06-19 00:49:18 +02:00			`import numpy as np`
			`from numpy.testing import assert_allclose`
			`from sklearn.datasets import make_regression`
			`from sklearn.model_selection import train_test_split`
			`from sklearn.metrics import r2_score`
			`import pytest`

			`from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper`
			`from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower`
			`from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor`
			`from sklearn.ensemble._hist_gradient_boosting.common import (`
			`G_H_DTYPE,`
			`PREDICTOR_RECORD_DTYPE,`
			`ALMOST_INF,`
			`X_BINNED_DTYPE,`
			`X_BITSET_INNER_DTYPE,`
			`X_DTYPE,`
			`)`
			`from sklearn.ensemble._hist_gradient_boosting._bitset import (`
			`set_bitset_memoryview,`
			`set_raw_bitset_from_binned_bitset,`
			`)`
			`from sklearn.utils._openmp_helpers import _openmp_effective_n_threads`

			`n_threads = _openmp_effective_n_threads()`


			`@pytest.mark.parametrize("n_bins", [200, 256])`
			`def test_regression_dataset(n_bins):`
			`X, y = make_regression(`
			`n_samples=500, n_features=10, n_informative=5, random_state=42`
			`)`
			`X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)`

			`mapper = _BinMapper(n_bins=n_bins, random_state=42)`
			`X_train_binned = mapper.fit_transform(X_train)`

			`# Init gradients and hessians to that of least squares loss`
			`gradients = -y_train.astype(G_H_DTYPE)`
			`hessians = np.ones(1, dtype=G_H_DTYPE)`

			`min_samples_leaf = 10`
			`max_leaf_nodes = 30`
			`grower = TreeGrower(`
			`X_train_binned,`
			`gradients,`
			`hessians,`
			`min_samples_leaf=min_samples_leaf,`
			`max_leaf_nodes=max_leaf_nodes,`
			`n_bins=n_bins,`
			`n_bins_non_missing=mapper.n_bins_non_missing_,`
			`)`
			`grower.grow()`

			`predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_)`

			`known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)`
			`f_idx_map = np.zeros(0, dtype=np.uint32)`

			`y_pred_train = predictor.predict(X_train, known_cat_bitsets, f_idx_map, n_threads)`
			`assert r2_score(y_train, y_pred_train) > 0.82`

			`y_pred_test = predictor.predict(X_test, known_cat_bitsets, f_idx_map, n_threads)`
			`assert r2_score(y_test, y_pred_test) > 0.67`


			`@pytest.mark.parametrize(`
			`"num_threshold, expected_predictions",`
			`[`
			`(-np.inf, [0, 1, 1, 1]),`
			`(10, [0, 0, 1, 1]),`
			`(20, [0, 0, 0, 1]),`
			`(ALMOST_INF, [0, 0, 0, 1]),`
			`(np.inf, [0, 0, 0, 0]),`
			`],`
			`)`
			`def test_infinite_values_and_thresholds(num_threshold, expected_predictions):`
			`# Make sure infinite values and infinite thresholds are handled properly.`
			`# In particular, if a value is +inf and the threshold is ALMOST_INF the`
			`# sample should go to the right child. If the threshold is inf (split on`
			`# nan), the +inf sample will go to the left child.`

			`X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1)`
			`nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)`

			`# We just construct a simple tree with 1 root and 2 children`
			`# parent node`
			`nodes[0]["left"] = 1`
			`nodes[0]["right"] = 2`
			`nodes[0]["feature_idx"] = 0`
			`nodes[0]["num_threshold"] = num_threshold`

			`# left child`
			`nodes[1]["is_leaf"] = True`
			`nodes[1]["value"] = 0`

			`# right child`
			`nodes[2]["is_leaf"] = True`
			`nodes[2]["value"] = 1`

			`binned_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)`
			`raw_categorical_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)`
			`known_cat_bitset = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)`
			`f_idx_map = np.zeros(0, dtype=np.uint32)`

			`predictor = TreePredictor(nodes, binned_cat_bitsets, raw_categorical_bitsets)`
			`predictions = predictor.predict(X, known_cat_bitset, f_idx_map, n_threads)`

			`assert np.all(predictions == expected_predictions)`


			`@pytest.mark.parametrize(`
			`"bins_go_left, expected_predictions",`
			`[`
			`([0, 3, 4, 6], [1, 0, 0, 1, 1, 0]),`
			`([0, 1, 2, 6], [1, 1, 1, 0, 0, 0]),`
			`([3, 5, 6], [0, 0, 0, 1, 0, 1]),`
			`],`
			`)`
			`def test_categorical_predictor(bins_go_left, expected_predictions):`
			`# Test predictor outputs are correct with categorical features`

			`X_binned = np.array([[0, 1, 2, 3, 4, 5]], dtype=X_BINNED_DTYPE).T`
			`categories = np.array([2, 5, 6, 8, 10, 15], dtype=X_DTYPE)`

			`bins_go_left = np.array(bins_go_left, dtype=X_BINNED_DTYPE)`

			`# We just construct a simple tree with 1 root and 2 children`
			`# parent node`
			`nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)`
			`nodes[0]["left"] = 1`
			`nodes[0]["right"] = 2`
			`nodes[0]["feature_idx"] = 0`
			`nodes[0]["is_categorical"] = True`
			`nodes[0]["missing_go_to_left"] = True`

			`# left child`
			`nodes[1]["is_leaf"] = True`
			`nodes[1]["value"] = 1`

			`# right child`
			`nodes[2]["is_leaf"] = True`
			`nodes[2]["value"] = 0`

			`binned_cat_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE)`
			`raw_categorical_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE)`
			`for go_left in bins_go_left:`
			`set_bitset_memoryview(binned_cat_bitsets[0], go_left)`

			`set_raw_bitset_from_binned_bitset(`
			`raw_categorical_bitsets[0], binned_cat_bitsets[0], categories`
			`)`

			`predictor = TreePredictor(nodes, binned_cat_bitsets, raw_categorical_bitsets)`

			`# Check binned data gives correct predictions`
			`prediction_binned = predictor.predict_binned(`
			`X_binned, missing_values_bin_idx=6, n_threads=n_threads`
			`)`
			`assert_allclose(prediction_binned, expected_predictions)`

			`# manually construct bitset`
			`known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32)`
			`known_cat_bitsets[0, 0] = np.sum(2**categories, dtype=np.uint32)`
			`f_idx_map = np.array([0], dtype=np.uint32)`

			`# Check with un-binned data`
			`predictions = predictor.predict(`
			`categories.reshape(-1, 1), known_cat_bitsets, f_idx_map, n_threads`
			`)`
			`assert_allclose(predictions, expected_predictions)`

			`# Check missing goes left because missing_values_bin_idx=6`
			`X_binned_missing = np.array([[6]], dtype=X_BINNED_DTYPE).T`
			`predictions = predictor.predict_binned(`
			`X_binned_missing, missing_values_bin_idx=6, n_threads=n_threads`
			`)`
			`assert_allclose(predictions, [1])`

			`# missing and unknown go left`
			`predictions = predictor.predict(`
			`np.array([[np.nan, 17]], dtype=X_DTYPE).T,`
			`known_cat_bitsets,`
			`f_idx_map,`
			`n_threads,`
			`)`
			`assert_allclose(predictions, [1, 1])`