"""This module contains utility routines."""

from ...base import is_classifier
from .binning import _BinMapper


def get_equivalent_estimator(estimator, lib="lightgbm", n_classes=None):
"""Return an unfitted estimator from another lib with matching hyperparams.
This utility function takes care of renaming the sklearn parameters into
their LightGBM, XGBoost or CatBoost equivalent parameters.
# unmapped XGB parameters:
# - min_samples_leaf
# - min_data_in_bin
# - min_split_gain (there is min_split_loss though?)
# unmapped Catboost parameters:
# max_leaves
# min_*
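
    Examples
    --------
    A minimal usage sketch (illustrative only; assumes lightgbm is installed
    and a binary classification problem):

    >>> from sklearn.ensemble import HistGradientBoostingClassifier
    >>> est = HistGradientBoostingClassifier(loss="log_loss", early_stopping=False)
    >>> lgbm = get_equivalent_estimator(est, lib="lightgbm", n_classes=2)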
"""
    if lib not in ("lightgbm", "xgboost", "catboost"):
        raise ValueError(
            "accepted libs are lightgbm, xgboost, and catboost. got {}".format(lib)
        )

    sklearn_params = estimator.get_params()

    if sklearn_params["loss"] == "auto":
        raise ValueError(
            "auto loss is not accepted. We need to know if "
            "the problem is binary or multiclass classification."
        )
    if sklearn_params["early_stopping"]:
        raise NotImplementedError("Early stopping should be deactivated.")

    lightgbm_loss_mapping = {
        "squared_error": "regression_l2",
        "absolute_error": "regression_l1",
        "log_loss": "binary" if n_classes == 2 else "multiclass",
        "gamma": "gamma",
        "poisson": "poisson",
    }

    lightgbm_params = {
        "objective": lightgbm_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "n_estimators": sklearn_params["max_iter"],
        "num_leaves": sklearn_params["max_leaf_nodes"],
        "max_depth": sklearn_params["max_depth"],
        "min_data_in_leaf": sklearn_params["min_samples_leaf"],
        "reg_lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "min_data_in_bin": 1,
        "min_sum_hessian_in_leaf": 1e-3,
        "min_split_gain": 0,
        "verbosity": 10 if sklearn_params["verbose"] else -10,
        "boost_from_average": True,
        "enable_bundle": False,  # also makes feature order consistent
        "subsample_for_bin": _BinMapper().subsample,
        "poisson_max_delta_step": 1e-12,
        "feature_fraction_bynode": sklearn_params["max_features"],
    }
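    # The hard-coded values above pin LightGBM-only regularizers so that both
    # libraries grow comparable trees: min_data_in_bin=1 and min_split_gain=0
    # effectively disable constraints that scikit-learn does not expose, and
    # min_sum_hessian_in_leaf=1e-3 presumably mirrors the grower's internal
    # min_hessian_to_split default.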
if sklearn_params["loss"] == "log_loss" and n_classes > 2:
# LightGBM multiplies hessians by 2 in multiclass loss.
lightgbm_params["min_sum_hessian_in_leaf"] *= 2
# LightGBM 3.0 introduced a different scaling of the hessian for the multiclass
# case.
# It is equivalent of scaling the learning rate.
# See https://github.com/microsoft/LightGBM/pull/3256.
if n_classes is not None:
lightgbm_params["learning_rate"] *= n_classes / (n_classes - 1)

    # XGB
    xgboost_loss_mapping = {
        "squared_error": "reg:linear",
        "absolute_error": "LEAST_ABSOLUTE_DEV_NOT_SUPPORTED",
        "log_loss": "reg:logistic" if n_classes == 2 else "multi:softmax",
        "gamma": "reg:gamma",
        "poisson": "count:poisson",
    }

    xgboost_params = {
        "tree_method": "hist",
        "grow_policy": "lossguide",  # so that we can set max_leaves
        "objective": xgboost_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "n_estimators": sklearn_params["max_iter"],
        "max_leaves": sklearn_params["max_leaf_nodes"],
        "max_depth": sklearn_params["max_depth"] or 0,
        "lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "min_child_weight": 1e-3,
        "verbosity": 2 if sklearn_params["verbose"] else 0,
        "silent": sklearn_params["verbose"] == 0,
        "n_jobs": -1,
        "colsample_bynode": sklearn_params["max_features"],
    }
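    # Note: under the lossguide grow policy, XGBoost treats max_depth=0 as
    # "no depth limit", which is why sklearn's max_depth=None is mapped to 0
    # above.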

    # Catboost
    catboost_loss_mapping = {
        "squared_error": "RMSE",
        # catboost does not support MAE when leaf_estimation_method is Newton
        "absolute_error": "LEAST_ABSOLUTE_DEV_NOT_SUPPORTED",
        "log_loss": "Logloss" if n_classes == 2 else "MultiClass",
        "gamma": None,
        "poisson": "Poisson",
    }

    catboost_params = {
        "loss_function": catboost_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "iterations": sklearn_params["max_iter"],
        "depth": sklearn_params["max_depth"],
        "reg_lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "feature_border_type": "Median",
        "leaf_estimation_method": "Newton",
        "verbose": bool(sklearn_params["verbose"]),
    }
if lib == "lightgbm":
from lightgbm import LGBMClassifier, LGBMRegressor
if is_classifier(estimator):
return LGBMClassifier(**lightgbm_params)
else:
return LGBMRegressor(**lightgbm_params)
elif lib == "xgboost":
from xgboost import XGBClassifier, XGBRegressor
if is_classifier(estimator):
return XGBClassifier(**xgboost_params)
else:
return XGBRegressor(**xgboost_params)
else:
from catboost import CatBoostClassifier, CatBoostRegressor
if is_classifier(estimator):
return CatBoostClassifier(**catboost_params)
else:
return CatBoostRegressor(**catboost_params)
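

# A minimal comparison sketch (illustrative; assumes lightgbm is installed and
# that this module is run with ``python -m`` so the relative imports resolve).
# It fits a scikit-learn estimator and its LightGBM equivalent on the same
# data and compares their predictions.
if __name__ == "__main__":
    import numpy as np

    from sklearn.datasets import make_regression
    from sklearn.ensemble import HistGradientBoostingRegressor

    X, y = make_regression(n_samples=200, n_features=5, random_state=0)
    sk_est = HistGradientBoostingRegressor(early_stopping=False, random_state=0)
    lgbm_est = get_equivalent_estimator(sk_est, lib="lightgbm")

    sk_pred = sk_est.fit(X, y).predict(X)
    lgbm_pred = lgbm_est.fit(X, y).predict(X)
    print("max abs prediction diff:", np.max(np.abs(sk_pred - lgbm_pred)))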