"""This module contains utility routines."""
|
||
|
|
||
|
from ...base import is_classifier
|
||
|
from .binning import _BinMapper
|
||
|
|
||
|
|
||
|
def get_equivalent_estimator(estimator, lib="lightgbm", n_classes=None):
    """Return an unfitted estimator from another lib with matching hyperparams.

    This utility function takes care of renaming the sklearn parameters into
    their LightGBM, XGBoost or CatBoost equivalent parameters.

    # unmapped XGB parameters:
    # - min_samples_leaf
    # - min_data_in_bin
    # - min_split_gain (there is min_split_loss though?)

    # unmapped CatBoost parameters:
    # - max_leaves
    # - min_*
    """

    if lib not in ("lightgbm", "xgboost", "catboost"):
        raise ValueError(
            "accepted libs are lightgbm, xgboost, and catboost. got {}".format(lib)
        )

    sklearn_params = estimator.get_params()

    if sklearn_params["loss"] == "auto":
        raise ValueError(
            "auto loss is not accepted. We need to know if "
            "the problem is binary or multiclass classification."
        )
    if sklearn_params["early_stopping"]:
        raise NotImplementedError("Early stopping should be deactivated.")

    lightgbm_loss_mapping = {
        "squared_error": "regression_l2",
        "absolute_error": "regression_l1",
        "log_loss": "binary" if n_classes == 2 else "multiclass",
        "gamma": "gamma",
        "poisson": "poisson",
    }

    lightgbm_params = {
        "objective": lightgbm_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "n_estimators": sklearn_params["max_iter"],
        "num_leaves": sklearn_params["max_leaf_nodes"],
        "max_depth": sklearn_params["max_depth"],
        "min_data_in_leaf": sklearn_params["min_samples_leaf"],
        "reg_lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "min_data_in_bin": 1,
        "min_sum_hessian_in_leaf": 1e-3,
        "min_split_gain": 0,
        "verbosity": 10 if sklearn_params["verbose"] else -10,
        "boost_from_average": True,
        "enable_bundle": False,  # also makes feature order consistent
        # use the same binning subsample size as sklearn's _BinMapper
        "subsample_for_bin": _BinMapper().subsample,
        "poisson_max_delta_step": 1e-12,
        "feature_fraction_bynode": sklearn_params["max_features"],
    }
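
    # The fixed values above (min_data_in_bin=1, min_sum_hessian_in_leaf=1e-3,
    # min_split_gain=0) pin LightGBM-only constraints that have no public
    # sklearn equivalent, so that they do not affect the comparison.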

    if sklearn_params["loss"] == "log_loss" and n_classes > 2:
        # LightGBM multiplies hessians by 2 in multiclass loss.
        lightgbm_params["min_sum_hessian_in_leaf"] *= 2
        # LightGBM 3.0 introduced a different scaling of the hessian for the
        # multiclass case. It is equivalent to scaling the learning rate.
        # See https://github.com/microsoft/LightGBM/pull/3256.
        if n_classes is not None:
            lightgbm_params["learning_rate"] *= n_classes / (n_classes - 1)

    # XGB
    xgboost_loss_mapping = {
        # "reg:linear" was renamed "reg:squarederror" in recent XGBoost
        "squared_error": "reg:linear",
        # deliberately invalid placeholder: absolute error is not mapped
        "absolute_error": "LEAST_ABSOLUTE_DEV_NOT_SUPPORTED",
        "log_loss": "reg:logistic" if n_classes == 2 else "multi:softmax",
        "gamma": "reg:gamma",
        "poisson": "count:poisson",
    }

    xgboost_params = {
        "tree_method": "hist",
        "grow_policy": "lossguide",  # so that we can set max_leaves
        "objective": xgboost_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "n_estimators": sklearn_params["max_iter"],
        "max_leaves": sklearn_params["max_leaf_nodes"],
        "max_depth": sklearn_params["max_depth"] or 0,  # 0 means no limit
        "lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "min_child_weight": 1e-3,
        "verbosity": 2 if sklearn_params["verbose"] else 0,
        "silent": sklearn_params["verbose"] == 0,
        "n_jobs": -1,
        "colsample_bynode": sklearn_params["max_features"],
    }

    # CatBoost
    catboost_loss_mapping = {
        "squared_error": "RMSE",
        # CatBoost does not support MAE when leaf_estimation_method is Newton
        "absolute_error": "LEAST_ABSOLUTE_DEV_NOT_SUPPORTED",
        "log_loss": "Logloss" if n_classes == 2 else "MultiClass",
        "gamma": None,  # no CatBoost equivalent
        "poisson": "Poisson",
    }

    catboost_params = {
        "loss_function": catboost_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "iterations": sklearn_params["max_iter"],
        "depth": sklearn_params["max_depth"],
        "reg_lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "feature_border_type": "Median",
        "leaf_estimation_method": "Newton",
        "verbose": bool(sklearn_params["verbose"]),
    }

    if lib == "lightgbm":
        from lightgbm import LGBMClassifier, LGBMRegressor

        if is_classifier(estimator):
            return LGBMClassifier(**lightgbm_params)
        else:
            return LGBMRegressor(**lightgbm_params)

    elif lib == "xgboost":
        from xgboost import XGBClassifier, XGBRegressor

        if is_classifier(estimator):
            return XGBClassifier(**xgboost_params)
        else:
            return XGBRegressor(**xgboost_params)

    else:
        from catboost import CatBoostClassifier, CatBoostRegressor

        if is_classifier(estimator):
            return CatBoostClassifier(**catboost_params)
        else:
            return CatBoostRegressor(**catboost_params)
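

# A sketch of how this helper is typically exercised in equivalence tests,
# assuming numpy and lightgbm are available; X, y and rtol are placeholder
# choices, not values mandated by this module:
#
#     import numpy as np
#     from sklearn.datasets import make_regression
#     from sklearn.ensemble import HistGradientBoostingRegressor
#
#     X, y = make_regression(random_state=0)
#     est_sklearn = HistGradientBoostingRegressor(early_stopping=False)
#     est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")
#     est_sklearn.fit(X, y)
#     est_lightgbm.fit(X, y)
#     np.testing.assert_allclose(
#         est_sklearn.predict(X), est_lightgbm.predict(X), rtol=1e-2
#     )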