# Authors: Christian Lorentzen
#
# License: BSD 3 clause

import numpy as np
from numpy.testing import assert_allclose
import pytest
import warnings

from sklearn.datasets import make_regression
from sklearn.linear_model._glm import GeneralizedLinearRegressor
from sklearn.linear_model import (
    TweedieRegressor,
    PoissonRegressor,
    GammaRegressor,
)
from sklearn.linear_model._glm.link import (
    IdentityLink,
    LogLink,
)
from sklearn._loss.glm_distribution import (
    TweedieDistribution,
    NormalDistribution,
    PoissonDistribution,
    GammaDistribution,
    InverseGaussianDistribution,
)
from sklearn.linear_model import Ridge
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split


@pytest.fixture(scope="module")
def regression_data():
    X, y = make_regression(n_samples=107, n_features=10, n_informative=80,
                           noise=0.5, random_state=2)
    return X, y


def test_sample_weights_validation():
    """Test the raised errors in the validation of sample_weight."""
    X = [[1]]
    y = [1]
    glm = GeneralizedLinearRegressor()

    # positive scalar weights are accepted
    glm.fit(X, y, sample_weight=1)

    # 2d array
    weights = [[0]]
    with pytest.raises(ValueError, match="must be 1D array or scalar"):
        glm.fit(X, y, weights)

    # 1d but wrong length
    weights = [1, 0]
    msg = r"sample_weight.shape == \(2,\), expected \(1,\)!"
    with pytest.raises(ValueError, match=msg):
        glm.fit(X, y, weights)


@pytest.mark.parametrize('name, instance',
                         [('normal', NormalDistribution()),
                          ('poisson', PoissonDistribution()),
                          ('gamma', GammaDistribution()),
                          ('inverse-gaussian', InverseGaussianDistribution())])
def test_glm_family_argument(name, instance):
    """Test GLM family argument set as string."""
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(family=name, alpha=0).fit(X, y)
    assert isinstance(glm._family_instance, instance.__class__)

    glm = GeneralizedLinearRegressor(family='not a family')
    with pytest.raises(ValueError, match="family must be"):
        glm.fit(X, y)


@pytest.mark.parametrize('name, instance',
                         [('identity', IdentityLink()),
                          ('log', LogLink())])
def test_glm_link_argument(name, instance):
    """Test GLM link argument set as string."""
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(family='normal', link=name).fit(X, y)
    assert isinstance(glm._link_instance, instance.__class__)

    glm = GeneralizedLinearRegressor(family='normal', link='not a link')
    with pytest.raises(ValueError, match="link must be"):
        glm.fit(X, y)


@pytest.mark.parametrize('family, expected_link_class', [
    ('normal', IdentityLink),
    ('poisson', LogLink),
    ('gamma', LogLink),
    ('inverse-gaussian', LogLink),
])
def test_glm_link_auto(family, expected_link_class):
    # Make sure link='auto' delivers the expected link function
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(family=family, link='auto').fit(X, y)
    assert isinstance(glm._link_instance, expected_link_class)


@pytest.mark.parametrize('alpha', ['not a number', -4.2])
def test_glm_alpha_argument(alpha):
    """Test GLM for invalid alpha argument."""
    y = np.array([1, 2])
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(family='normal', alpha=alpha)
    with pytest.raises(ValueError,
                       match="Penalty term must be a non-negative"):
        glm.fit(X, y)


@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]])
def test_glm_fit_intercept_argument(fit_intercept):
    """Test GLM for invalid fit_intercept argument."""
    y = np.array([1, 2])
    X = np.array([[1], [1]])
    glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept)
    with pytest.raises(ValueError, match="fit_intercept must be bool"):
        glm.fit(X, y)
@pytest.mark.parametrize('solver', ['not a solver', 1, [1]])
def test_glm_solver_argument(solver):
    """Test GLM for invalid solver argument."""
    y = np.array([1, 2])
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(solver=solver)
    with pytest.raises(ValueError):
        glm.fit(X, y)


@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]])
def test_glm_max_iter_argument(max_iter):
    """Test GLM for invalid max_iter argument."""
    y = np.array([1, 2])
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(max_iter=max_iter)
    with pytest.raises(ValueError, match="must be a positive integer"):
        glm.fit(X, y)


@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]])
def test_glm_tol_argument(tol):
    """Test GLM for invalid tol argument."""
    y = np.array([1, 2])
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(tol=tol)
    with pytest.raises(ValueError, match="stopping criteria must be positive"):
        glm.fit(X, y)


@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]])
def test_glm_warm_start_argument(warm_start):
    """Test GLM for invalid warm_start argument."""
    y = np.array([1, 2])
    X = np.array([[1], [1]])
    glm = GeneralizedLinearRegressor(warm_start=warm_start)
    with pytest.raises(ValueError, match="warm_start must be bool"):
        glm.fit(X, y)


@pytest.mark.parametrize('fit_intercept', [False, True])
def test_glm_identity_regression(fit_intercept):
    """Test GLM regression with identity link on a simple dataset."""
    coef = [1., 2.]
    X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
    y = np.dot(X, coef)
    glm = GeneralizedLinearRegressor(alpha=0, family='normal',
                                     link='identity',
                                     fit_intercept=fit_intercept, tol=1e-12)
    if fit_intercept:
        glm.fit(X[:, 1:], y)
        assert_allclose(glm.coef_, coef[1:], rtol=1e-10)
        assert_allclose(glm.intercept_, coef[0], rtol=1e-10)
    else:
        glm.fit(X, y)
        assert_allclose(glm.coef_, coef, rtol=1e-12)
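
# With alpha=0, family='normal' and link='identity', the GLM objective reduces
# to ordinary least squares, so the fit should agree with LinearRegression up
# to solver tolerance. This sketch is an illustrative addition (not part of
# the original suite) and uses only public scikit-learn APIs; the tolerances
# are assumptions.
def test_glm_normal_identity_matches_ols():
    from sklearn.linear_model import LinearRegression

    rng = np.random.RandomState(0)
    X = rng.rand(20, 3)
    y = rng.rand(20)
    glm = GeneralizedLinearRegressor(alpha=0, family='normal',
                                     link='identity', tol=1e-12).fit(X, y)
    ols = LinearRegression().fit(X, y)
    assert_allclose(glm.coef_, ols.coef_, rtol=1e-5, atol=1e-6)
    assert_allclose(glm.intercept_, ols.intercept_, rtol=1e-5, atol=1e-6)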
@pytest.mark.parametrize('fit_intercept', [False, True])
@pytest.mark.parametrize('alpha', [0.0, 1.0])
@pytest.mark.parametrize('family', ['normal', 'poisson', 'gamma'])
def test_glm_sample_weight_consistency(fit_intercept, alpha, family):
    """Test that the impact of sample_weight is consistent."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 5

    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    glm_params = dict(alpha=alpha, family=family, link='auto',
                      fit_intercept=fit_intercept)

    glm = GeneralizedLinearRegressor(**glm_params).fit(X, y)
    coef = glm.coef_.copy()

    # sample_weight=np.ones(..) should be equivalent to sample_weight=None
    sample_weight = np.ones(y.shape)
    glm.fit(X, y, sample_weight=sample_weight)
    assert_allclose(glm.coef_, coef, rtol=1e-12)

    # sample_weight is normalized internally, so scaling it has no effect
    sample_weight = 2 * np.ones(y.shape)
    glm.fit(X, y, sample_weight=sample_weight)
    assert_allclose(glm.coef_, coef, rtol=1e-12)

    # setting one element of sample_weight to 0 is equivalent to removing
    # the corresponding sample
    sample_weight = np.ones(y.shape)
    sample_weight[-1] = 0
    glm.fit(X, y, sample_weight=sample_weight)
    coef1 = glm.coef_.copy()
    glm.fit(X[:-1], y[:-1])
    assert_allclose(glm.coef_, coef1, rtol=1e-12)

    # check that multiplying sample_weight by 2 is equivalent
    # to repeating the corresponding samples twice
    X2 = np.concatenate([X, X[:n_samples // 2]], axis=0)
    y2 = np.concatenate([y, y[:n_samples // 2]])
    sample_weight_1 = np.ones(len(y))
    sample_weight_1[:n_samples // 2] = 2

    glm1 = GeneralizedLinearRegressor(**glm_params).fit(
        X, y, sample_weight=sample_weight_1
    )

    glm2 = GeneralizedLinearRegressor(**glm_params).fit(
        X2, y2, sample_weight=None
    )
    assert_allclose(glm1.coef_, glm2.coef_)


@pytest.mark.parametrize('fit_intercept', [True, False])
@pytest.mark.parametrize(
    'family',
    [NormalDistribution(), PoissonDistribution(),
     GammaDistribution(), InverseGaussianDistribution(),
     TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)])
def test_glm_log_regression(fit_intercept, family):
    """Test GLM regression with log link on a simple dataset."""
    coef = [0.2, -0.1]
    X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
    y = np.exp(np.dot(X, coef))
    glm = GeneralizedLinearRegressor(
        alpha=0, family=family, link='log',
        fit_intercept=fit_intercept, tol=1e-7)
    if fit_intercept:
        res = glm.fit(X[:, 1:], y)
        assert_allclose(res.coef_, coef[1:], rtol=1e-6)
        assert_allclose(res.intercept_, coef[0], rtol=1e-6)
    else:
        res = glm.fit(X, y)
        assert_allclose(res.coef_, coef, rtol=2e-6)


@pytest.mark.parametrize('fit_intercept', [True, False])
def test_warm_start(fit_intercept):
    n_samples, n_features = 110, 10
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           n_informative=n_features - 2, noise=0.5,
                           random_state=42)

    glm1 = GeneralizedLinearRegressor(
        warm_start=False,
        fit_intercept=fit_intercept,
        max_iter=1000
    )
    glm1.fit(X, y)

    glm2 = GeneralizedLinearRegressor(
        warm_start=True,
        fit_intercept=fit_intercept,
        max_iter=1
    )
    # As we intentionally set max_iter=1, L-BFGS-B will issue a
    # ConvergenceWarning which we here simply ignore.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        glm2.fit(X, y)
    assert glm1.score(X, y) > glm2.score(X, y)

    glm2.set_params(max_iter=1000)
    glm2.fit(X, y)
    # The two models are not exactly identical since the lbfgs solver
    # computes the approximate Hessian from previous iterations, which
    # will not be strictly identical in the case of a warm start.
    assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5)
    assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4)
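
# Note on the rescaling of alpha in the comparison below:
# GeneralizedLinearRegressor minimizes
#     1/(2 * sum(sw)) * deviance(y, mu; sw) + alpha/2 * ||w||_2^2
# while Ridge minimizes
#     ||y - X w||_2^2 + alpha_ridge * ||w||_2^2.
# For the normal family the deviance is the (weighted) squared error, so both
# objectives share the same minimizer when alpha_ridge = alpha * sum(sw),
# i.e. alpha * n_samples for unit weights.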
@pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)])
@pytest.mark.parametrize('fit_intercept', [True, False])
@pytest.mark.parametrize('sample_weight', [None, True])
def test_normal_ridge_comparison(n_samples, n_features, fit_intercept,
                                 sample_weight):
    """Compare with Ridge regression for Normal distributions."""
    test_size = 10
    X, y = make_regression(n_samples=n_samples + test_size,
                           n_features=n_features,
                           n_informative=n_features - 2, noise=0.5,
                           random_state=42)

    if n_samples > n_features:
        ridge_params = {"solver": "svd"}
    else:
        ridge_params = {"solver": "saga", "max_iter": 1000000, "tol": 1e-7}

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )

    alpha = 1.0
    if sample_weight is None:
        sw_train = None
        alpha_ridge = alpha * n_samples
    else:
        sw_train = np.random.RandomState(0).rand(len(y_train))
        alpha_ridge = alpha * sw_train.sum()

    # GLM has 1/(2*n) * Loss + 1/2 * L2, Ridge has Loss + L2
    ridge = Ridge(alpha=alpha_ridge, normalize=False, random_state=42,
                  fit_intercept=fit_intercept, **ridge_params)
    ridge.fit(X_train, y_train, sample_weight=sw_train)

    glm = GeneralizedLinearRegressor(alpha=alpha, family='normal',
                                     link='identity',
                                     fit_intercept=fit_intercept,
                                     max_iter=300, tol=1e-5)
    glm.fit(X_train, y_train, sample_weight=sw_train)
    assert glm.coef_.shape == (X.shape[1],)
    assert_allclose(glm.coef_, ridge.coef_, atol=5e-5)
    assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5)
    assert_allclose(glm.predict(X_train), ridge.predict(X_train), rtol=2e-4)
    assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=2e-4)


def test_poisson_glmnet():
    """Compare Poisson regression with L2 regularization and LogLink
    to glmnet.
    """
    # library("glmnet")
    # options(digits=10)
    # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2))
    # x <- data.matrix(df[,c("a", "b")])
    # y <- df$y
    # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson",
    #               standardize=F, thresh=1e-10, nlambda=10000)
    # coef(fit, s=1)
    # (Intercept) -0.12889386979
    # a            0.29019207995
    # b            0.03741173122
    X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T
    y = np.array([0, 1, 1, 2])
    glm = GeneralizedLinearRegressor(alpha=1,
                                     fit_intercept=True, family='poisson',
                                     link='log', tol=1e-7,
                                     max_iter=300)
    glm.fit(X, y)
    assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5)
    assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5)


def test_convergence_warning(regression_data):
    X, y = regression_data

    est = GeneralizedLinearRegressor(max_iter=1, tol=1e-20)
    with pytest.warns(ConvergenceWarning):
        est.fit(X, y)


def test_poisson_regression_family(regression_data):
    # Make sure the family attribute is read-only to prevent searching over it
    # e.g. in a grid search
    est = PoissonRegressor()
    assert est.family == "poisson"

    msg = "PoissonRegressor.family must be 'poisson'!"
    with pytest.raises(ValueError, match=msg):
        est.family = 0


def test_gamma_regression_family(regression_data):
    # Make sure the family attribute is read-only to prevent searching over it
    # e.g. in a grid search
    est = GammaRegressor()
    assert est.family == "gamma"

    msg = "GammaRegressor.family must be 'gamma'!"
    with pytest.raises(ValueError, match=msg):
        est.family = 0


def test_tweedie_regression_family(regression_data):
    # Make sure the family attribute is always a TweedieDistribution and that
    # the power attribute is properly updated
    power = 2.0
    est = TweedieRegressor(power=power)
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == power
    assert est.power == power

    new_power = 0
    new_family = TweedieDistribution(power=new_power)
    est.family = new_family
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == new_power
    assert est.power == new_power

    msg = "TweedieRegressor.family must be of type TweedieDistribution!"
    with pytest.raises(TypeError, match=msg):
        est.family = None


@pytest.mark.parametrize(
    'estimator, value',
    [
        (PoissonRegressor(), True),
        (GammaRegressor(), True),
        (TweedieRegressor(power=1.5), True),
        (TweedieRegressor(power=0), False),
    ],
)
def test_tags(estimator, value):
    # Tweedie with power=0 is the normal distribution, which allows negative
    # targets; the other families require positive y.
    assert estimator._get_tags()['requires_positive_y'] is value
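
# PoissonRegressor is a convenience estimator that should behave like
# GeneralizedLinearRegressor(family='poisson', link='log'). The following
# sketch double-checks that equivalence on the small glmnet dataset used
# above; it is an illustrative addition, and the tolerance is an assumption
# rather than a value taken from the original suite.
def test_poisson_regressor_matches_glm_poisson():
    X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T
    y = np.array([0, 1, 1, 2])
    glm = GeneralizedLinearRegressor(alpha=1, family='poisson', link='log',
                                     tol=1e-7).fit(X, y)
    pois = PoissonRegressor(alpha=1, tol=1e-7).fit(X, y)
    assert_allclose(pois.coef_, glm.coef_, rtol=1e-6)
    assert_allclose(pois.intercept_, glm.intercept_, rtol=1e-6)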