# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Fabian Pedregosa <fabian.pedregosa@inria.fr>
#
# License: BSD 3 clause

from math import log

import numpy as np
from scipy.linalg import pinvh
import pytest


from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_less
from sklearn.utils._testing import assert_raise_message
from sklearn.utils import check_random_state
from sklearn.linear_model import BayesianRidge, ARDRegression
from sklearn.linear_model import Ridge
from sklearn import datasets
from sklearn.utils.extmath import fast_logdet

diabetes = datasets.load_diabetes()


def test_n_iter():
    """Check value of n_iter."""
    X = np.array([[1], [2], [6], [8], [10]])
    y = np.array([1, 2, 6, 8, 10])
    clf = BayesianRidge(n_iter=0)
    msg = "n_iter should be greater than or equal to 1."
    assert_raise_message(ValueError, msg, clf.fit, X, y)


def test_bayesian_ridge_scores():
    """Check scores attribute shape"""
    X, y = diabetes.data, diabetes.target

    clf = BayesianRidge(compute_score=True)
    clf.fit(X, y)

    assert clf.scores_.shape == (clf.n_iter_ + 1,)


def test_bayesian_ridge_score_values():
    """Check value of score on toy example.

    Compute log marginal likelihood with equation (36) in Sparse Bayesian
    Learning and the Relevance Vector Machine (Tipping, 2001):

    - 0.5 * (log |Id/alpha + X.X^T/lambda| +
             y^T.(Id/alpha + X.X^T/lambda).y + n * log(2 * pi))
    + lambda_1 * log(lambda) - lambda_2 * lambda
    + alpha_1 * log(alpha) - alpha_2 * alpha

    and check equality with the score computed during training.
    """

    X, y = diabetes.data, diabetes.target
    n_samples = X.shape[0]
    # check with initial values of alpha and lambda (see code for the values)
    eps = np.finfo(np.float64).eps
    alpha_ = 1. / (np.var(y) + eps)
    lambda_ = 1.

    # value of the parameters of the Gamma hyperpriors
    alpha_1 = 0.1
    alpha_2 = 0.1
    lambda_1 = 0.1
    lambda_2 = 0.1

    # compute score using formula of docstring
    score = lambda_1 * log(lambda_) - lambda_2 * lambda_
    score += alpha_1 * log(alpha_) - alpha_2 * alpha_
    M = 1. / alpha_ * np.eye(n_samples) + 1. / lambda_ * np.dot(X, X.T)
    M_inv = pinvh(M)
    score += - 0.5 * (fast_logdet(M) + np.dot(y.T, np.dot(M_inv, y)) +
                      n_samples * log(2 * np.pi))

    # compute score with BayesianRidge
    clf = BayesianRidge(alpha_1=alpha_1, alpha_2=alpha_2,
                        lambda_1=lambda_1, lambda_2=lambda_2,
                        n_iter=1, fit_intercept=False, compute_score=True)
    clf.fit(X, y)

    assert_almost_equal(clf.scores_[0], score, decimal=9)


def test_bayesian_ridge_parameter():
    # Test correctness of lambda_ and alpha_ parameters (GitHub issue #8224)
    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]])
    y = np.array([1, 2, 3, 2, 0, 4, 5]).T

    # A Ridge regression model using an alpha value equal to the ratio of
    # lambda_ and alpha_ from the Bayesian Ridge model must be identical
    br_model = BayesianRidge(compute_score=True).fit(X, y)
    rr_model = Ridge(alpha=br_model.lambda_ / br_model.alpha_).fit(X, y)
    assert_array_almost_equal(rr_model.coef_, br_model.coef_)
    assert_almost_equal(rr_model.intercept_, br_model.intercept_)


def test_bayesian_sample_weights():
    # Test correctness of the sample_weights method
    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]])
    y = np.array([1, 2, 3, 2, 0, 4, 5]).T
    w = np.array([4, 3, 3, 1, 1, 2, 3]).T

    # A Ridge regression model using an alpha value equal to the ratio of
    # lambda_ and alpha_ from the Bayesian Ridge model must be identical
    br_model = BayesianRidge(compute_score=True).fit(X, y, sample_weight=w)
    rr_model = Ridge(alpha=br_model.lambda_ / br_model.alpha_).fit(
        X, y, sample_weight=w)
    assert_array_almost_equal(rr_model.coef_, br_model.coef_)
    assert_almost_equal(rr_model.intercept_, br_model.intercept_)


def test_toy_bayesian_ridge_object():
    # Test BayesianRidge on toy
    X = np.array([[1], [2], [6], [8], [10]])
    Y = np.array([1, 2, 6, 8, 10])
    clf = BayesianRidge(compute_score=True)
    clf.fit(X, Y)

    # Check that the model could approximately learn the identity function
    test = [[1], [3], [4]]
    assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2)


def test_bayesian_initial_params():
    # Test BayesianRidge with initial values (alpha_init, lambda_init)
    X = np.vander(np.linspace(0, 4, 5), 4)
    y = np.array([0., 1., 0., -1., 0.])    # y = (x^3 - 6x^2 + 8x) / 3

    # In this case, starting from the default initial values will increase
    # the bias of the fitted curve. So, lambda_init should be small.
    reg = BayesianRidge(alpha_init=1., lambda_init=1e-3)
    # Check the R2 score nearly equals to one.
    r2 = reg.fit(X, y).score(X, y)
    assert_almost_equal(r2, 1.)


def test_prediction_bayesian_ridge_ard_with_constant_input():
    # Test BayesianRidge and ARDRegression predictions for edge case of
    # constant target vectors
    n_samples = 4
    n_features = 5
    random_state = check_random_state(42)
    constant_value = random_state.rand()
    X = random_state.random_sample((n_samples, n_features))
    y = np.full(n_samples, constant_value,
                dtype=np.array(constant_value).dtype)
    expected = np.full(n_samples, constant_value,
                       dtype=np.array(constant_value).dtype)

    for clf in [BayesianRidge(), ARDRegression()]:
        y_pred = clf.fit(X, y).predict(X)
        assert_array_almost_equal(y_pred, expected)


def test_std_bayesian_ridge_ard_with_constant_input():
    # Test BayesianRidge and ARDRegression standard dev. for edge case of
    # constant target vector
    # The standard dev. should be relatively small (< 0.01 is tested here)
    n_samples = 10
    n_features = 5
    random_state = check_random_state(42)
    constant_value = random_state.rand()
    X = random_state.random_sample((n_samples, n_features))
    y = np.full(n_samples, constant_value,
                dtype=np.array(constant_value).dtype)
    expected_upper_boundary = 0.01

    for clf in [BayesianRidge(), ARDRegression()]:
        _, y_std = clf.fit(X, y).predict(X, return_std=True)
        assert_array_less(y_std, expected_upper_boundary)


def test_update_of_sigma_in_ard():
    # Checks that `sigma_` is updated correctly after the last iteration
    # of the ARDRegression algorithm. See issue #10128.
    X = np.array([[1, 0],
                  [0, 0]])
    y = np.array([0, 0])
    clf = ARDRegression(n_iter=1)
    clf.fit(X, y)
    # With the inputs above, ARDRegression prunes both of the two coefficients
    # in the first iteration. Hence, the expected shape of `sigma_` is (0, 0).
    assert clf.sigma_.shape == (0, 0)
    # Ensure that no error is thrown at prediction stage
    clf.predict(X, return_std=True)


def test_toy_ard_object():
    # Test BayesianRegression ARD classifier
    X = np.array([[1], [2], [3]])
    Y = np.array([1, 2, 3])
    clf = ARDRegression(compute_score=True)
    clf.fit(X, Y)

    # Check that the model could approximately learn the identity function
    test = [[1], [3], [4]]
    assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2)


@pytest.mark.parametrize('seed', range(100))
@pytest.mark.parametrize('n_samples, n_features', ((10, 100), (100, 10)))
def test_ard_accuracy_on_easy_problem(seed, n_samples, n_features):
    # Check that ARD converges with reasonable accuracy on an easy problem
    # (Github issue #14055)
    X = np.random.RandomState(seed=seed).normal(size=(250, 3))
    y = X[:, 1]

    regressor = ARDRegression()
    regressor.fit(X, y)

    abs_coef_error = np.abs(1 - regressor.coef_[1])
    assert abs_coef_error < 1e-10


def test_return_std():
    # Test return_std option for both Bayesian regressors
    def f(X):
        return np.dot(X, w) + b

    def f_noise(X, noise_mult):
        return f(X) + np.random.randn(X.shape[0]) * noise_mult

    d = 5
    n_train = 50
    n_test = 10

    w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
    b = 1.0

    X = np.random.random((n_train, d))
    X_test = np.random.random((n_test, d))

    for decimal, noise_mult in enumerate([1, 0.1, 0.01]):
        y = f_noise(X, noise_mult)

        m1 = BayesianRidge()
        m1.fit(X, y)
        y_mean1, y_std1 = m1.predict(X_test, return_std=True)
        assert_array_almost_equal(y_std1, noise_mult, decimal=decimal)

        m2 = ARDRegression()
        m2.fit(X, y)
        y_mean2, y_std2 = m2.predict(X_test, return_std=True)
        assert_array_almost_equal(y_std2, noise_mult, decimal=decimal)


@pytest.mark.parametrize('seed', range(10))
def test_update_sigma(seed):
    # make sure the two update_sigma() helpers are equivalent. The woodbury
    # formula is used when n_samples < n_features, and the other one is used
    # otherwise.

    rng = np.random.RandomState(seed)

    # set n_samples == n_features to avoid instability issues when inverting
    # the matrices. Using the woodbury formula would be unstable when
    # n_samples > n_features
    n_samples = n_features = 10
    X = rng.randn(n_samples, n_features)
    alpha = 1
    lmbda = np.arange(1, n_features + 1)
    keep_lambda = np.array([True] * n_features)

    reg = ARDRegression()

    sigma = reg._update_sigma(X, alpha, lmbda, keep_lambda)
    sigma_woodbury = reg._update_sigma_woodbury(X, alpha, lmbda, keep_lambda)

    np.testing.assert_allclose(sigma, sigma_woodbury)


def test_ard_regression_predict_normalize_true():
    """Check that we can predict with `normalize=True` and `return_std=True`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/18605
    """
    clf = ARDRegression(normalize=True)
    clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
    clf.predict([[1, 1]], return_std=True)