# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# Implementation of a Multi-layer Perceptron in NumPy
# Author: Sebastian Raschka
#
# License: BSD 3 clause

import numpy as np
from time import time
from scipy.special import expit

from .._base import _BaseModel
from .._base import _IterativeModel
from .._base import _MultiClass
from .._base import _MultiLayer
from .._base import _Classifier


class MultiLayerPerceptron(_BaseModel, _IterativeModel,
                           _MultiClass, _MultiLayer, _Classifier):

    """Multi-layer perceptron classifier with logistic sigmoid activations

    Parameters
    ------------
    eta : float (default: 0.5)
        Learning rate (between 0.0 and 1.0)
    epochs : int (default: 50)
        Passes over the training dataset.
        Prior to each epoch, the dataset is shuffled
        if `minibatches > 1` to prevent cycles in stochastic gradient descent.
    hidden_layers : list (default: [50])
        Number of units per hidden layer. By default 50 units in the
        first hidden layer. At the moment only 1 hidden layer is supported
    n_classes : int (default: None)
        A positive integer to declare the number of class labels
        if not all class labels are present in a partial training set.
        Gets the number of class labels automatically if None.
    l1 : float (default: 0.0)
        L1 regularization strength
    l2 : float (default: 0.0)
        L2 regularization strength
    momentum : float (default: 0.0)
        Momentum constant. Factor multiplied with the
        gradient of the previous epoch t-1 to improve
        learning speed
        w(t) := w(t) - (grad(t) + momentum * grad(t-1))
    decrease_const : float (default: 0.0)
        Decrease constant. Shrinks the learning rate after each epoch
        via eta / (1 + epoch*decrease_const)
    minibatches : int (default: 1)
        Divide the training data into *k* minibatches
        for accelerated stochastic gradient descent learning.
        Gradient Descent Learning if `minibatches` = 1
        Stochastic Gradient Descent learning if `minibatches` = len(y)
        Minibatch learning if `minibatches` > 1
    random_seed : int (default: None)
        Set random state for shuffling and initializing the weights.
    print_progress : int (default: 0)
        Prints progress in fitting to stderr.
        0: No output
        1: Epochs elapsed and cost
        2: 1 plus time elapsed
        3: 2 plus estimated time until completion

    Attributes
    -----------
    w_ : 2d-array, shape=[n_features, n_classes]
        Weights after fitting.
    b_ : 1D-array, shape=[n_classes]
        Bias units after fitting.
    cost_ : list
        List of floats; the mean categorical cross entropy
        cost after each epoch.
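
    Notes
    ------
    As a rough sketch of the model this class fits (using the private
    methods defined further below), ``_feedforward`` computes

        act_1   = sigmoid(np.dot(X, w_['1']) + b_['1'])
        act_out = softmax(np.dot(act_1, w_['out']) + b_['out'])

    and ``_fit`` adjusts the weights via (mini)batch gradient descent on the
    (L1/L2-regularized) mean categorical cross entropy of ``act_out``.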

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/classifier/MultiLayerPerceptron/

    """
    def __init__(self, eta=0.5, epochs=50,
                 hidden_layers=[50],
                 n_classes=None,
                 momentum=0.0, l1=0.0, l2=0.0,
                 dropout=1.0,
                 decrease_const=0.0,
                 minibatches=1, random_seed=None,
                 print_progress=0):

        _BaseModel.__init__(self)
        _Classifier.__init__(self)
        _IterativeModel.__init__(self)
        _MultiClass.__init__(self)
        _MultiLayer.__init__(self)

        if len(hidden_layers) > 1:
            raise AttributeError('Currently, only 1 hidden layer is '
                                 'supported')
        self.hidden_layers = hidden_layers
        self.eta = eta
        self.n_classes = n_classes
        self.l1 = l1
        self.l2 = l2
        self.decrease_const = decrease_const
        self.momentum = momentum
        self.epochs = epochs
        self.minibatches = minibatches
        self.random_seed = random_seed
        self.print_progress = print_progress
        self._is_fitted = False

    def _fit(self, X, y, init_params=True):
        self._check_target_array(y)
        if init_params:
            self._decr_eta = self.eta
            if self.n_classes is None:
                self.n_classes = np.max(y) + 1
            self._n_features = X.shape[1]

            self._weight_maps, self._bias_maps = self._layermapping(
                n_features=self._n_features,
                n_classes=self.n_classes,
                hidden_layers=self.hidden_layers)

            self.w_, self.b_ = self._init_params_from_layermapping(
                weight_maps=self._weight_maps,
                bias_maps=self._bias_maps,
                random_seed=self.random_seed)

            self.cost_ = []

            if self.momentum != 0.0:
                prev_grad_b_1 = np.zeros(shape=self.b_['1'].shape)
                prev_grad_w_1 = np.zeros(shape=self.w_['1'].shape)
                prev_grad_b_out = np.zeros(shape=self.b_['out'].shape)
                prev_grad_w_out = np.zeros(shape=self.w_['out'].shape)

        y_enc = self._one_hot(y=y, n_labels=self.n_classes, dtype=float)

        self.init_time_ = time()
        rgen = np.random.RandomState(self.random_seed)
        for i in range(self.epochs):

            for idx in self._yield_minibatches_idx(
                    rgen=rgen,
                    n_batches=self.minibatches,
                    data_ary=y,
                    shuffle=True):

                net_1, act_1, net_out, act_out = self._feedforward(X[idx])

                # GRADIENTS VIA BACKPROPAGATION

                # [n_samples, n_classlabels]
                sigma_out = act_out - y_enc[idx]

                # [n_samples, n_hidden]
                sigmoid_derivative_1 = act_1 * (1.0 - act_1)

                # [n_samples, n_classlabels] dot [n_classlabels, n_hidden]
                # -> [n_samples, n_hidden]
                sigma_1 = (np.dot(sigma_out, self.w_['out'].T) *
                           sigmoid_derivative_1)

                # [n_features, n_samples] dot [n_samples, n_hidden]
                # -> [n_features, n_hidden]
                grad_W_1 = np.dot(X[idx].T, sigma_1)
                grad_B_1 = np.sum(sigma_1, axis=0)

                # [n_hidden, n_samples] dot [n_samples, n_classlabels]
                # -> [n_hidden, n_classlabels]
                grad_W_out = np.dot(act_1.T, sigma_out)
                grad_B_out = np.sum(sigma_out, axis=0)

                # LEARNING RATE ADJUSTMENTS
                self._decr_eta /= (1.0 + self.decrease_const * i)

                # REGULARIZATION AND WEIGHT UPDATES
                dW_1 = (self._decr_eta * grad_W_1 +
                        self._decr_eta * self.l2 * self.w_['1'])
                dW_out = (self._decr_eta * grad_W_out +
                          self._decr_eta * self.l2 * self.w_['out'])
                dB_1 = self._decr_eta * grad_B_1
                dB_out = self._decr_eta * grad_B_out

                self.w_['1'] -= dW_1
                self.b_['1'] -= dB_1
                self.w_['out'] -= dW_out
                self.b_['out'] -= dB_out

                if self.momentum != 0.0:
                    self.w_['1'] -= self.momentum * prev_grad_w_1
                    self.b_['1'] -= self.momentum * prev_grad_b_1
                    self.w_['out'] -= self.momentum * prev_grad_w_out
                    self.b_['out'] -= self.momentum * prev_grad_b_out

                    prev_grad_b_1 = grad_B_1
                    prev_grad_w_1 = grad_W_1
                    prev_grad_b_out = grad_B_out
                    prev_grad_w_out = grad_W_out

            net_1, act_1, net_out, act_out = self._feedforward(X)
            cross_ent = self._cross_entropy(output=act_out, y_target=y_enc)
            cost = self._compute_cost(cross_ent)

            self.cost_.append(cost)
            if self.print_progress:
                self._print_progress(iteration=i + 1,
                                     n_iter=self.epochs,
                                     cost=cost)

        return self

    def _feedforward(self, X):
        # [n_samples, n_features] dot [n_features, n_hidden]
        # -> [n_samples, n_hidden]
        net_1 = np.dot(X, self.w_['1']) + self.b_['1']
        act_1 = self._sigmoid(net_1)

        # [n_samples, n_hidden] dot [n_hidden, n_classlabels]
        # -> [n_samples, n_classlabels]
        net_out = np.dot(act_1, self.w_['out']) + self.b_['out']
        act_out = self._softmax(net_out)
        return net_1, act_1, net_out, act_out

    def _compute_cost(self, cross_entropy):
        L2_term = (self.l2 *
                   (np.sum(self.w_['1'] ** 2.0) +
                    np.sum(self.w_['out'] ** 2.0)))

        L1_term = (self.l1 *
                   (np.abs(self.w_['1']).sum() +
                    np.abs(self.w_['out']).sum()))

        cross_entropy = cross_entropy + L2_term + L1_term
        return 0.5 * np.mean(cross_entropy)

    def _predict(self, X):
        net_1, act_1, net_out, act_out = self._feedforward(X)
        # softmax is monotonic, so the argmax of the net input equals
        # the argmax of the class probabilities
        y_pred = np.argmax(net_out, axis=1)
        return y_pred

    def _softmax(self, z):
        # subtract the row-wise maximum for numerical stability
        e_x = np.exp(z - z.max(axis=1, keepdims=True))
        out = e_x / e_x.sum(axis=1, keepdims=True)
        return out
        # return (np.exp(z.T) / np.sum(np.exp(z), axis=1)).T

    def _cross_entropy(self, output, y_target):
        return - np.sum(np.log(output) * (y_target), axis=1)

    def predict_proba(self, X):
        """Predict class probabilities of X from the net input.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        ----------
        Class probabilities : array-like, shape= [n_samples, n_classes]

        """
        net_1, act_1, net_out, act_out = self._feedforward(X)
        # act_out is already the softmax of the net input (see _feedforward),
        # so it is returned directly; applying softmax a second time would
        # distort the probabilities.
        return act_out

    def _sigmoid(self, z):
        """Compute logistic function (sigmoid).

        Uses scipy.special.expit to avoid overflow
        error for very small input values z.

        """
        # return 1.0 / (1.0 + np.exp(-z))
        return expit(z)
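

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only, not part of the library API).
    # It assumes the public `fit` and `predict` wrappers are provided by the
    # mlxtend base classes this classifier inherits from (`_IterativeModel` /
    # `_Classifier`), and it builds a small synthetic two-class dataset with
    # NumPy so that no extra dependencies are needed. Because of the relative
    # imports above, run it in the package context, e.g. via `python -m ...`.
    rng = np.random.RandomState(1)
    X0 = rng.randn(50, 2) + np.array([-2.0, -2.0])  # cluster for class 0
    X1 = rng.randn(50, 2) + np.array([2.0, 2.0])    # cluster for class 1
    X = np.vstack((X0, X1))
    y = np.array([0] * 50 + [1] * 50)

    mlp = MultiLayerPerceptron(eta=0.5,
                               epochs=50,
                               hidden_layers=[10],
                               minibatches=5,
                               random_seed=1)
    mlp.fit(X, y)
    print('Training accuracy: %.2f' % np.mean(mlp.predict(X) == y))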