# Copyright 2022 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Sparse linear algebra routines."""

from typing import Union, Callable
import functools

import jax
import jax.numpy as jnp
from jax.interpreters import mlir
from jax.interpreters import xla
from jax._src import core
from jax._src.lib import gpu_solver

import numpy as np
from scipy.sparse import csr_matrix, linalg


def lobpcg_standard(
    A: Union[jax.Array, Callable[[jax.Array], jax.Array]],
    X: jax.Array,
    m: int = 100,
    tol: Union[jax.Array, float, None] = None):
  """Compute the top-k standard eigenvalues using the LOBPCG routine.

  LOBPCG [1] stands for Locally Optimal Block Preconditioned Conjugate
  Gradient. The method enables finding top-k eigenvectors in an
  accelerator-friendly manner.

  This initial experimental version has several caveats.

    - Only the standard eigenvalue problem `A U = lambda U` is supported;
      the generalized eigenvalue problem is not.
    - Gradient code is not available.
    - f64 will only work where jnp.linalg.eigh is supported for that type.
    - Finding the smallest eigenvectors is not yet supported. As a result,
      we don't yet support preconditioning, which is mostly needed for this
      case.

  The implementation is based on [2] and [3]; however, we deviate from these
  sources in several ways to improve robustness or facilitate
  implementation:

    - Despite increased iteration cost, we always maintain an orthonormal
      basis for the block search directions.
    - We change the convergence criterion; see the `tol` argument.
    - Soft locking [4] is intentionally not implemented; it relies on
      choosing an appropriate problem-specific tolerance to prevent blow-up
      near convergence from catastrophic cancellation of near-0 residuals.
      Instead, the approach implemented favors truncating the iteration
      basis.

  [1]: http://ccm.ucdenver.edu/reports/rep149.pdf
  [2]: https://arxiv.org/abs/1704.07458
  [3]: https://arxiv.org/abs/0705.2626
  [4]: DOI 10.13140/RG.2.2.11794.48327

  Args:
    A : An `(n, n)` array representing a square Hermitian matrix or a
        callable with its action.
    X : An `(n, k)` array representing the initial search directions for the
        `k` desired top eigenvectors. This need not be orthogonal, but must
        be numerically linearly independent (`X` will be orthonormalized).
        Note that we must have `0 < k * 5 < n`.
    m : Maximum integer iteration count; LOBPCG will only ever explore (a
        subspace of) the Krylov basis `{X, A X, A^2 X, ..., A^m X}`.
    tol : A float convergence tolerance; an eigenpair `(lambda, v)` is
        converged when its residual L2 norm `r = |A v - lambda v|` is below
        `tol * 10 * n * (lambda + |A v|)`, which roughly estimates the
        worst-case floating point error for an ideal eigenvector. If all `k`
        eigenvectors satisfy the tolerance comparison, then LOBPCG exits
        early. If left as None, then this is set to the float epsilon of
        `A.dtype`.

  Returns:
    `theta, U, i`, where `theta` is a `(k,)` array of eigenvalues, `U` is an
    `(n, k)` array of eigenvectors, and `i` is the number of iterations
    performed.
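
  Example:
    A minimal usage sketch (the diagonal test matrix, PRNG seed, and sizes
    below are illustrative only, not part of the API)::

      import jax
      import jax.numpy as jnp

      n, k = 100, 4
      A = jnp.diag(jnp.arange(1.0, n + 1))  # known spectrum 1, 2, ..., n
      X = jax.random.normal(jax.random.PRNGKey(0), (n, k))
      theta, U, iters = lobpcg_standard(A, X)
      # theta approximates the top-k eigenvalues n, n - 1, ..., n - k + 1,
      # and the columns of U approximate the corresponding eigenvectors.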
  Raises:
    ValueError : if `A, X` dtypes or `n` dimensions do not match, or `k` is
      too large (only `k * 5 < n` supported), or `k == 0`.
  """
  # Jit-compile once per matrix shape if possible.
  if isinstance(A, (jax.Array, np.ndarray)):
    return _lobpcg_standard_matrix(A, X, m, tol, debug=False)
  return _lobpcg_standard_callable(A, X, m, tol, debug=False)


@functools.partial(jax.jit, static_argnames=['m', 'debug'])
def _lobpcg_standard_matrix(
    A: jax.Array,
    X: jax.Array,
    m: int,
    tol: Union[jax.Array, float, None],
    debug: bool = False):
  """Computes lobpcg_standard(), possibly with debug diagnostics."""
  return _lobpcg_standard_callable(
      functools.partial(_mm, A), X, m, tol, debug)


@functools.partial(jax.jit, static_argnames=['A', 'm', 'debug'])
def _lobpcg_standard_callable(
    A: Callable[[jax.Array], jax.Array],
    X: jax.Array,
    m: int,
    tol: Union[jax.Array, float, None],
    debug: bool = False):
  """Supports generic lobpcg_standard() callable interface."""

  # TODO(vladf): support mixed_precision flag, which allows f64 Rayleigh-Ritz
  # with f32 inputs.

  n, k = X.shape
  dt = X.dtype
  _check_inputs(A, X)

  if tol is None:
    tol = jnp.finfo(dt).eps

  X = _orthonormalize(X)
  P = _extend_basis(X, X.shape[1])

  # We maintain X, our current list of best eigenvectors,
  #             P, our search direction, and
  #             R, our residuals,
  # in a large joint array XPR, column-stacked, so (n, 3*k).

  AX = A(X)
  theta = jnp.sum(X * AX, axis=0, keepdims=True)
  R = AX - theta * X

  def cond(state):
    i, _X, _P, _R, converged, _ = state
    return jnp.logical_and(i < m, converged < k)

  def body(state):
    i, X, P, R, _, theta = state
    # Invariants: X, P, R are kept orthonormal.
    # Some R, P columns may be 0 (due to basis truncation, as decided by the
    # orthogonalization routines), but not X.

    # TODO(vladf): support preconditioning for bottom-k eigenvectors
    # if M is not None:
    #   R = M(R)

    # Residual basis selection.
    R = _project_out(jnp.concatenate((X, P), axis=1), R)
    XPR = jnp.concatenate((X, P, R), axis=1)

    # Projected eigensolve.
    theta, Q = _rayleigh_ritz_orth(A, XPR)

    # Eigenvector X extraction.
    B = Q[:, :k]
    normB = jnp.linalg.norm(B, ord=2, axis=0, keepdims=True)
    B /= normB
    X = _mm(XPR, B)
    normX = jnp.linalg.norm(X, ord=2, axis=0, keepdims=True)
    X /= normX

    # Difference terms P extraction.
    #
    # In the next step of LOBPCG, naively, we'd set
    # P = S[:, k:] @ Q[k:, :k] to achieve span(X, P) == span(X, previous X)
    # (this is not obvious; see section 4 of [1]).
    #
    # Instead we orthogonalize concat(0, Q[k:, :k]) against Q[:, :k]
    # in the standard basis before mapping with XPR. Since XPR is itself
    # orthonormal, the resulting directions are themselves orthonormalized.
    #
    # [2] leverages Q's existing orthogonality to derive an analytic
    # expression for this value based on the quadrant Q[:k, k:]
    # (see section 4.2 of [2]).
    q, _ = jnp.linalg.qr(Q[:k, k:].T)
    diff_rayleigh_ortho = _mm(Q[:, k:], q)
    P = _mm(XPR, diff_rayleigh_ortho)
    normP = jnp.linalg.norm(P, ord=2, axis=0, keepdims=True)
    P /= jnp.where(normP == 0, 1.0, normP)

    # Compute new residuals.
    AX = A(X)
    R = AX - theta[jnp.newaxis, :k] * X
    resid_norms = jnp.linalg.norm(R, ord=2, axis=0)

    # I tried many variants of hard and soft locking [3]. All of them seemed
    # to worsen performance relative to no locking.
    #
    # Further, I found a more experimental convergence formula compared to
    # what is suggested in the literature, loosely based on floating-point
    # expectations.
    #
    # [2] discusses various strategies for this in Sec 5.3. The solution
    # they end up with, which estimates the operator norm |A| via Gaussian
    # products, was too crude in practice (and overly lax).
    # The Gaussian approximation seems like an estimate of the average
    # eigenvalue.
    #
    # Instead, we test convergence via self-consistency of the eigenpair,
    # i.e., the residual norm |r| should be small relative to the floating
    # point error we'd expect from computing just the residuals given
    # candidate vectors.
    reltol = jnp.linalg.norm(AX, ord=2, axis=0) + theta[:k]
    reltol *= n
    # Allow some margin for a few element-wise operations.
    reltol *= 10
    res_converged = resid_norms < tol * reltol
    converged = jnp.sum(res_converged)

    new_state = i + 1, X, P, R, converged, theta[jnp.newaxis, :k]
    if debug:
      diagnostics = _generate_diagnostics(
          XPR, X, P, R, theta, converged, resid_norms / reltol)
      new_state = (new_state, diagnostics)

    return new_state

  converged = 0
  state = (0, X, P, R, converged, theta)
  if debug:
    state, diagnostics = jax.lax.scan(
        lambda state, _: body(state), state, xs=None, length=m)
  else:
    state = jax.lax.while_loop(cond, body, state)
  i, X, _P, _R, _converged, theta = state

  if debug:
    return theta[0, :], X, i, diagnostics
  return theta[0, :], X, i


def _check_inputs(A, X):
  n, k = X.shape
  dt = X.dtype

  if k == 0:
    raise ValueError(f'must have search dim > 0, got {k}')

  if k * 5 >= n:
    raise ValueError(
        f'expected search dim * 5 < matrix dim (got {k * 5}, {n})')

  test_output = A(jnp.zeros((n, 1), dtype=X.dtype))
  if test_output.dtype != dt:
    raise ValueError(
        f'A, X must have same dtypes (were {test_output.dtype}, {dt})')
  if test_output.shape != (n, 1):
    s = test_output.shape
    raise ValueError(f'A must be an ({n}, {n}) matrix, got output shape {s}')


def _mm(a, b, precision=jax.lax.Precision.HIGHEST):
  return jax.lax.dot(a, b, (precision, precision))


def _generate_diagnostics(prev_XPR, X, P, R, theta, converged, adj_resid):
  k = X.shape[1]
  assert X.shape == P.shape
  diagdiag = lambda x: jnp.diag(jnp.diag(x))
  abserr = lambda x: jnp.abs(x).sum() / (k ** 2)
  XTX = _mm(X.T, X)
  DX = diagdiag(XTX)
  orthX = abserr(XTX - DX)
  PTP = _mm(P.T, P)
  DP = diagdiag(PTP)
  orthP = abserr(PTP - DP)
  PX = abserr(X.T @ P)
  prev_basis = prev_XPR.shape[1] - jnp.sum(jnp.all(prev_XPR == 0.0, axis=0))
  return {
      'basis rank': prev_basis,
      'X zeros': jnp.sum(jnp.all(X == 0.0, axis=0)),
      'P zeros': jnp.sum(jnp.all(P == 0.0, axis=0)),
      'lambda history': theta[:k],
      'residual history': jnp.linalg.norm(R, axis=0, ord=2),
      'converged': converged,
      'adjusted residual max': jnp.max(adj_resid),
      'adjusted residual p50': jnp.median(adj_resid),
      'adjusted residual min': jnp.min(adj_resid),
      'X orth': orthX,
      'P orth': orthP,
      'P.X': PX}


def _eigh_ascending(A):
  w, V = jnp.linalg.eigh(A)
  return w[::-1], V[:, ::-1]


def _svqb(X):
  """Derives a truncated orthonormal basis for `X`.

  SVQB [1] is an accelerator-friendly orthonormalization procedure, which
  squares the matrix `C = X.T @ X` and computes an eigenbasis for a smaller
  `(k, k)` system; this offloads most of the work in orthonormalization to
  the first multiply when `n` is large.

  Importantly, if diagonalizing the squared matrix `C` reveals rank
  deficiency of X (which would be evidenced by near-0 eigenvalues), then the
  corresponding columns are zeroed out.

  [1]: https://sdm.lbl.gov/~kewu/ps/45577.html

  Args:
    X : An `(n, k)` array which describes a linear subspace of R^n, possibly
        numerically degenerate with some rank less than `k`.

  Returns:
    An orthonormal space `V` described by an `(n, k)` array, with trailing
    columns possibly zeroed out if `X` is of low rank.
  """
  # In [1] diagonal conditioning is explicit, but by normalizing first
  # we can simplify the formulas a bit, since then diagonal conditioning
  # becomes a no-op.
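  # A compact restatement of the SVQB step implemented below (illustrative
  # only; the code is the source of truth):
  #   C = X^T X                  (k, k) Gram matrix, after column scaling
  #   C = V diag(w) V^T          symmetric eigendecomposition
  #   W = X V diag(w)^(-1/2)     so W^T W = I on non-degenerate columns
  # Eigenvalues below eps * max(w) are treated as rank deficiency, and the
  # matching columns of W are zeroed out rather than normalized.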
  norms = jnp.linalg.norm(X, ord=2, axis=0, keepdims=True)
  X /= jnp.where(norms == 0, 1.0, norms)
  inner = _mm(X.T, X)
  w, V = _eigh_ascending(inner)

  # All mask logic is used to avoid divide-by-zeros when input columns may
  # have been zero or new zero columns were introduced from truncation.
  #
  # If an eigenvalue is less than max eigenvalue * eps, then consider that
  # direction "degenerate".
  tau = jnp.finfo(X.dtype).eps * w[0]
  padded = jnp.maximum(w, tau)

  # Note the tau == 0 edge case where X was all zeros.
  sqrted = jnp.where(tau > 0, padded, 1.0) ** (-0.5)

  # X^T X = V diag(w) V^T, so
  # W = X V diag(w)^(-1/2) will yield W^T W = I (excepting zeros).
  scaledV = V * sqrted[jnp.newaxis, :]
  orthoX = _mm(X, scaledV)

  keep = ((w > tau) * (jnp.diag(inner) > 0.0))[jnp.newaxis, :]
  orthoX *= keep.astype(orthoX.dtype)
  norms = jnp.linalg.norm(orthoX, ord=2, axis=0, keepdims=True)
  keep *= norms > 0.0
  orthoX /= jnp.where(keep, norms, 1.0)
  return orthoX


def _project_out(basis, U):
  """Derives the component of U in the orthogonal complement of basis.

  This method iteratively subtracts out the basis component and
  orthonormalizes the remainder. To an extent, these two operations can
  oppose each other when the remainder norm is near-zero (since
  normalization enlarges a vector which may possibly lie in the subspace
  `basis` to be subtracted).

  We make sure to prioritize orthogonality between `basis` and `U`, favoring
  returning a lower-rank space than `rank(U)`, in this tradeoff.

  Args:
    basis : An `(n, m)` array which describes a linear subspace of R^n; this
            is assumed to be orthonormal, but zero columns are allowed.
    U : An `(n, k)` array representing another subspace of R^n, whose
        `basis` component is to be projected out.

  Returns:
    An `(n, k)` array, with some columns possibly zeroed out, representing
    the component of `U` in the complement of `basis`. The nonzero columns
    are mutually orthonormal.
  """
  # See Sec. 6.9 of The Symmetric Eigenvalue Problem by Beresford Parlett
  # [1], which motivates two loop iterations for basis subtraction. This
  # "twice is enough" approach is due to Kahan. See also a practical note
  # by SLEPc developers [2].
  #
  # Interspersing with orthonormalization isn't directly grounded in the
  # original analysis, but is taken from Algorithm 5 of [3]. In practice,
  # due to normalization, I have noticed that the orthonormalized basis
  # does not always end up as a subspace of the starting basis. There may
  # be room to refine this procedure further, but the adjustment in the
  # subsequent block handles this edge case well enough for now.
  #
  # [1]: https://epubs.siam.org/doi/abs/10.1137/1.9781611971163
  # [2]: http://slepc.upv.es/documentation/reports/str1.pdf
  # [3]: https://arxiv.org/abs/1704.07458
  for _ in range(2):
    U -= _mm(basis, _mm(basis.T, U))
    U = _orthonormalize(U)

  # It's crucial to end on a subtraction of the original basis.
  # This seems to be a detail not present in [2], possibly because of
  # reliance on soft locking.
  #
  # Near convergence, if the residuals R are 0 and our last operation when
  # projecting (X, P) out from R is the orthonormalization done above, then
  # due to catastrophic cancellation we may re-introduce (X, P) subspace
  # components into U, which can ruin the Rayleigh-Ritz conditioning.
  #
  # We zero out any columns that are even remotely suspicious, so the
  # invariant that [basis, U] is zero-or-orthogonal is ensured.
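  # Explanatory note: at this point every nonzero column of U has unit norm,
  # so after the final subtraction below, a norm noticeably below 1 means
  # the column had a substantial component inside `basis`; the 0.99 cutoff
  # treats such columns as suspect and zeroes them.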
  for _ in range(2):
    U -= _mm(basis, _mm(basis.T, U))

  normU = jnp.linalg.norm(U, ord=2, axis=0, keepdims=True)
  U *= (normU >= 0.99).astype(U.dtype)

  return U


def _orthonormalize(basis):
  # Twice is enough, again.
  for _ in range(2):
    basis = _svqb(basis)
  return basis


def _rayleigh_ritz_orth(A, S):
  """Solve the Rayleigh-Ritz problem for `A` projected to `S`.

  Solves the local eigenproblem for `A` within the subspace `S`, which is
  assumed to be orthonormal (with zero columns allowed), identifying `w, V`
  satisfying

    (1) `S.T A S V ~= V diag(w)`
    (2) `V` is orthonormal in the standard basis.

  Note that (2) simplifies to standard orthonormality because `S` is itself
  orthonormal.

  Args:
    A: An operator representing the action of an `n`-sized square matrix.
    S: An orthonormal subspace of R^n represented by an `(n, k)` array, with
       zero columns allowed.

  Returns:
    Eigenvalues `w` and eigenvectors `V` satisfying the size-`k` system
    described in this method doc. Note `V` will be full rank, even if `S`
    isn't.
  """
  SAS = _mm(S.T, A(S))

  # Solve the projected subsystem.
  # If we could tell eigh to stop after the first k eigenpairs, we would.
  return _eigh_ascending(SAS)


def _extend_basis(X, m):
  """Extend the basis of `X` by `m` additional dimensions.

  Given an orthonormal `X` of dimension `k`, a typical strategy for deriving
  an extended basis is to generate a random one and project it out. We
  instead generate a basis using block Householder reflectors [1] [2], to
  leverage determinism and to avoid the chance that a randomly generated
  basis overlaps with the starting basis, which may happen with
  non-negligible probability in low-dimensional cases.

  [1]: https://epubs.siam.org/doi/abs/10.1137/0725014
  [2]: https://www.jstage.jst.go.jp/article/ipsjdc/2/0/2_0_298/_article

  Args:
    X : An `(n, k)` array representing a rank-`k` orthonormal basis for a
        linear subspace of R^n.
    m : A nonnegative integer such that `k + m <= n`, telling us how much to
        extend the basis by.

  Returns:
    An `(n, m)` array representing an extension to the basis of `X` such
    that their union is orthonormal.
  """
  n, k = X.shape

  # X = vstack(Xupper, Xlower), where Xupper is (k, k).
  Xupper, Xlower = jnp.split(X, [k], axis=0)
  u, s, vt = jnp.linalg.svd(Xupper)

  # Adding U V^T to Xupper won't change its row or column space, but notice
  # its singular values are all lifted by 1; we could write its upper k rows
  # as u diag(1 + s) vt.
  y = jnp.concatenate([Xupper + _mm(u, vt), Xlower], axis=0)

  # Suppose we found a full-rank (n, k) matrix w which defines the
  # perpendicular to a space we'd like to reflect over. The block
  # Householder reflector H(w) would have the usual involution property.
  #
  # Consider the two definitions below:
  #   H(w) = I - 2 w w^T
  #   2 w w^T = y (v diag(1+s)^(-1) vt) y^T
  #
  # After some algebra, we see H(w) X = vstack(-u vt, 0).
  # Applying H(w) to both sides, since H(w)^2 = I, we have
  # X = H(w) vstack(-u vt, 0). But since H(w) is unitary, its action must
  # preserve rank. Thus H(w) vstack(0, eye(n - k)) must be orthogonal to
  # X; taking just the first m columns, H(w) vstack(0, eye(m), 0) yields
  # an orthogonal extension to X.
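  # In the code below (a restatement of the algebra above in terms of the
  # local variables, for orientation only):
  #   `other` is vstack(eye(m), 0), the m standard basis columns living in
  #           the lower n - k coordinates;
  #   `w`     is y @ (vt.T * (2 * (1 + s))^(-1/2)), the scaled block
  #           reflector direction;
  #   `h`     is -2 w (w[k:, :]^T other); adding `other` back onto rows k:
  #           completes H(w) @ vstack(0, eye(m), 0).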
  other = jnp.concatenate(
      [jnp.eye(m, dtype=X.dtype),
       jnp.zeros((n - k - m, m), dtype=X.dtype)], axis=0)
  w = _mm(y, vt.T * ((2 * (1 + s)) ** (-1 / 2))[jnp.newaxis, :])
  h = -2 * jnp.linalg.multi_dot(
      [w, w[k:, :].T, other], precision=jax.lax.Precision.HIGHEST)
  return h.at[k:].add(other)


# Sparse direct solve via QR factorization

def _spsolve_abstract_eval(data, indices, indptr, b, *, tol, reorder):
  if data.dtype != b.dtype:
    raise ValueError(f"data types do not match: {data.dtype=} {b.dtype=}")
  if not (jnp.issubdtype(indices.dtype, jnp.integer) and
          jnp.issubdtype(indptr.dtype, jnp.integer)):
    raise ValueError("index arrays must be integer typed; "
                     f"got {indices.dtype=} {indptr.dtype=}")
  if not data.ndim == indices.ndim == indptr.ndim == b.ndim == 1:
    raise ValueError("Arrays must be one-dimensional. "
                     f"Got {data.shape=} {indices.shape=} "
                     f"{indptr.shape=} {b.shape=}")
  if indptr.size != b.size + 1 or data.shape != indices.shape:
    raise ValueError("Invalid CSR buffer sizes: "
                     f"{data.shape=} {indices.shape=} {indptr.shape=}")
  if reorder not in [0, 1, 2, 3]:
    raise ValueError(f"{reorder=} not valid, must be one of [0, 1, 2, 3]")
  tol = float(tol)
  return b


def _spsolve_gpu_lowering(ctx, data, indices, indptr, b, *, tol, reorder):
  data_aval, _, _, _ = ctx.avals_in
  return gpu_solver.cuda_csrlsvqr(data_aval.dtype, data, indices, indptr, b,
                                  tol, reorder)


def _spsolve_cpu_lowering(ctx, data, indices, indptr, b, tol, reorder):
  del tol, reorder
  args = [data, indices, indptr, b]

  def _callback(data, indices, indptr, b, **kwargs):
    A = csr_matrix((data, indices, indptr), shape=(b.size, b.size))
    return (linalg.spsolve(A, b).astype(b.dtype),)

  result, _, keepalive = mlir.emit_python_callback(
      ctx, _callback, None, args, ctx.avals_in, ctx.avals_out,
      has_side_effect=False)
  ctx.module_context.add_keepalive(keepalive)
  return result


spsolve_p = core.Primitive('spsolve')
spsolve_p.def_impl(functools.partial(xla.apply_primitive, spsolve_p))
spsolve_p.def_abstract_eval(_spsolve_abstract_eval)
mlir.register_lowering(spsolve_p, _spsolve_gpu_lowering, platform='cuda')
mlir.register_lowering(spsolve_p, _spsolve_cpu_lowering, platform='cpu')


def spsolve(data, indices, indptr, b, tol=1e-6, reorder=1):
  """A sparse direct solver using QR factorization.

  Accepts a sparse matrix in CSR format via the `data, indices, indptr`
  arrays. A native implementation currently exists only for the CUDA GPU
  backend; on CPU the solve is delegated to `scipy.sparse.linalg.spsolve`
  through a host callback.

  Args:
    data : An array containing the non-zero entries of the CSR matrix.
    indices : The column indices of the CSR matrix.
    indptr : The row pointer array of the CSR matrix.
    b : The right hand side of the linear system.
    tol : Tolerance used to decide whether the matrix is singular. Defaults
      to 1e-6.
    reorder : The reordering scheme to use to reduce fill-in. No reordering
      if ``reorder=0``. Otherwise, symrcm, symamd, or csrmetisnd
      (``reorder=1,2,3``), respectively. Defaults to symrcm.

  Returns:
    An array with the same dtype and size as `b` representing the solution
    to the sparse linear system.
  """
  return spsolve_p.bind(data, indices, indptr, b, tol=tol, reorder=reorder)
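

# Example usage of `spsolve` (an illustrative sketch, assuming a backend for
# which a lowering is registered above; the 3x3 matrix and right-hand side
# are made up for demonstration):
#
#   import numpy as np
#   import jax.numpy as jnp
#   from scipy.sparse import csr_matrix
#
#   A = csr_matrix(np.array([[4., 1., 0.],
#                            [1., 3., 0.],
#                            [0., 0., 2.]], dtype=np.float32))
#   b = jnp.array([1., 2., 3.], dtype=jnp.float32)
#   x = spsolve(jnp.array(A.data), jnp.array(A.indices),
#               jnp.array(A.indptr), b)
#   # x approximately satisfies A @ x == b.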