"""fuzzy searching allowing subsitutions and insertions, but no deletions"""

__all__ = [
    'find_near_matches_no_deletions_ngrams',
]

import array

from fuzzysearch.common import Ngram, search_exact, Match


def _expand(subsequence, sequence, max_substitutions, max_insertions,
            max_l_dist):
    if not subsequence:
        return (0, 0)

    # Calculate the minimum number of substitutions required for each number
    # of insertions between 0 and max_insertions.
    #
    # This is done using a "dynamic programming" algorithm.
    n_subs = array.array('L', [0] * (max_insertions + 1))
    for subseq_index, char in enumerate(subsequence):
        n_subs[0] += (char != sequence[subseq_index])
        for n_ins in range(1, max_insertions + 1):
            n_subs[n_ins] = min(
                n_subs[n_ins] + (char != sequence[subseq_index + n_ins]),
                n_subs[n_ins - 1]
            )

    matches = [
        (_n_subs, _n_ins) for (_n_ins, _n_subs) in enumerate(n_subs)
        if _n_subs <= max_substitutions
        and _n_ins + _n_subs <= max_l_dist
    ]
    return [
        match for (i, match) in enumerate(matches)
        if i == 0 or match[0] < matches[i-1][0]
    ]


def find_near_matches_no_deletions_ngrams(subsequence, sequence, search_params):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * no deletions are allowed
    * the total number of substitutions, insertions and deletions
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_substitutions, max_insertions, max_deletions, max_l_dist = search_params.unpacked

    max_substitutions = min(max_substitutions, max_l_dist)
    max_insertions = min(max_insertions, max_l_dist)

    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_substitutions + max_insertions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_subs + max_ins!"
        )

    matches = []
    matched_indexes = set()

    for ngram_start in range(0, len(subsequence) - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        subseq_before = subsequence[:ngram_start]
        subseq_before_reversed = subseq_before[::-1]
        subseq_after = subsequence[ngram_end:]
        start_index = max(0, ngram_start - max_insertions)
        end_index = min(seq_len, seq_len - (subseq_len - ngram_end) + max_insertions)

        for index in search_exact(
                subsequence[ngram_start:ngram_end], sequence,
                start_index, end_index,
        ):
            if index - ngram_start in matched_indexes:
                continue

            seq_after = sequence[index + ngram_len:index + subseq_len - ngram_start + max_insertions]
            if seq_after.startswith(subseq_after):
                matches_after = [(0, 0)]
            else:
                matches_after = _expand(subseq_after, seq_after,
                                        max_substitutions, max_insertions, max_l_dist)
                if not matches_after:
                    continue

            _max_substitutions = max_substitutions - min(m[0] for m in matches_after)
            _max_insertions = max_insertions - min(m[1] for m in matches_after)
            _max_l_dist = max_l_dist - min(m[0] + m[1] for m in matches_after)
            seq_before = sequence[index - ngram_start - _max_insertions:index]
            if seq_before.endswith(subseq_before):
                matches_before = [(0, 0)]
            else:
                matches_before = _expand(
                    subseq_before_reversed, seq_before[::-1],
                    _max_substitutions, _max_insertions, _max_l_dist,
                )

            for (subs_before, ins_before) in matches_before:
                for (subs_after, ins_after) in matches_after:
                    if (
                            subs_before + subs_after <= max_substitutions and
                            ins_before + ins_after <= max_insertions and
                            subs_before + subs_after + ins_before + ins_after <= max_l_dist
                    ):
                        matches.append(Match(
                            start=index - ngram_start - ins_before,
                            end=index - ngram_start + subseq_len + ins_after,
                            dist=subs_before + subs_after + ins_before + ins_after,
                        ))
                        matched_indexes |= set(range(
                            index - ngram_start - ins_before,
                            index - ngram_start - ins_before + max_insertions + 1,
                        ))

    return sorted(matches, key=lambda match: match.start)