# PCQRSCANER/venv/Lib/site-packages/fuzzysearch/substitutions_only.py

from collections import deque, defaultdict
from itertools import islice
from functools import wraps

import six


from fuzzysearch.common import Match, search_exact, \
    count_differences_with_maximum, get_best_match_in_group, group_matches


def _check_arguments(subsequence, sequence, max_substitutions):
    """Validate the arguments shared by the search functions of this module.

    Raises ValueError when the subsequence is empty (falsy), or when
    max_substitutions is None or negative. The sequence argument is not
    inspected.
    """
    subsequence_is_empty = not subsequence
    if subsequence_is_empty:
        raise ValueError('Given subsequence is empty!')

    # None is rejected explicitly, before any numeric comparison is attempted
    if max_substitutions is None:
        limit_is_invalid = True
    else:
        limit_is_invalid = max_substitutions < 0

    if limit_is_invalid:
        raise ValueError('Maximum number of substitutions must be >= 0!')


def has_near_match_substitutions(subsequence, sequence, max_substitutions):
    """Tell whether the sequence contains a near-match of the subsequence.

    Only substitutions are considered. The implementation is chosen from the
    arguments:

    * max_substitutions == 0: a plain exact search
    * len(subsequence) // (max_substitutions + 1) >= 3: the n-gram search
    * anything else: the "lp" search

    Returns True or False. Raises ValueError for arguments rejected by
    _check_arguments().
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    if max_substitutions == 0:
        # a single exact occurrence is enough, so stop at the first one
        return any(True for _ in search_exact(subsequence, sequence))

    ngram_len = len(subsequence) // (max_substitutions + 1)
    searcher = (
        has_near_match_substitutions_ngrams if ngram_len >= 3
        else has_near_match_substitutions_lp
    )
    return searcher(subsequence, sequence, max_substitutions)


def find_near_matches_substitutions(subsequence, sequence, max_substitutions):
    """Find near-matches of the subsequence in the sequence.

    Only substitutions are considered. The implementation is chosen from the
    arguments:

    * max_substitutions == 0: a plain exact search
    * len(subsequence) // (max_substitutions + 1) >= 3: the n-gram search
    * anything else: the "lp" search

    Returns a list of fuzzysearch.Match objects describing the matching parts
    of the sequence. Raises ValueError for arguments rejected by
    _check_arguments().
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    if max_substitutions == 0:
        # every exact occurrence is reported as a match with distance zero
        exact_matches = []
        for start in search_exact(subsequence, sequence):
            exact_matches.append(Match(start, start + len(subsequence), 0))
        return exact_matches

    ngram_len = len(subsequence) // (max_substitutions + 1)
    finder = (
        find_near_matches_substitutions_ngrams if ngram_len >= 3
        else find_near_matches_substitutions_lp
    )
    return finder(subsequence, sequence, max_substitutions)


def find_near_matches_substitutions_lp(subsequence, sequence,
                                       max_substitutions):
    """Search for near-matches of subsequence in sequence.

    A part of the sequence is reported as a match when, relative to the
    subsequence:

    * it differs by at most max_substitutions substituted items
    * it involves no deletions or insertions

    Returns a list holding every match produced by
    _find_near_matches_substitutions_lp(). Raises ValueError for arguments
    rejected by _check_arguments().
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    lazy_matches = _find_near_matches_substitutions_lp(
        subsequence, sequence, max_substitutions,
    )
    return [match for match in lazy_matches]


def _find_near_matches_substitutions_lp(subsequence, sequence,
                                        max_substitutions):
    """Yield the substitutions-only near-matches of subsequence in sequence.

    The sequence is traversed exactly once. For every alignment of the
    subsequence against the sequence, the number of positions holding equal
    items is counted; an alignment is yielded as a Match when the number of
    differing positions is at most max_substitutions.

    Matches are yielded in order of increasing start index, each built as
    Match(start=..., end=..., dist=...) with end - start == len(subsequence)
    and dist being the number of substituted positions.

    The items of both arguments are used as dictionary keys, so they must be
    hashable. No argument validation is done here.
    """
    # simple optimization: prepare some often used things in advance
    subseq_len = len(subsequence)
    last_subseq_index = subseq_len - 1

    # Prepare a quick lookup of where each item appears in the subsequence.
    # This is a plain dict queried through .get(), so that looking up the
    # many sequence items which never occur in the subsequence does not add
    # entries to the table.
    positions_by_item = {}
    for (position, item) in enumerate(subsequence):
        positions_by_item.setdefault(item, []).append(position)
    positions_of = positions_by_item.get
    no_positions = ()

    # we'll iterate over the sequence once, but the iteration is split into two
    # for loops; therefore we prepare an iterator in advance which will be used
    # in both of the loops
    sequence_enum_iter = enumerate(sequence)

    # We'll count the number of matching items assuming various attempted
    # alignments of the subsequence to the sequence. At any point in the
    # sequence there will be N such alignments to update. We'll keep
    # these in a "circular array" (a.k.a. a ring) which we'll rotate after each
    # iteration to re-align the indexing: candidates[k] is the count for the
    # alignment which started k items before the current one.

    # Initialize the candidate counts by iterating over the first N-1 items in
    # the sequence. No possible matches in this step!
    candidates = deque([0], maxlen=subseq_len)
    for (index, item) in islice(sequence_enum_iter, last_subseq_index):
        for position in positions_of(item, no_positions):
            # the ring holds only index + 1 counters so far; positions beyond
            # that belong to alignments starting before the sequence does
            if position <= index:
                candidates[position] += 1
        candidates.appendleft(0)

    # From the N-th item onwards, we'll update the candidate counts exactly as
    # above, and additionally check if the part of the sequence which began
    # N-1 items before the current index was a near enough match to the given
    # sub-sequence.
    for (index, item) in sequence_enum_iter:
        for position in positions_of(item, no_positions):
            candidates[position] += 1

        # rotate the ring of candidate counts
        candidates.rotate(1)
        # fetch the count for the candidate which started N-1 items ago
        n_substitutions = subseq_len - candidates[0]
        # set the count for the next index to zero
        candidates[0] = 0

        # if the candidate had few enough mismatches, yield a match
        if n_substitutions <= max_substitutions:
            yield Match(
                start=index - last_subseq_index,
                end=index + 1,
                dist=n_substitutions,
            )


def has_near_match_substitutions_lp(subsequence, sequence, max_substitutions):
    """Tell whether the "lp" search finds any near-match in the sequence.

    Returns True as soon as _find_near_matches_substitutions_lp() produces
    its first match, and False if it produces none. Raises ValueError for
    arguments rejected by _check_arguments().
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    lazy_matches = _find_near_matches_substitutions_lp(
        subsequence, sequence, max_substitutions,
    )
    # only the existence of a match matters, so nothing past the first one
    # is ever requested from the generator
    return any(True for _ in lazy_matches)


def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                           max_substitutions):
    """Search for near-matches of subsequence in sequence.

    A part of the sequence is reported as a match when, relative to the
    subsequence:

    * it differs by at most max_substitutions substituted items
    * it involves no deletions or insertions

    Returns a list of matches sorted by start index, holding one match per
    distinct start index. Raises ValueError for arguments rejected by
    _check_arguments().
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    # The underlying generator may report the same start index more than
    # once; only the first match seen for each start index is kept.
    first_match_by_start = {}
    for candidate in _find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions):
        first_match_by_start.setdefault(candidate.start, candidate)

    return [
        first_match_by_start[start]
        for start in sorted(first_match_by_start)
    ]


def _find_near_matches_substitutions_ngrams(subsequence, sequence,
                                            max_substitutions):
    """Yield substitutions-only near-matches found via exact n-gram search.

    The subsequence is cut into consecutive, non-overlapping n-grams of
    length len(subsequence) // (max_substitutions + 1). Every exact
    occurrence of an n-gram at a compatible offset in the sequence is taken
    as a candidate alignment, and the parts of the subsequence before and
    after the n-gram are then compared with the corresponding parts of the
    sequence.

    Yields Match(start=..., end=..., dist=...) objects with
    end - start == len(subsequence) and dist <= max_substitutions. The same
    alignment may be yielded more than once (via different n-grams), and the
    matches are not sorted.

    Raises ValueError (upon first iteration) if the n-gram length would be
    zero, i.e. if len(subsequence) <= max_substitutions.
    """
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_substitutions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_substitutions!"
        )

    # Without insertions or deletions every match spans exactly subseq_len
    # items, so a shorter sequence cannot contain one. Returning here also
    # avoids handing search_exact() a negative end bound below, which could
    # otherwise be taken as an index relative to the end of the sequence and
    # lead to matches reaching past the end of the sequence.
    if seq_len < subseq_len:
        return

    for ngram_start in range(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        subseq_before = subsequence[:ngram_start]
        subseq_after = subsequence[ngram_end:]

        # The search bounds leave room for the part of the subsequence before
        # the n-gram at the start of the sequence, and for the part after it
        # at the end of the sequence.
        for index in search_exact(
                subsequence[ngram_start:ngram_end], sequence,
                ngram_start, seq_len - (subseq_len - ngram_end),
        ):
            match_start = index - ngram_start
            n_substitutions = 0

            # compare the items preceding the n-gram
            seq_before = sequence[match_start:index]
            if subseq_before != seq_before:
                # the maximum passed is one more than the allowed number,
                # just enough to recognize "too many" (presumably the helper
                # stops counting once that maximum is reached)
                n_substitutions += count_differences_with_maximum(
                    seq_before, subseq_before,
                    max_substitutions - n_substitutions + 1)
                if n_substitutions > max_substitutions:
                    continue

            # compare the items following the n-gram
            seq_after = sequence[index + ngram_len:match_start + subseq_len]
            if subseq_after != seq_after:
                # the allowance is already used up and at least one more
                # item differs, so there is no need to count
                if n_substitutions == max_substitutions:
                    continue
                n_substitutions += count_differences_with_maximum(
                    seq_after, subseq_after,
                    max_substitutions - n_substitutions + 1)
                if n_substitutions > max_substitutions:
                    continue

            yield Match(
                start=match_start,
                end=match_start + subseq_len,
                dist=n_substitutions,
            )


def has_near_match_substitutions_ngrams(subsequence, sequence,
                                        max_substitutions):
    """Tell whether the n-gram search finds any near-match in the sequence.

    A part of the sequence counts as a match when, relative to the
    subsequence:

    * it differs by at most max_substitutions substituted items
    * it involves no deletions or insertions

    Returns True as soon as _find_near_matches_substitutions_ngrams()
    produces its first match, and False if it produces none. Raises
    ValueError for arguments rejected by _check_arguments().
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    lazy_matches = _find_near_matches_substitutions_ngrams(
        subsequence, sequence, max_substitutions,
    )
    # only the existence of a match matters, so nothing past the first one
    # is ever requested from the generator
    return any(True for _ in lazy_matches)


try:
    from fuzzysearch._substitutions_only import \
        substitutions_only_has_near_matches_ngrams_byteslike, \
        substitutions_only_find_near_matches_ngrams_byteslike as \
            _subs_only_fnm_ngram_byteslike
except ImportError:
    # The optional fuzzysearch._substitutions_only module is unavailable:
    # the pure-Python n-gram functions defined above stay as they are.
    pass
else:
    # The byteslike implementations were imported. Keep the pure-Python
    # functions reachable under py_* names, then rebind the public names to
    # wrappers which try the byteslike implementations first.
    # (No docstrings on the wrappers: wraps() copies the originals' over.)
    py_has_near_match_substitutions_ngrams = has_near_match_substitutions_ngrams
    py_find_near_matches_substitutions_ngrams = \
        find_near_matches_substitutions_ngrams

    @wraps(py_has_near_match_substitutions_ngrams)
    def has_near_match_substitutions_ngrams(subsequence, sequence,
                                            max_substitutions):
        involves_text = any(
            isinstance(argument, six.text_type)
            for argument in (subsequence, sequence)
        )
        if not involves_text:
            try:
                return substitutions_only_has_near_matches_ngrams_byteslike(
                    subsequence, sequence, max_substitutions)
            except TypeError:
                # the byteslike implementation rejected these arguments;
                # fall back to the pure-Python implementation below
                pass

        return py_has_near_match_substitutions_ngrams(
            subsequence, sequence, max_substitutions)

    @wraps(py_find_near_matches_substitutions_ngrams)
    def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                               max_substitutions):
        def match_at(start):
            # The byteslike implementation reports start indexes only, so
            # the end index and the distance are computed here.
            end = start + len(subsequence)
            dist = count_differences_with_maximum(
                sequence[start:end],
                subsequence,
                max_substitutions + 1,
            )
            return Match(start, end, dist)

        involves_text = any(
            isinstance(argument, six.text_type)
            for argument in (subsequence, sequence)
        )
        if not involves_text:
            try:
                start_indexes = _subs_only_fnm_ngram_byteslike(
                    subsequence, sequence, max_substitutions)
            except TypeError:
                # the byteslike implementation rejected these arguments;
                # fall back to the pure-Python implementation below
                pass
            else:
                candidates = [match_at(start) for start in start_indexes]
                best_matches = []
                for group in group_matches(candidates):
                    best_matches.append(get_best_match_in_group(group))
                return best_matches

        return py_find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions)