PCQRSCANER/venv/Lib/site-packages/fuzzysearch/substitutions_only.py
2019-12-22 21:51:47 +01:00

290 lines
11 KiB
Python

from collections import deque, defaultdict
from itertools import islice
from functools import wraps
import six
from fuzzysearch.common import Match, search_exact, \
count_differences_with_maximum, get_best_match_in_group, group_matches
def _check_arguments(subsequence, sequence, max_substitutions):
    """Validate the arguments shared by the search functions in this module.

    The sequence is accepted as-is; only the subsequence and the
    substitution limit are inspected.

    Raises ValueError if the subsequence is empty (falsy), or if
    max_substitutions is None or negative.
    """
    # The emptiness check comes first, so an empty subsequence is reported
    # even when the substitution limit is invalid as well.
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    limit_is_invalid = max_substitutions is None or max_substitutions < 0
    if limit_is_invalid:
        raise ValueError('Maximum number of substitutions must be >= 0!')
def has_near_match_substitutions(subsequence, sequence, max_substitutions):
    """Tell whether the sequence contains a near-match of the subsequence.

    Only substitutions are considered. The arguments are validated first
    (ValueError on an empty subsequence or an invalid limit), then the work
    is delegated to an implementation picked from the parameters:

    * max_substitutions == 0: exact search via search_exact
    * len(subsequence) // (max_substitutions + 1) >= 3: n-gram based search
    * otherwise: the linear-programming style search

    Returns True as soon as one match is found, False if there is none.
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    if max_substitutions == 0:
        # any() stops consuming search_exact() at its first result.
        return any(True for _ in search_exact(subsequence, sequence))

    if len(subsequence) // (max_substitutions + 1) >= 3:
        has_match = has_near_match_substitutions_ngrams
    else:
        has_match = has_near_match_substitutions_lp
    return has_match(subsequence, sequence, max_substitutions)
def find_near_matches_substitutions(subsequence, sequence, max_substitutions):
    """Find near-matches of the subsequence in the sequence.

    Only substitutions are considered. A suitable search implementation is
    chosen according to the given parameters:

    * max_substitutions == 0: exact search via search_exact
    * len(subsequence) // (max_substitutions + 1) >= 3: n-gram based search
    * otherwise: the linear-programming style search

    Returns a list of fuzzysearch.Match objects describing the matching parts
    of the sequence. Raises ValueError if the subsequence is empty or
    max_substitutions is None or negative.
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    if max_substitutions == 0:
        # Exact occurrences are reported as matches with a distance of zero.
        subseq_len = len(subsequence)
        exact_matches = []
        for start_index in search_exact(subsequence, sequence):
            exact_matches.append(
                Match(start_index, start_index + subseq_len, 0)
            )
        return exact_matches

    if len(subsequence) // (max_substitutions + 1) >= 3:
        find_matches = find_near_matches_substitutions_ngrams
    else:
        find_matches = find_near_matches_substitutions_lp
    return find_matches(subsequence, sequence, max_substitutions)
def find_near_matches_substitutions_lp(subsequence, sequence,
                                       max_substitutions):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * the number of character substitutions must be at most max_substitutions
    * no deletions or insertions are allowed

    Returns a list of Match objects, in the order the underlying generator
    produces them. Raises ValueError if the subsequence is empty or
    max_substitutions is None or negative.
    """
    _check_arguments(subsequence, sequence, max_substitutions)
    found = _find_near_matches_substitutions_lp(
        subsequence, sequence, max_substitutions,
    )
    return [match for match in found]
def _find_near_matches_substitutions_lp(subsequence, sequence,
                                        max_substitutions):
    """Yield the substitutions-only near-matches of subsequence in sequence.

    The sequence is scanned exactly once. For every alignment of the
    subsequence that is still "in progress" at the current item, a count of
    matching items is maintained; once an alignment is complete, it is
    yielded as a Match if it needed at most max_substitutions substitutions.

    Matches are yielded in increasing order of their start index. Nothing is
    yielded when the sequence is shorter than the subsequence. The arguments
    are not validated here; callers are expected to do that.
    """
    # simple optimization: prepare some often used things in advance
    subseq_len = len(subsequence)
    subseq_len_minus_one = subseq_len - 1

    # prepare quick lookup of where an item appears in the subsequence; each
    # list of indexes is in ascending order since it is built by enumerate()
    char_indexes_in_subsequence = defaultdict(list)
    for index, char in enumerate(subsequence):
        char_indexes_in_subsequence[char].append(index)

    # Look items up with .get() rather than by subscripting: subscripting a
    # defaultdict inserts a new empty list for every item of the sequence
    # which doesn't appear in the subsequence, making the table grow with
    # the sequence's alphabet for no benefit.
    indexes_of = char_indexes_in_subsequence.get
    no_indexes = ()

    # we'll iterate over the sequence once, but the iteration is split into
    # two for loops; therefore we prepare an iterator in advance which will
    # be used in both of the loops
    sequence_enum_iter = enumerate(sequence)

    # We'll count the number of matching items assuming various attempted
    # alignments of the subsequence to the sequence. At any point in the
    # sequence there will be N such alignments to update. We'll keep these
    # in a "circular array" (a.k.a. a ring) which we'll rotate after each
    # iteration to re-align the indexing: candidates[k] always belongs to the
    # alignment for which the current item faces subsequence[k].

    # Initialize the candidate counts by iterating over the first N-1 items
    # in the sequence. No possible matches in this step!
    candidates = deque([0], maxlen=subseq_len)
    for index, char in islice(sequence_enum_iter, subseq_len_minus_one):
        for subseq_index in indexes_of(char, no_indexes):
            # Alignments which would have started before the beginning of
            # the sequence don't exist; since the indexes are ascending we
            # can stop at the first one which is out of range.
            if subseq_index > index:
                break
            candidates[subseq_index] += 1
        candidates.appendleft(0)

    # From the N-th item onwards, we'll update the candidate counts exactly
    # as above, and additionally check if the part of the sequence which
    # began N-1 items before the current index was a near enough match to
    # the given sub-sequence.
    for index, char in sequence_enum_iter:
        for subseq_index in indexes_of(char, no_indexes):
            candidates[subseq_index] += 1

        # rotate the ring of candidate counts
        candidates.rotate(1)
        # fetch the count for the candidate which started N-1 items ago
        n_substitutions = subseq_len - candidates[0]
        # set the count for the next index to zero
        candidates[0] = 0

        # if the candidate had few enough mismatches, yield a match
        if n_substitutions <= max_substitutions:
            yield Match(
                start=index - subseq_len_minus_one,
                end=index + 1,
                dist=n_substitutions,
            )
def has_near_match_substitutions_lp(subsequence, sequence, max_substitutions):
    """Tell whether the sequence contains a near-match of the subsequence.

    Uses the same search as find_near_matches_substitutions_lp(), but stops
    at the first match found.

    Returns True if there is a match with at most max_substitutions
    substitutions, False otherwise. Raises ValueError if the subsequence is
    empty or max_substitutions is None or negative.
    """
    _check_arguments(subsequence, sequence, max_substitutions)
    found = _find_near_matches_substitutions_lp(
        subsequence, sequence, max_substitutions,
    )
    # any() pulls at most one item out of the generator.
    return any(True for _ in found)
def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                           max_substitutions):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * the number of character substitutions must be at most max_substitutions
    * no deletions or insertions are allowed

    Returns a list of Match objects sorted by start index, with a single
    match per start index. Raises ValueError if the subsequence is empty or
    max_substitutions is None or negative.
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    # The underlying generator may report the same alignment several times;
    # keep only the first match seen for each start index.
    first_match_by_start = {}
    for candidate in _find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions):
        first_match_by_start.setdefault(candidate.start, candidate)

    return [first_match_by_start[start]
            for start in sorted(first_match_by_start)]
def _find_near_matches_substitutions_ngrams(subsequence, sequence,
                                            max_substitutions):
    """Yield near-matches found by anchoring on exact n-gram occurrences.

    The subsequence is cut into consecutive, non-overlapping n-grams of
    length len(subsequence) // (max_substitutions + 1). Every exact
    occurrence of an n-gram in the sequence (searched only where a complete
    alignment of the subsequence fits) is extended to a full alignment, whose
    parts before and after the n-gram are then compared to the subsequence.

    Yields a Match for each alignment needing at most max_substitutions
    substitutions. The same alignment may be yielded more than once, and
    matches are not yielded in any particular overall order.

    Raises ValueError, upon first iteration, if the n-gram length would be
    zero.
    """
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_substitutions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_substitutions!"
        )

    for ngram_start in range(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        ngram = subsequence[ngram_start:ngram_end]
        subseq_head = subsequence[:ngram_start]
        subseq_tail = subsequence[ngram_end:]

        # Restrict the search so that there is room in the sequence for the
        # parts of the subsequence before and after the n-gram.
        search_from = ngram_start
        search_until = seq_len - (subseq_len - ngram_end)

        for hit in search_exact(ngram, sequence, search_from, search_until):
            match_start = hit - ngram_start
            match_end = match_start + subseq_len
            dist = 0

            # compare the part of the alignment preceding the n-gram
            seq_head = sequence[match_start:hit]
            if subseq_head != seq_head:
                dist += count_differences_with_maximum(
                    seq_head, subseq_head,
                    max_substitutions - dist + 1)
                if dist > max_substitutions:
                    continue

            # compare the part of the alignment following the n-gram
            seq_tail = sequence[hit + ngram_len:match_end]
            if subseq_tail != seq_tail:
                # the tail differs, so at least one more substitution is
                # needed; give up early if none are left
                if dist == max_substitutions:
                    continue
                dist += count_differences_with_maximum(
                    seq_tail, subseq_tail,
                    max_substitutions - dist + 1)
                if dist > max_substitutions:
                    continue

            yield Match(
                start=match_start,
                end=match_end,
                dist=dist,
            )
def has_near_match_substitutions_ngrams(subsequence, sequence,
                                        max_substitutions):
    """check whether there is a near-match of subsequence in sequence

    This looks for a near-match, where the nearly-matching part of the
    sequence must meet the following limitations (relative to the subsequence):

    * the number of character substitutions must be at most max_substitutions
    * no deletions or insertions are allowed

    Returns True as soon as a match is found, False if there is none. Raises
    ValueError if the subsequence is empty or max_substitutions is None or
    negative.
    """
    _check_arguments(subsequence, sequence, max_substitutions)
    found = _find_near_matches_substitutions_ngrams(
        subsequence, sequence, max_substitutions,
    )
    # any() pulls at most one item out of the generator.
    return any(True for _ in found)
# If the optional compiled extension is available, wrap the n-gram based
# search functions so that non-text input is first handed to the compiled
# implementations, keeping the pure-Python functions as the fallback.
try:
    from fuzzysearch._substitutions_only import (
        substitutions_only_has_near_matches_ngrams_byteslike,
        substitutions_only_find_near_matches_ngrams_byteslike as
        _subs_only_fnm_ngram_byteslike,
    )
except ImportError:
    # No compiled extension: the pure-Python functions stay in place.
    pass
else:
    py_has_near_match_substitutions_ngrams = has_near_match_substitutions_ngrams

    @wraps(py_has_near_match_substitutions_ngrams)
    def has_near_match_substitutions_ngrams(subsequence, sequence,
                                            max_substitutions):
        # The compiled implementation is only attempted when neither
        # argument is a text string.
        if (not isinstance(subsequence, six.text_type) and
                not isinstance(sequence, six.text_type)):
            try:
                return substitutions_only_has_near_matches_ngrams_byteslike(
                    subsequence, sequence, max_substitutions)
            except TypeError:
                # Input rejected by the compiled implementation; fall back
                # to the pure-Python one below.
                pass

        return py_has_near_match_substitutions_ngrams(
            subsequence, sequence, max_substitutions)

    py_find_near_matches_substitutions_ngrams = \
        find_near_matches_substitutions_ngrams

    @wraps(py_find_near_matches_substitutions_ngrams)
    def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                               max_substitutions):
        # The compiled implementation is only attempted when neither
        # argument is a text string.
        if (not isinstance(subsequence, six.text_type) and
                not isinstance(sequence, six.text_type)):
            try:
                start_indexes = _subs_only_fnm_ngram_byteslike(
                    subsequence, sequence, max_substitutions)
            except TypeError:
                # Input rejected by the compiled implementation; fall back
                # to the pure-Python one below.
                pass
            else:
                # Only start indexes are returned by the compiled function,
                # so the end and distance of each match are computed here.
                candidates = []
                for start in start_indexes:
                    end = start + len(subsequence)
                    dist = count_differences_with_maximum(
                        sequence[start:end],
                        subsequence,
                        max_substitutions + 1,
                    )
                    candidates.append(Match(start, end, dist))

                best_matches = []
                for group in group_matches(candidates):
                    best_matches.append(get_best_match_in_group(group))
                return best_matches

        return py_find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions)