290 lines
11 KiB
Python
290 lines
11 KiB
Python
|
from collections import deque, defaultdict
|
||
|
from itertools import islice
|
||
|
from functools import wraps
|
||
|
|
||
|
import six
|
||
|
|
||
|
|
||
|
from fuzzysearch.common import Match, search_exact, \
|
||
|
count_differences_with_maximum, get_best_match_in_group, group_matches
|
||
|
|
||
|
|
||
|
def _check_arguments(subsequence, sequence, max_substitutions):
|
||
|
if not subsequence:
|
||
|
raise ValueError('Given subsequence is empty!')
|
||
|
|
||
|
if max_substitutions is None or max_substitutions < 0:
|
||
|
raise ValueError('Maximum number of substitutions must be >= 0!')
|
||
|
|
||
|
|
||
|
def has_near_match_substitutions(subsequence, sequence, max_substitutions):
    """Return True if the sequence contains a near-match of the subsequence.

    Only substitutions are considered. After validating the arguments, one
    of three search strategies is used:

    * an exact search when `max_substitutions` is zero;
    * the n-gram based search when
      `len(subsequence) // (max_substitutions + 1)` is at least 3;
    * the "lp" search otherwise.

    Raises ValueError for an empty subsequence or a missing/negative
    `max_substitutions` (see `_check_arguments`).
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    # No substitutions allowed: a single exact occurrence settles it.
    if max_substitutions == 0:
        return any(True for _ in search_exact(subsequence, sequence))

    ngram_len = len(subsequence) // (max_substitutions + 1)
    searcher = (
        has_near_match_substitutions_ngrams if ngram_len >= 3
        else has_near_match_substitutions_lp
    )
    return searcher(subsequence, sequence, max_substitutions)
|
||
|
|
||
|
|
||
|
def find_near_matches_substitutions(subsequence, sequence, max_substitutions):
    """Find near-matches of the subsequence in the sequence.

    Picks a suitable substitution-only search implementation based on the
    given parameters:

    * an exact search when `max_substitutions` is zero;
    * the n-gram based search when
      `len(subsequence) // (max_substitutions + 1)` is at least 3;
    * the "lp" search otherwise.

    Returns a list of fuzzysearch.Match objects describing the matching
    parts of the sequence.

    Raises ValueError for an empty subsequence or a missing/negative
    `max_substitutions` (see `_check_arguments`).
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    if max_substitutions == 0:
        # every exact occurrence is a match with zero substitutions
        subseq_len = len(subsequence)
        exact_matches = []
        for start in search_exact(subsequence, sequence):
            exact_matches.append(Match(start, start + subseq_len, 0))
        return exact_matches

    if len(subsequence) // (max_substitutions + 1) >= 3:
        find = find_near_matches_substitutions_ngrams
    else:
        find = find_near_matches_substitutions_lp
    return find(subsequence, sequence, max_substitutions)
|
||
|
|
||
|
|
||
|
def find_near_matches_substitutions_lp(subsequence, sequence,
                                       max_substitutions):
    """Search for near-matches of subsequence in sequence.

    A part of the sequence is reported as a near-match when, relative to
    the subsequence:

    * the number of substituted items is at most max_substitutions
    * there are no deletions or insertions

    Returns a list of Match objects, as produced by
    `_find_near_matches_substitutions_lp`.

    Raises ValueError for an empty subsequence or a missing/negative
    `max_substitutions` (see `_check_arguments`).
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    match_iter = _find_near_matches_substitutions_lp(
        subsequence, sequence, max_substitutions,
    )
    return [match for match in match_iter]
|
||
|
|
||
|
|
||
|
def _find_near_matches_substitutions_lp(subsequence, sequence,
|
||
|
max_substitutions):
|
||
|
# simple optimization: prepare some often used things in advance
|
||
|
_SUBSEQ_LEN = len(subsequence)
|
||
|
_SUBSEQ_LEN_MINUS_ONE = _SUBSEQ_LEN - 1
|
||
|
|
||
|
# prepare quick lookup of where a character appears in the subsequence
|
||
|
char_indexes_in_subsequence = defaultdict(list)
|
||
|
for (index, char) in enumerate(subsequence):
|
||
|
char_indexes_in_subsequence[char].append(index)
|
||
|
|
||
|
# we'll iterate over the sequence once, but the iteration is split into two
|
||
|
# for loops; therefore we prepare an iterator in advance which will be used
|
||
|
# in both of the loops
|
||
|
sequence_enum_iter = enumerate(sequence)
|
||
|
|
||
|
# We'll count the number of matching characters assuming various attempted
|
||
|
# alignments of the subsequence to the sequence. At any point in the
|
||
|
# sequence there will be N such alignments to update. We'll keep
|
||
|
# these in a "circular array" (a.k.a. a ring) which we'll rotate after each
|
||
|
# iteration to re-align the indexing.
|
||
|
|
||
|
# Initialize the candidate counts by iterating over the first N-1 items in
|
||
|
# the sequence. No possible matches in this step!
|
||
|
candidates = deque([0], maxlen=_SUBSEQ_LEN)
|
||
|
for (index, char) in islice(sequence_enum_iter, _SUBSEQ_LEN_MINUS_ONE):
|
||
|
for subseq_index in [idx for idx in char_indexes_in_subsequence[char] if idx <= index]:
|
||
|
candidates[subseq_index] += 1
|
||
|
candidates.appendleft(0)
|
||
|
|
||
|
# From the N-th item onwards, we'll update the candidate counts exactly as
|
||
|
# above, and additionally check if the part of the sequence whic began N-1
|
||
|
# items before the current index was a near enough match to the given
|
||
|
# sub-sequence.
|
||
|
for (index, char) in sequence_enum_iter:
|
||
|
for subseq_index in char_indexes_in_subsequence[char]:
|
||
|
candidates[subseq_index] += 1
|
||
|
|
||
|
# rotate the ring of candidate counts
|
||
|
candidates.rotate(1)
|
||
|
# fetch the count for the candidate which started N-1 items ago
|
||
|
n_substitutions = _SUBSEQ_LEN - candidates[0]
|
||
|
# set the count for the next index to zero
|
||
|
candidates[0] = 0
|
||
|
|
||
|
# if the candidate had few enough mismatches, yield a match
|
||
|
if n_substitutions <= max_substitutions:
|
||
|
yield Match(
|
||
|
start=index - _SUBSEQ_LEN_MINUS_ONE,
|
||
|
end=index + 1,
|
||
|
dist=n_substitutions,
|
||
|
)
|
||
|
|
||
|
|
||
|
def has_near_match_substitutions_lp(subsequence, sequence, max_substitutions):
    """Return True if the "lp" search finds at least one near-match.

    The search stops as soon as the first match is produced.

    Raises ValueError for an empty subsequence or a missing/negative
    `max_substitutions` (see `_check_arguments`).
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    found = _find_near_matches_substitutions_lp(
        subsequence, sequence, max_substitutions,
    )
    # a private sentinel tells "exhausted" apart from any yielded value
    nothing = object()
    return next(found, nothing) is not nothing
|
||
|
|
||
|
|
||
|
def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                           max_substitutions):
    """Search for near-matches of subsequence in sequence using n-grams.

    A part of the sequence is reported as a near-match when, relative to
    the subsequence:

    * the number of substituted items is at most max_substitutions
    * there are no deletions or insertions

    Returns a list of Match objects sorted by start index, with at most one
    match per start index (the first one produced by the underlying search).

    Raises ValueError for an empty subsequence or a missing/negative
    `max_substitutions` (see `_check_arguments`).
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    # the underlying generator may report the same start more than once;
    # remember only the first match seen for each start index
    first_match_by_start = {}
    for candidate in _find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions):
        first_match_by_start.setdefault(candidate.start, candidate)

    return [
        first_match_by_start[start]
        for start in sorted(first_match_by_start)
    ]
|
||
|
|
||
|
|
||
|
def _find_near_matches_substitutions_ngrams(subsequence, sequence,
|
||
|
max_substitutions):
|
||
|
subseq_len = len(subsequence)
|
||
|
seq_len = len(sequence)
|
||
|
|
||
|
ngram_len = subseq_len // (max_substitutions + 1)
|
||
|
if ngram_len == 0:
|
||
|
raise ValueError(
|
||
|
"The subsequence's length must be greater than max_substitutions!"
|
||
|
)
|
||
|
|
||
|
for ngram_start in range(0, len(subsequence) - ngram_len + 1, ngram_len):
|
||
|
ngram_end = ngram_start + ngram_len
|
||
|
subseq_before = subsequence[:ngram_start]
|
||
|
subseq_after = subsequence[ngram_end:]
|
||
|
for index in search_exact(
|
||
|
subsequence[ngram_start:ngram_end], sequence,
|
||
|
ngram_start, seq_len - (subseq_len - ngram_end),
|
||
|
):
|
||
|
n_substitutions = 0
|
||
|
seq_before = sequence[index - ngram_start:index]
|
||
|
if subseq_before != seq_before:
|
||
|
n_substitutions += count_differences_with_maximum(
|
||
|
seq_before, subseq_before,
|
||
|
max_substitutions - n_substitutions + 1)
|
||
|
if n_substitutions > max_substitutions:
|
||
|
continue
|
||
|
|
||
|
seq_after = sequence[index + ngram_len:index - ngram_start + subseq_len]
|
||
|
if subseq_after != seq_after:
|
||
|
if n_substitutions == max_substitutions:
|
||
|
continue
|
||
|
n_substitutions += count_differences_with_maximum(
|
||
|
seq_after, subseq_after,
|
||
|
max_substitutions - n_substitutions + 1)
|
||
|
if n_substitutions > max_substitutions:
|
||
|
continue
|
||
|
|
||
|
yield Match(
|
||
|
start=index - ngram_start,
|
||
|
end=index - ngram_start + subseq_len,
|
||
|
dist=n_substitutions,
|
||
|
)
|
||
|
|
||
|
|
||
|
def has_near_match_substitutions_ngrams(subsequence, sequence,
                                        max_substitutions):
    """Return True if the n-gram search finds at least one near-match.

    A part of the sequence counts as a near-match when, relative to the
    subsequence:

    * the number of substituted items is at most max_substitutions
    * there are no deletions or insertions

    The search stops as soon as the first match is produced.

    Raises ValueError for an empty subsequence or a missing/negative
    `max_substitutions` (see `_check_arguments`), and also when the
    subsequence is not longer than `max_substitutions`.
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    candidates = _find_near_matches_substitutions_ngrams(
        subsequence, sequence, max_substitutions,
    )
    return any(True for _ in candidates)
|
||
|
|
||
|
|
||
|
# When the optional fuzzysearch._substitutions_only module can be imported,
# wrap the n-gram searches so that non-text input is first handed to its
# "byteslike" implementations. The pure-Python versions stay available under
# py_* names and remain the fallback.
try:
    from fuzzysearch._substitutions_only import (
        substitutions_only_has_near_matches_ngrams_byteslike,
        substitutions_only_find_near_matches_ngrams_byteslike
        as _subs_only_fnm_ngram_byteslike,
    )
except ImportError:
    # module not available: keep the pure-Python functions as they are
    pass
else:
    py_has_near_match_substitutions_ngrams = has_near_match_substitutions_ngrams

    @wraps(py_has_near_match_substitutions_ngrams)
    def has_near_match_substitutions_ngrams(subsequence, sequence,
                                            max_substitutions):
        # text (unicode) input always goes to the pure-Python version
        involves_text = (
            isinstance(subsequence, six.text_type) or
            isinstance(sequence, six.text_type)
        )
        if not involves_text:
            try:
                return substitutions_only_has_near_matches_ngrams_byteslike(
                    subsequence, sequence, max_substitutions)
            except TypeError:
                # NOTE(review): presumably raised for input that is not
                # bytes-like -- confirm; fall through to pure Python
                pass

        return py_has_near_match_substitutions_ngrams(
            subsequence, sequence, max_substitutions)

    py_find_near_matches_substitutions_ngrams = \
        find_near_matches_substitutions_ngrams

    @wraps(py_find_near_matches_substitutions_ngrams)
    def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                               max_substitutions):
        # text (unicode) input always goes to the pure-Python version
        involves_text = (
            isinstance(subsequence, six.text_type) or
            isinstance(sequence, six.text_type)
        )
        if not involves_text:
            try:
                start_indexes = _subs_only_fnm_ngram_byteslike(
                    subsequence, sequence, max_substitutions)
            except TypeError:
                # NOTE(review): presumably raised for input that is not
                # bytes-like -- confirm; fall through to pure Python
                pass
            else:
                # only start indexes come back, so the distance of each
                # match is recomputed here
                subseq_len = len(subsequence)
                candidates = []
                for start in start_indexes:
                    end = start + subseq_len
                    dist = count_differences_with_maximum(
                        sequence[start:end],
                        subsequence,
                        max_substitutions + 1,
                    )
                    candidates.append(Match(start, end, dist))

                # keep a single, best match out of every group of matches
                best_matches = []
                for group in group_matches(candidates):
                    best_matches.append(get_best_match_in_group(group))
                return best_matches

        return py_find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions)
|