"""Fuzzy searching allowing substitutions and insertions, but no deletions."""

__all__ = [
    'find_near_matches_no_deletions_ngrams',
]

import array

from fuzzysearch.common import Ngram, search_exact, Match


def _expand(subsequence, sequence, max_substitutions, max_insertions, max_l_dist):
    """Align `subsequence` against the beginning of `sequence`.

    Only substitutions and insertions (extra items present in `sequence`)
    are considered; deletions are never allowed.

    Returns a list of `(n_substitutions, n_insertions)` pairs which satisfy
    all three limits. The pairs are ordered by increasing number of
    insertions, and a pair is kept only if it needs strictly fewer
    substitutions than the previously kept pair. The list is empty when no
    alignment within the limits exists, including when `sequence` is shorter
    than `subsequence`.
    """
    if not subsequence:
        # Always return a list of pairs, since callers iterate over the
        # result and index into each item.
        return [(0, 0)]

    # With k insertions, the last item of the subsequence is compared with
    # sequence[len(subsequence) - 1 + k], so only insertion counts for which
    # that index exists are feasible. Without this cap, a `sequence` slice
    # truncated by the edge of the searched sequence raises an IndexError.
    max_insertions = min(max_insertions, len(sequence) - len(subsequence))
    if max_insertions < 0:
        return []

    # Calculate the minimum number of substitutions required for each number
    # of insertions between 0 and max_insertions.
    #
    # This is done using a "dynamic programming" algorithm.
    n_subs = array.array('L', [0] * (max_insertions + 1))
    for subseq_index, char in enumerate(subsequence):
        n_subs[0] += (char != sequence[subseq_index])
        for n_ins in range(1, max_insertions + 1):
            # Either place this item after exactly n_ins insertions, or
            # reuse the best result achieved with fewer insertions.
            n_subs[n_ins] = min(
                n_subs[n_ins] + (char != sequence[subseq_index + n_ins]),
                n_subs[n_ins - 1],
            )

    matches = [
        (_n_subs, _n_ins)
        for (_n_ins, _n_subs) in enumerate(n_subs)
        if _n_subs <= max_substitutions and _n_ins + _n_subs <= max_l_dist
    ]

    # Drop candidates which use more insertions without reducing the number
    # of substitutions.
    return [
        match
        for (i, match) in enumerate(matches)
        if i == 0 or match[0] < matches[i - 1][0]
    ]


def find_near_matches_no_deletions_ngrams(subsequence, sequence, search_params):
    """Search for near-matches of subsequence in sequence.

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the
    subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * no deletions are allowed
    * the total number of substitutions, insertions and deletions

    `search_params.unpacked` must yield the 4-tuple
    `(max_substitutions, max_insertions, max_deletions, max_l_dist)`; the
    `max_deletions` value is ignored here.

    Returns a list of `Match` objects (built with `start`, `end` and `dist`),
    sorted by their start index.

    Raises ValueError if the subsequence is empty, or if it is too short to
    be split into `max_substitutions + max_insertions + 1` non-empty n-grams.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_substitutions, max_insertions, _max_deletions, max_l_dist = \
        search_params.unpacked
    # Neither individual limit can usefully exceed the total distance limit.
    max_substitutions = min(max_substitutions, max_l_dist)
    max_insertions = min(max_insertions, max_l_dist)

    subseq_len = len(subsequence)
    seq_len = len(sequence)

    # Split the subsequence into enough n-grams so that, in any allowed
    # match, at least one of them must appear unchanged in the sequence.
    ngram_len = subseq_len // (max_substitutions + max_insertions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_subs + max_ins!"
        )

    # Without deletions, a match is at least as long as the subsequence.
    if seq_len < subseq_len:
        return []

    matches = []
    matched_indexes = set()

    for ngram_start in range(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        subseq_before = subsequence[:ngram_start]
        subseq_before_reversed = subseq_before[::-1]
        subseq_after = subsequence[ngram_end:]

        # Since deletions are not allowed, every item of the subsequence
        # before (after) the n-gram needs its own item in the sequence before
        # (after) the n-gram's occurrence. Insertions can only add to that,
        # so they never widen the window in which the n-gram may be found.
        start_index = ngram_start
        end_index = seq_len - len(subseq_after)

        for index in search_exact(
                subsequence[ngram_start:ngram_end], sequence,
                start_index, end_index,
        ):
            if index - ngram_start in matched_indexes:
                continue

            # The part of the sequence after the n-gram, long enough to
            # accommodate the maximum number of insertions (it is shorter
            # near the end of the sequence).
            seq_after = sequence[
                index + ngram_len:
                index + subseq_len - ngram_start + max_insertions
            ]
            if seq_after.startswith(subseq_after):
                matches_after = [(0, 0)]
            else:
                matches_after = _expand(
                    subseq_after, seq_after,
                    max_substitutions, max_insertions, max_l_dist,
                )
                if not matches_after:
                    continue

            # The cheapest way of matching the "after" part bounds the budget
            # which remains for the "before" part.
            _max_substitutions = \
                max_substitutions - min(m[0] for m in matches_after)
            _max_insertions = \
                max_insertions - min(m[1] for m in matches_after)
            _max_l_dist = \
                max_l_dist - min(m[0] + m[1] for m in matches_after)

            # Clamp the slice start at zero: a negative start index would
            # wrap around to the end of the sequence and yield a wrong slice.
            seq_before = sequence[
                max(0, index - ngram_start - _max_insertions):index
            ]
            if seq_before.endswith(subseq_before):
                matches_before = [(0, 0)]
            else:
                # Expand backwards from the n-gram by reversing both parts.
                matches_before = _expand(
                    subseq_before_reversed, seq_before[::-1],
                    _max_substitutions, _max_insertions, _max_l_dist,
                )

            for (subs_before, ins_before) in matches_before:
                for (subs_after, ins_after) in matches_after:
                    n_subs = subs_before + subs_after
                    n_ins = ins_before + ins_after
                    if (
                            n_subs <= max_substitutions and
                            n_ins <= max_insertions and
                            n_subs + n_ins <= max_l_dist
                    ):
                        match_start = index - ngram_start - ins_before
                        matches.append(Match(
                            start=match_start,
                            end=index - ngram_start + subseq_len + ins_after,
                            dist=n_subs + n_ins,
                        ))
                        # Remember the alignments covered by this match, so
                        # that later n-grams do not report it again.
                        matched_indexes.update(range(
                            match_start,
                            match_start + max_insertions + 1,
                        ))

    return sorted(matches, key=lambda match: match.start)