from fuzzysearch.common import Match, \
    group_matches, get_best_match_in_group, search_exact
from six.moves import xrange


__all__ = ['find_near_matches_levenshtein_ngrams']


def _expand(subsequence, sequence, max_l_dist):
    """Expand a partial match of a Levenshtein search.

    The expansion is anchored: it always starts at the first item of
    ``sequence``.  That restriction is what makes this simpler than a full
    search and leaves room for the optimizations below.

    The work is delegated to ``_expand_long`` when the sub-sequence is long
    relative to the allowed distance, and to ``_expand_short`` otherwise;
    whatever the chosen implementation returns is returned unchanged.
    """
    needle_len = len(subsequence)
    # Long sub-sequence with a comparatively small max distance: the more
    # elaborate algorithm pays off.  Otherwise use the simple one.
    long_threshold = max(max_l_dist * 2, 10)
    if needle_len > long_threshold:
        return _expand_long(subsequence, sequence, max_l_dist)
    return _expand_short(subsequence, sequence, max_l_dist)


def _py_expand_short(subsequence, sequence, max_l_dist):
    """Straightforward pure-Python partial match expansion.

    Returns ``(distance, length)`` where ``length`` is the number of leading
    items of ``sequence`` covered by the best expansion found, or
    ``(None, None)`` when the best distance exceeds ``max_l_dist``.  An empty
    ``subsequence`` yields ``(0, 0)``.
    """
    # One scoring step, for a single sub-sequence position.
    #
    # The new score is the minimum of:
    #  * diag, or diag + 1 (substitution, if the characters differ)
    #  * prior + 1 (deletion, i.e. skipping a sequence character)
    #  * score + 1 (insertion, i.e. skipping a sub-sequence character)
    #
    #   diag  -- +1 -->  score (before the update)
    #     |  \              |
    #    +1   +0 or +1     +1
    #     |        \        |
    #     v         v       v
    #   prior -- +1 -->  score (after the update)

    needle_len = len(subsequence)
    if needle_len == 0:
        return (0, 0)

    # Start from the cost of just skipping sub-sequence characters.
    row = list(range(1, needle_len + 1))

    best_score = needle_len
    best_end = -1

    for hay_idx, hay_char in enumerate(sequence):
        # Re-score every sub-sequence position against this character.
        diag = hay_idx
        score = diag + 1
        for needle_idx in range(needle_len):
            prior = row[needle_idx]
            score = min(
                diag + (hay_char != subsequence[needle_idx]),
                prior + 1,
                score + 1,
            )
            row[needle_idx] = score
            diag = prior

        # ``score`` now holds the cost of matching the entire sub-sequence
        # against the sequence up to and including this character.
        if score <= best_score:
            best_score = score
            best_end = hay_idx
        # Otherwise, stop as soon as no better expansion is possible.
        elif min(row) >= best_score:
            break

    if best_score <= max_l_dist:
        return (best_score, best_end + 1)
    return (None, None)


def _py_expand_long(subsequence, sequence, max_l_dist):
    """Pure-Python partial match expansion, tuned for long sub-sequences.

    Same return contract as ``_py_expand_short``: ``(distance, length)``, or
    ``(None, None)`` when the best distance exceeds ``max_l_dist``; an empty
    ``subsequence`` yields ``(0, 0)``.
    """
    # Compared with the short variant, only a window of the sub-sequence is
    # re-scored for each sequence character.  The window is limited to the
    # positions whose scores do not exceed the maximum allowed distance and,
    # once a good expansion has been found, is narrowed further to the
    # positions that could still beat the best expansion found so far.

    needle_len = len(subsequence)
    if needle_len == 0:
        return (0, 0)

    # Start from the cost of just skipping sub-sequence characters.
    row = list(range(1, needle_len + 1))

    best_score = needle_len
    best_end = -1
    score_cap = max_l_dist
    next_lo = 0
    next_hi = needle_len - 1

    for hay_idx, hay_char in enumerate(sequence):
        # Window of sub-sequence positions to re-score for this character.
        lo = next_lo
        hi = min(needle_len, next_hi + 1)

        diag = hay_idx
        score = diag + 1

        # Seed the window for the following character; ``None`` means that
        # no position with an acceptable score has been seen yet.
        if score <= score_cap:
            next_lo, next_hi = 0, 0
        else:
            next_lo, next_hi = None, -1

        for needle_idx in range(lo, hi):
            prior = row[needle_idx]
            score = min(
                diag + (hay_char != subsequence[needle_idx]),
                prior + 1,
                score + 1,
            )
            row[needle_idx] = score
            diag = prior

            if score <= score_cap:
                if next_lo is None:
                    next_lo = needle_idx
                # Each remaining unit of slack lets the window reach one
                # position further.
                next_hi = max(
                    next_hi,
                    needle_idx + 1 + (score_cap - score),
                )

        # Stop as soon as no better expansion is possible.
        if next_lo is None:
            break

        # Record a result only when the window reached the end of the
        # sub-sequence, i.e. ``score`` covers the entire sub-sequence.
        if hi == needle_len and score <= best_score:
            best_score = score
            best_end = hay_idx
            score_cap = min(score_cap, best_score)

    if best_score <= max_l_dist:
        return (best_score, best_end + 1)
    return (None, None)


# Use the C implementations when the extension module can be imported, and
# fall back to the pure-Python ones otherwise.
try:
    from fuzzysearch._levenshtein_ngrams import (
        c_expand_short as _c_expand_short,
        c_expand_long as _c_expand_long,
    )
except ImportError:
    _expand_short, _expand_long = _py_expand_short, _py_expand_long
else:
    _expand_short, _expand_long = _c_expand_short, _c_expand_long


def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
    """Find near-matches of ``subsequence`` in ``sequence`` using n-grams.

    The sub-sequence is cut into consecutive n-grams of length
    ``len(subsequence) // (max_l_dist + 1)``.  Each n-gram is searched for
    exactly, and every hit is expanded to the right and then to the left
    with ``_expand``, sharing the ``max_l_dist`` budget between both sides.

    Returns a sorted list of ``Match`` objects, one per group of matches as
    grouped by ``group_matches``.

    Raises ``ValueError`` when the computed n-gram length is zero.
    """
    needle_len = len(subsequence)
    hay_len = len(sequence)

    ngram_len = needle_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError('the subsequence length must be greater than max_l_dist')

    candidates = []
    for ngram_start in xrange(0, needle_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        ngram = subsequence[ngram_start:ngram_end]
        # The part before the n-gram is matched leftwards, so it is reversed.
        before_reversed = subsequence[:ngram_start][::-1]
        after = subsequence[ngram_end:]

        # Only search where the n-gram could sit within an acceptable match.
        search_from = max(0, ngram_start - max_l_dist)
        search_to = min(hay_len, hay_len - needle_len + ngram_end + max_l_dist)

        for index in search_exact(ngram, sequence, search_from, search_to):
            # Expand to the right of the exact n-gram hit first...
            right_limit = index - ngram_start + needle_len + max_l_dist
            dist_right, right_expand_size = _expand(
                after,
                sequence[index + ngram_len:right_limit],
                max_l_dist,
            )
            if dist_right is None:
                continue

            # ...then to the left, with whatever distance budget is left.
            left_budget = max_l_dist - dist_right
            left_floor = max(0, index - ngram_start - left_budget)
            dist_left, left_expand_size = _expand(
                before_reversed,
                sequence[left_floor:index][::-1],
                left_budget,
            )
            if dist_left is None:
                continue

            total_dist = dist_left + dist_right
            assert total_dist <= max_l_dist

            candidates.append(Match(
                start=index - left_expand_size,
                end=index + ngram_len + right_expand_size,
                dist=total_dist,
            ))

    # Don't return overlapping matches; instead, group overlapping matches
    # together and return the best match from each group.
    return sorted(
        get_best_match_in_group(group)
        for group in group_matches(candidates)
    )