PCQRSCANER/venv/Lib/site-packages/fuzzysearch/common.py

import sys
from collections import namedtuple
from functools import wraps
from six.moves import range, zip
__all__ = [
    'Match', 'Ngram', 'LevenshteinSearchParams',
    'search_exact', 'count_differences_with_maximum',
    'group_matches', 'get_best_match_in_group',
]

CLASSES_WITH_INDEX = (list, tuple)

if sys.version_info >= (3,):
    CLASSES_WITH_FIND = (bytes, str)
else:
    CLASSES_WITH_FIND = (str, unicode)

# BioPython's Seq objects also support .find(), so include them when available.
try:
    from Bio.Seq import Seq
except ImportError:
    pass
else:
    CLASSES_WITH_FIND += (Seq,)

Match = namedtuple('Match', ['start', 'end', 'dist'])
Ngram = namedtuple('Ngram', ['start', 'end'])

class LevenshteinSearchParams(object):
    """Validate and normalize the max_* limits of a Levenshtein search."""
    def __init__(self,
                 max_substitutions=None,
                 max_insertions=None,
                 max_deletions=None,
                 max_l_dist=None):
        self.check_params_valid(max_substitutions, max_insertions,
                                max_deletions, max_l_dist)
        self.max_substitutions = max_substitutions
        self.max_insertions = max_insertions
        self.max_deletions = max_deletions
        self.max_l_dist = self._get_max_l_dist(
            max_substitutions, max_insertions,
            max_deletions, max_l_dist,
        )

    @property
    def unpacked(self):
        return self.max_substitutions, self.max_insertions, self.max_deletions, self.max_l_dist

    @classmethod
    def check_params_valid(cls,
                           max_substitutions, max_insertions,
                           max_deletions, max_l_dist):
        if not all(x is None or (isinstance(x, int) and x >= 0)
                   for x in
                   [max_substitutions, max_insertions, max_deletions, max_l_dist]):
            raise TypeError("All limits must be positive integers or None.")

        if max_l_dist is None:
            n_limits = (
                (1 if max_substitutions is not None else 0) +
                (1 if max_insertions is not None else 0) +
                (1 if max_deletions is not None else 0)
            )
            if n_limits < 3:
                if n_limits == 0:
                    raise ValueError('No limitations given!')
                elif max_substitutions is None:
                    raise ValueError('# substitutions must be limited!')
                elif max_insertions is None:
                    raise ValueError('# insertions must be limited!')
                elif max_deletions is None:
                    raise ValueError('# deletions must be limited!')

    @classmethod
    def _get_max_l_dist(cls,
                        max_substitutions, max_insertions,
                        max_deletions, max_l_dist):
        # Unset limits are treated as effectively unlimited via a large sentinel.
        bignum = 1 << 29
        maxes_sum = (
            (max_substitutions if max_substitutions is not None else bignum) +
            (max_insertions if max_insertions is not None else bignum) +
            (max_deletions if max_deletions is not None else bignum)
        )
        return (
            max_l_dist
            if max_l_dist is not None and max_l_dist <= maxes_sum
            else maxes_sum
        )
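
# Illustrative sketch (not part of the original module): when max_l_dist is left
# unset but all three edit-type limits are given, it resolves to their sum:
#
#     params = LevenshteinSearchParams(max_substitutions=1,
#                                      max_insertions=1,
#                                      max_deletions=1)
#     params.unpacked  # -> (1, 1, 1, 3)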

def search_exact(subsequence, sequence, start_index=0, end_index=None):
    """Yield the start index of each exact occurrence of subsequence in sequence.

    Only the range [start_index, end_index) of sequence is searched.
    """
    if not subsequence:
        raise ValueError('subsequence must not be empty')

    if end_index is None:
        end_index = len(sequence)

    if isinstance(sequence, CLASSES_WITH_FIND):
        # str/bytes (and Bio.Seq) provide an efficient .find() method.
        def find_in_index_range(start_index):
            return sequence.find(subsequence, start_index, end_index)
    elif isinstance(sequence, CLASSES_WITH_INDEX):
        # list/tuple: locate the first item with .index(), then compare the rest.
        first_item = subsequence[0]
        first_item_last_index = end_index - (len(subsequence) - 1)

        def find_in_index_range(start_index):
            while True:
                try:
                    first_index = sequence.index(first_item, start_index,
                                                 first_item_last_index)
                    start_index = first_index + 1
                except ValueError:
                    return -1
                for subseq_index in range(1, len(subsequence)):
                    if sequence[first_index + subseq_index] != subsequence[subseq_index]:
                        break
                else:
                    return first_index
    else:
        raise TypeError('unsupported sequence type: %s' % type(sequence))

    index = find_in_index_range(start_index)
    while index >= 0:
        yield index
        index = find_in_index_range(index + 1)
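
# Illustrative sketch (not part of the original module): search_exact yields the
# start index of every exact occurrence within [start_index, end_index):
#
#     list(search_exact('abc', 'xabcyabcz'))       # -> [1, 5]
#     list(search_exact([1, 2], [0, 1, 2, 1, 2]))  # -> [1, 3]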

def count_differences_with_maximum(sequence1, sequence2, max_differences):
    """Count differing positions of two sequences, stopping at max_differences."""
    n_different = 0
    for item1, item2 in zip(sequence1, sequence2):
        if item1 != item2:
            n_different += 1
            if n_different == max_differences:
                return n_different
    return n_different
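
# Illustrative sketch (not part of the original module): counting stops as soon as
# the cap is reached:
#
#     count_differences_with_maximum('abcd', 'axcx', 10)  # -> 2
#     count_differences_with_maximum('abcd', 'axcx', 1)   # -> 1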

try:
    from fuzzysearch._common import count_differences_with_maximum_byteslike, \
        search_exact_byteslike
except ImportError:
    pass
else:
    # Prefer the compiled C implementations when available; fall back to the
    # pure-Python versions for inputs the byteslike variants cannot handle.
    _count_differences_with_maximum = count_differences_with_maximum

    @wraps(_count_differences_with_maximum)
    def count_differences_with_maximum(sequence1, sequence2, max_differences):
        try:
            return count_differences_with_maximum_byteslike(sequence1,
                                                            sequence2,
                                                            max_differences)
        except TypeError:
            return _count_differences_with_maximum(sequence1, sequence2,
                                                   max_differences)

    _search_exact = search_exact

    @wraps(_search_exact)
    def search_exact(subsequence, sequence, start_index=0, end_index=None):
        if end_index is None:
            end_index = len(sequence)
        try:
            return search_exact_byteslike(subsequence, sequence,
                                          start_index, end_index)
        except (TypeError, UnicodeEncodeError):
            return _search_exact(subsequence, sequence, start_index, end_index)

class GroupOfMatches(object):
    def __init__(self, match):
        assert match.start <= match.end
        self.start = match.start
        self.end = match.end
        self.matches = set([match])

    def is_match_in_group(self, match):
        return not (match.end <= self.start or match.start >= self.end)

    def add_match(self, match):
        self.matches.add(match)
        self.start = min(self.start, match.start)
        self.end = max(self.end, match.end)
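
# Illustrative sketch (not part of the original module): is_match_in_group() uses
# half-open interval semantics, so ranges that merely touch do not overlap:
#
#     g = GroupOfMatches(Match(0, 5, 0))
#     g.is_match_in_group(Match(5, 8, 0))  # -> False (touching only)
#     g.is_match_in_group(Match(4, 8, 0))  # -> True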

def group_matches(matches):
    """Group matches into sets of mutually overlapping matches."""
    groups = []
    for match in matches:
        overlapping_groups = [g for g in groups if g.is_match_in_group(match)]
        if not overlapping_groups:
            groups.append(GroupOfMatches(match))
        elif len(overlapping_groups) == 1:
            overlapping_groups[0].add_match(match)
        else:
            # The match bridges several existing groups: merge them into one.
            new_group = GroupOfMatches(match)
            for group in overlapping_groups:
                for match in group.matches:
                    new_group.add_match(match)
            groups = [g for g in groups if g not in overlapping_groups]
            groups.append(new_group)
    return [group.matches for group in groups]
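
# Illustrative sketch (not part of the original module): matches whose [start, end)
# ranges overlap end up in the same group; disjoint matches get their own group.
#
#     m1, m2, m3 = Match(0, 5, 1), Match(3, 8, 0), Match(10, 12, 2)
#     group_matches([m1, m2, m3])  # -> [{m1, m2}, {m3}]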

def get_best_match_in_group(group):
    # return longest match amongst those with the shortest distance
    return min(group, key=lambda match: (match.dist, -(match.end - match.start)))
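
# Illustrative sketch (not part of the original module): the smallest distance wins,
# and among equal distances the longest match wins.
#
#     matches = {Match(0, 5, 1), Match(0, 6, 1), Match(2, 3, 0)}
#     get_best_match_in_group(matches)  # -> Match(start=2, end=3, dist=0)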