PCQRSCANER/venv/Lib/site-packages/Hooke/compare.py
2019-12-22 21:51:47 +01:00

141 lines
6.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from fuzzysearch import find_near_matches
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from concurrent.futures import ThreadPoolExecutor, wait
def compare(input, texts, length=5, threshold=1):
    '''Fuzzy-search every fixed-length window of ``input`` in each text.

    ``input`` is cut into all consecutive slices of ``length`` items and
    each slice is looked up in every element of ``texts`` with
    fuzzysearch's ``find_near_matches`` (``max_l_dist=threshold``).

    Returns a list of ``(match, text_index, window)`` tuples, ordered by
    window first, then by text index, then by the order in which
    ``find_near_matches`` yields its results. The list is empty when
    ``input`` is shorter than ``length``.
    '''
    # Build every window up front, exactly one per valid start offset.
    windows = [
        input[start:start + length]
        for start in range(len(input) - length + 1)
    ]

    found = []
    for window in windows:
        for position, text in enumerate(texts):
            hits = find_near_matches(window, text, max_l_dist=threshold)
            found.extend((hit, position, window) for hit in hits)
    return found
class NaturalLP:
    '''Token preprocessing helper: stop-word removal plus stemming.

    An instance is bound to one language. It holds a Snowball stemmer for
    that language in ``self.stem`` and a mutable set of stop words in
    ``self.stopwords``.
    '''

    def __init__(self, lang="english"):
        '''Create the stemmer and load the NLTK stop-word list for ``lang``.'''
        self.stem = SnowballStemmer(lang)
        self.stopwords = set(stopwords.words(lang))

    def preprocess(self, input):
        '''Drop stop words from ``input`` and stem the remaining tokens.

        Returns a tuple ``(stemmed, positions)`` where ``stemmed`` is the
        list of stemmed tokens that were kept and ``positions`` holds, for
        each kept token, its index in the original ``input``.
        '''
        output = []
        dic = []
        for index, token in enumerate(input):
            # The stop-word test is done on the raw token, before stemming.
            if token not in self.stopwords:
                output.append(self.stem.stem(token))
                dic.append(index)
        return output, dic

    def bulkpreprocess(self, input, threads):
        '''Run ``preprocess`` on every element of ``input`` in a thread pool.

        ``threads`` is passed as ``max_workers`` to the pool. The returned
        list has one ``preprocess`` result per input element, in input
        order.
        '''
        with ThreadPoolExecutor(max_workers=threads) as executor:
            # Executor.map keeps the results in submission order.
            return list(executor.map(self.preprocess, input))

    def addstopword(self, stopwords):
        '''Add one word, or a collection of words, to the stop-word set.

        A ``str`` is added as a single word. A list, tuple, set or
        frozenset has each of its elements added. Any other type is
        ignored without raising, as before.
        '''
        if isinstance(stopwords, str):
            self.stopwords.add(stopwords)
        elif isinstance(stopwords, (list, tuple, set, frozenset)):
            # set.update accepts any iterable, so no intermediate copy.
            self.stopwords.update(stopwords)
def shingle(input, k):
    '''Split ``input`` into its overlapping k-grams (shingles).

    For ``k`` below 2 the input is handed back unchanged. Otherwise the
    result is a list with one slice of ``k`` consecutive items per start
    position; it is empty when ``input`` has fewer than ``k`` items.
    '''
    if k >= 2:
        last_start = len(input) - k
        return [input[start:start + k] for start in range(last_start + 1)]
    return input
def shin_matches(shin1, shin2):
    '''Find every pair of equal shingles between two shingle sequences.

    Returns a list of ``(i, j)`` tuples such that ``shin1[i] == shin2[j]``,
    ordered by ``i`` and then by ``j``.
    '''
    return [
        (left_pos, right_pos)
        for left_pos, left in enumerate(shin1)
        for right_pos, right in enumerate(shin2)
        if left == right
    ]
def cluster_old(matches, gap, miin):
    '''Cluster matches by Chebyshev distance (legacy strict-gap variant).

    ``matches`` is a sequence of points indexable as ``p[0]``, ``p[1]``.
    Two points are linked when their Chebyshev distance
    ``max(|dx|, |dy|)`` is strictly below ``gap``; a cluster is a set of
    points connected through such links. Only clusters holding at least
    ``miin`` points are returned.

    Returns a list of clusters, each a list of points. A cluster formed by
    merging keeps the position of the earliest cluster involved. An empty
    ``matches`` gives an empty list.

    The previous implementation raised ``IndexError`` on empty input,
    appended the ``None`` returned by ``list.extend`` to its output (so the
    final size filter raised ``TypeError``), and could list the same point
    twice. This version builds disjoint clusters in a single pass.
    '''
    clusters = []
    for point in matches:
        # Every existing cluster that has a member strictly closer than gap.
        touching = [
            group for group in clusters
            if any(
                max(abs(point[0] - other[0]), abs(point[1] - other[1])) < gap
                for other in group
            )
        ]
        if not touching:
            clusters.append([point])
            continue

        # The point joins the first touching cluster. Any further touching
        # clusters are now linked through this point, so fold them in.
        home = touching[0]
        home.append(point)
        absorbed = touching[1:]
        for group in absorbed:
            home.extend(group)
        if absorbed:
            # Compare by identity: two clusters may hold equal points.
            clusters = [
                group for group in clusters
                if not any(group is gone for gone in absorbed)
            ]
    return [group for group in clusters if len(group) >= miin]
def cluster(matches, gap, miin):
    '''Cluster matches by Chebyshev distance in a single pass.

    ``matches`` is a sequence of points indexable as ``p[0]``, ``p[1]``.
    A point belongs to a cluster when its Chebyshev distance
    ``max(|dx|, |dy|)`` to at least one member is ``<= gap``. A point that
    is close to several clusters joins the first one, and the others are
    merged into it. Only clusters with at least ``miin`` points are
    returned.

    Returns a list of clusters, each a list of points, in order of
    creation. An empty ``matches`` gives an empty list.
    '''
    clusters = []
    for point in matches:
        # ``home`` is the cluster the point was placed in. It must be
        # tested against None: a cluster index would be falsy for
        # cluster 0, which made the point look unplaced.
        home = None
        absorbed = []
        for members in clusters:
            touches = any(
                max(abs(point[0] - other[0]), abs(point[1] - other[1])) <= gap
                for other in members
            )
            if not touches:
                continue
            if home is None:
                members.append(point)
                home = members
            else:
                # The point bridges two clusters: move the members that
                # are not already present into the first one.
                home.extend([other for other in members if other not in home])
                absorbed.append(members)

        if home is None:
            clusters.append([point])
        elif absorbed:
            # Drop the merged-away clusters for real; rebinding a loop
            # variable would leave them in the list. Compare by identity.
            clusters = [
                members for members in clusters
                if not any(members is gone for gone in absorbed)
            ]
    return [members for members in clusters if len(members) >= miin]
def get_dist(matches, max_dist=255):
    '''Give, for every point, the distance to its nearest neighbour.

    ``matches`` is a sequence of clusters, each a sequence of points
    indexable as ``p[0]``, ``p[1]``. Distances are Chebyshev distances
    ``max(|dx|, |dy|)`` and are measured only between points of the same
    cluster. Points equal to the one being measured are skipped, so exact
    duplicates do not count as neighbours.

    ``max_dist`` caps the reported distance. It is also the value reported
    for a point with no other distinct point in its cluster. The default
    of 255 is the cap that used to be hard-coded.

    Returns a list with one list per cluster, holding one distance per
    point in cluster order.
    '''
    output = []
    for group in matches:
        nearest_per_point = []
        for point in group:
            nearest = min(
                (
                    max(abs(point[0] - other[0]), abs(point[1] - other[1]))
                    for other in group
                    if point != other
                ),
                default=max_dist,
            )
            # Listing max_dist first makes the cap win on a tie.
            nearest_per_point.append(min(max_dist, nearest))
        output.append(nearest_per_point)
    return output