PCQRSCANER/venv/Lib/site-packages/Hooke/compare.py

from fuzzysearch import find_near_matches
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from concurrent.futures import ThreadPoolExecutor, wait

def compare(input, texts, length=5, threshold=1):
    '''Uses fuzzysearch's Levenshtein search to find matches of length-sized slices of input in texts'''
    matches = []
    query = []
    # Slice the input into overlapping windows of the given length
    for n in range(len(input) - length + 1):
        query.append(input[n:n + length])
    # Search every text for every window, allowing up to `threshold` edits
    for q in query:
        for index, t in enumerate(texts):
            for x in find_near_matches(q, t, max_l_dist=threshold):
                matches.append((x, index, q))
    return matches
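
# Usage sketch (illustrative only; the strings below are made up):
#   hits = compare("needle in a haystack", ["hay, haystacks and needles", "nothing here"],
#                  length=6, threshold=1)
#   # -> a list of (fuzzysearch Match, text index, query slice) tuples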


class NaturalLP():
    '''Used for NLP.
    Inits with a language, picks stop words for comparison.
    '''

    def __init__(self, lang="english"):
        '''Inits to a specific language'''
        self.stem = SnowballStemmer(lang)
        self.stopwords = set(stopwords.words(lang))

    def preprocess(self, input):
        '''Stop word removal and stemming; returns the stems and the original indices of the kept tokens'''
        output = []
        dic = []
        for index, x in enumerate(input):
            if x not in self.stopwords:
                output.append(self.stem.stem(x))
                dic.append(index)
        return output, dic

    def bulkpreprocess(self, input, threads):
        '''Bulk multithreaded preprocess function'''
        output = []
        pre = self.preprocess
        with ThreadPoolExecutor(max_workers=threads) as executor:
            futures = []
            for x in input:
                futures.append(executor.submit(pre, x))
            wait(futures)
            for x in futures:
                output.append(x.result())
        return output

    def addstopword(self, stopwords):
        '''Add a single word or a list of words to the stop words'''
        if isinstance(stopwords, str):
            self.stopwords.add(stopwords)
        elif isinstance(stopwords, list):
            self.stopwords.update(set(stopwords))
        # Anything else is silently ignored
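
# Usage sketch (illustrative only; the token lists below are made up):
#   nlp = NaturalLP("english")
#   nlp.addstopword(["etc", "ie"])
#   stems, kept_indices = nlp.preprocess(["the", "cats", "were", "running"])
#   # bulkpreprocess does the same for several token lists in parallel:
#   results = nlp.bulkpreprocess([["the", "cats"], ["running", "fast"]], threads=2)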


def shingle(input, k):
    '''Shingles the input into k-length n-grams'''
    if k < 2:
        return input
    output = []
    for index in range(len(input) - k + 1):
        output.append(input[index:index + k])
    return output


def shin_matches(shin1, shin2):
    '''Returns a list of (i, j) index tuples where shin1[i] equals shin2[j]'''
    output = []
    for i, x in enumerate(shin1):
        for j, y in enumerate(shin2):
            if x == y:
                output.append((i, j))
    return output
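
# Usage sketch (illustrative only; the sentences below are made up):
#   a = shingle("the quick brown fox".split(), 2)
#   b = shingle("a quick brown dog".split(), 2)
#   points = shin_matches(a, b)   # -> [(1, 1)] for the shared ['quick', 'brown'] shingle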


def cluster_old(matches, gap, miin):
    '''Clusters matches based on Chebyshev distance, with gap as the maximum distance and miin as the minimum cluster size'''
    # Initial clustering: a point joins every existing cluster within `gap`; if none is close, it starts a new one
    temp = [[matches[0]]]
    for x in matches[1:]:
        placed = False
        for y in temp[:]:
            near = False
            for z in y:
                if max(abs(x[0] - z[0]), abs(x[1] - z[1])) < gap:
                    near = True
                    break
            if near:
                y.append(x)  # A point may join several clusters; those get merged below
                placed = True
        if not placed:
            temp.append([x])
    # Cluster duplicate check: find pairs of clusters that share a point
    merges = []
    for i, x in enumerate(temp):
        for y in x:
            for j, z in enumerate(temp[i + 1:], i + 1):
                if (i, j) not in merges and y in z:
                    merges.append((i, j))
    # Cluster merging: fold each duplicate pair into the lower-indexed cluster
    output = []
    exclude = []
    for i, j in merges:
        temp[i].extend([p for p in temp[j] if p not in temp[i]])
        output.append(temp[i])
        exclude.extend([i, j])
    output.extend([x for i, x in enumerate(temp) if i not in exclude])
    return [x for x in output if len(x) >= miin]


def cluster(matches, gap, miin):
    '''Much improved version of clustering (cluster_old)'''
    clusters = []
    for x in matches:  # For every matching point
        merge = None  # Index of the first cluster the point joined, if any
        for i, y in enumerate(clusters):  # For every cluster
            for z in y:  # For every point in that cluster
                if max(abs(x[0] - z[0]), abs(x[1] - z[1])) <= gap:  # Check if the distance is small enough
                    if merge is None:  # First cluster within range
                        y.append(x)  # Add the point to it
                        merge = i  # Remember its index for possible merges
                    else:  # The point also fits a later cluster
                        clusters[merge].extend([k for k in y if k not in clusters[merge]])  # Fold it into the first one
                        y.clear()  # Empty the later cluster in place
                    break  # Move on to the next cluster
        if merge is None:  # No cluster was within range
            clusters.append([x])  # Start a new cluster with just this point
    return [c for c in clusters if len(c) >= miin]  # Keep only clusters of at least the minimum size
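
# Usage sketch (illustrative only; the points and parameters are made up):
#   points = [(0, 0), (1, 1), (2, 1), (40, 40), (41, 41)]
#   cluster(points, gap=2, miin=2)
#   # -> [[(0, 0), (1, 1), (2, 1)], [(40, 40), (41, 41)]]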


def get_dist(matches):
    '''For every point in each cluster, gets the Chebyshev distance to the closest other point in that cluster'''
    output = []
    for x in matches:
        dic = []
        for y in x:
            min_dist = 255  # Large starting value; distances are assumed to stay below it
            for z in x:
                if y != z and max(abs(y[0] - z[0]), abs(y[1] - z[1])) < min_dist:
                    min_dist = max(abs(y[0] - z[0]), abs(y[1] - z[1]))
            dic.append(min_dist)
        output.append(dic)
    return output
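
# Usage sketch (illustrative only; reuses the clusters from the cluster() example above):
#   get_dist([[(0, 0), (1, 1), (2, 1)], [(40, 40), (41, 41)]])
#   # -> [[1, 1, 1], [1, 1]]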