141 lines
6.2 KiB
Python
141 lines
6.2 KiB
Python
from fuzzysearch import find_near_matches
|
||
from nltk.corpus import stopwords
|
||
from nltk.stem.snowball import SnowballStemmer
|
||
from concurrent.futures import ThreadPoolExecutor, wait
|
||
|
||
def compare(input, texts, length=5, threshold=1):
    '''Uses fuzzysearch's Levenshtein search to find matches of n-length windows.

    Slides a window of `length` characters over `input` and fuzzy-searches
    each window in every text, allowing up to `threshold` edits per match.
    Returns a list of (match, text_index, window) tuples, in window order.
    '''
    # Every contiguous slice of `length` characters from the input.
    windows = [input[i:i + length] for i in range(len(input) - length + 1)]
    results = []
    for window in windows:
        for text_index, text in enumerate(texts):
            results.extend(
                (hit, text_index, window)
                for hit in find_near_matches(window, text, max_l_dist=threshold)
            )
    return results
|
||
|
||
class NaturalLP():
    '''Used for NLP.

    Inits with a language, picking the matching Snowball stemmer and
    stop-word set used for preprocessing and comparison.
    '''

    def __init__(self, lang="english"):
        '''Inits the stemmer and stop-word set for the given language.'''
        self.stem = SnowballStemmer(lang)
        self.stopwords = set(stopwords.words(lang))

    def preprocess(self, input):
        '''Stop-word removal followed by stemming.

        Returns (stemmed_tokens, kept_indices) where kept_indices holds the
        original position of every token that survived stop-word removal.
        '''
        kept = [(i, tok) for i, tok in enumerate(input)
                if tok not in self.stopwords]
        stemmed = [self.stem.stem(tok) for _, tok in kept]
        indices = [i for i, _ in kept]
        return stemmed, indices

    def bulkpreprocess(self, input, threads):
        '''Bulk multithreaded preprocess: one preprocess() call per item.

        Results are returned in the same order as the input items.
        '''
        with ThreadPoolExecutor(max_workers=threads) as executor:
            futures = [executor.submit(self.preprocess, item) for item in input]
            wait(futures)
            return [f.result() for f in futures]

    def addstopword(self, stopwords):
        '''Add a single word (str) or a list of words to the stop-word set.

        Arguments of any other type are silently ignored, matching the
        original best-effort behaviour.
        '''
        if isinstance(stopwords, str):
            self.stopwords.add(stopwords)
        elif isinstance(stopwords, list):
            self.stopwords.update(stopwords)
|
||
|
||
def shingle(input, k):
    '''Shingles the input into k-length ngrams.

    For k < 2 the input is returned unchanged; otherwise a list of every
    contiguous k-length slice is returned (empty when k exceeds the input
    length).
    '''
    if k < 2:
        return input
    return [input[start:start + k] for start in range(len(input) - k + 1)]
|
||
|
||
def shin_matches(shin1, shin2):
    '''Returns a list of (i, j) index tuples for every pair of equal shingles.

    Pairs are emitted in shin1 order, then shin2 order. Comparison is by
    equality, so shingles need not be hashable.
    '''
    return [
        (i, left_gram)[0:1] + (j,)
        for i, left_gram in enumerate(shin1)
        for j, right_gram in enumerate(shin2)
        if left_gram == right_gram
    ]
|
||
|
||
def cluster_old(matches, gap, miin):
    '''Clusters matches based on Chebyshev distance.

    gap is the maximum distance between a point and a cluster member for the
    point to join that cluster; miin is the minimum cluster size kept in the
    output. Deprecated in favour of cluster(); kept for reference.

    Fixes over the previous revision:
    - no longer appends the None return value of list.extend() to the output
      (which then crashed on len(None) in the final filter)
    - the exclusion set now holds cluster *indices*; previously cluster lists
      were tested for membership in a list of integers, which never matched
    - new singleton clusters are no longer appended to the cluster list while
      it is being iterated (which made each point re-match its own fresh
      singleton at distance 0)
    - an empty matches list no longer raises IndexError
    '''
    if not matches:
        return []

    # Initial clustering: a point joins every existing cluster that has a
    # member within gap; if none qualifies it starts a new singleton cluster.
    clusters = [[matches[0]]]
    for point in matches[1:]:
        joined = False
        for members in clusters:
            if any(max(abs(point[0] - m[0]), abs(point[1] - m[1])) < gap
                   for m in members):
                members.append(point)
                joined = True
        if not joined:
            clusters.append([point])

    # Duplicate check: clusters that share a point must be merged.
    merges = []
    for i, members in enumerate(clusters):
        for point in members:
            for j, other in enumerate(clusters[i + 1:], i + 1):
                if (i, j) not in merges and point in other:
                    merges.append((i, j))

    # Cluster merging: fold the second cluster of each pair into the first
    # and drop the absorbed one. NOTE(review): transitive chains through an
    # already-absorbed cluster are not followed -- same limitation as the
    # original; cluster() is the supported implementation.
    absorbed = set()
    for i, j in merges:
        if j in absorbed:
            continue
        clusters[i].extend(p for p in clusters[j] if p not in clusters[i])
        absorbed.add(j)

    kept = [c for i, c in enumerate(clusters) if i not in absorbed]
    return [c for c in kept if len(c) >= miin]
|
||
|
||
def cluster(matches, gap, miin):
    '''Clusters matches based on Chebyshev distance.

    A point joins the first existing cluster that has a member within gap.
    If the point is also within gap of a later cluster, that cluster is
    merged into the point's cluster (the point bridges the two). Returns the
    clusters holding at least miin points.

    Fixes over the previous revision:
    - the index of the first matching cluster was stored in a flag tested
      with `if not merge`, so index 0 read as "no cluster found", duplicating
      the point into later clusters and appending a spurious singleton
    - merged-away clusters were "emptied" by rebinding a local name (y = []),
      which left the duplicate cluster in the cluster list and the output
    '''
    clusters = []
    for point in matches:
        home = None  # index of the cluster this point was added to
        for i, members in enumerate(clusters):
            if any(max(abs(point[0] - m[0]), abs(point[1] - m[1])) <= gap
                   for m in members):
                if home is None:
                    # First cluster within reach: the point joins it.
                    members.append(point)
                    home = i
                else:
                    # The point bridges two clusters: fold this one into the
                    # home cluster and empty it *in place* so the size filter
                    # below drops it.
                    clusters[home].extend(
                        p for p in members if p not in clusters[home])
                    members.clear()
        if home is None:
            clusters.append([point])
    return [c for c in clusters if len(c) >= miin]
|
||
|
||
def get_dist(matches):
    '''Gets the Chebyshev distance to the closest other match of every point.

    matches is a list of clusters (lists of (x, y) points). For each point the
    distance to its nearest distinct neighbour in the same cluster is
    reported, capped at the 255 sentinel; 255 is also returned when the point
    has no distinct neighbour.
    '''
    CAP = 255  # sentinel preserved from the original implementation
    result = []
    for points in matches:
        per_point = []
        for a in points:
            nearest = min(
                (max(abs(a[0] - b[0]), abs(a[1] - b[1]))
                 for b in points if b != a),
                default=CAP,
            )
            per_point.append(min(nearest, CAP))
        result.append(per_point)
    return result
|
||
|