from fuzzysearch import find_near_matches
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from concurrent.futures import ThreadPoolExecutor, wait
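
# Dependency note (added for illustration, not part of the original file): the
# fuzzysearch and nltk packages must be installed, and the NLTK stop word
# corpus fetched once, e.g. with nltk.download("stopwords"), before the
# NaturalLP class below can be used.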

def compare(text, texts, length=5, threshold=1):
    '''Uses fuzzysearch's Levenshtein search to find near-matches of every
    n-gram of the given length from text inside each entry of texts.

    Returns a list of (Match, text_index, ngram) tuples.
    '''
    matches = []
    query = []
    for n in range(len(text) - length + 1):  # Slide a window of size length over the input
        query.append(text[n:n + length])
    for q in query:
        for index, t in enumerate(texts):
            for x in find_near_matches(q, t, max_l_dist=threshold):
                matches.append((x, index, q))
    return matches
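
# Illustrative usage (a sketch, not part of the original module; the sample
# strings are hypothetical). fuzzysearch Match objects expose .start, .end
# and .dist:
#
#   hits = compare("brown fox", ["a browm fox ran", "lorem ipsum"],
#                  length=5, threshold=1)
#   for match, text_index, ngram in hits:
#       print(text_index, repr(ngram), match.start, match.end, match.dist)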

class NaturalLP():
    '''Natural language preprocessing helper.

    Initialised with a language; picks the stemmer and the stop words
    used for comparison.
    '''

    def __init__(self, lang="english"):
        '''Initialises the stemmer and stop word set for the given language'''
        self.stem = SnowballStemmer(lang)
        self.stopwords = set(stopwords.words(lang))

    def preprocess(self, tokens):
        '''Stop word removal and stemming.

        Returns the stemmed tokens plus the original index of each kept
        token, so results can be mapped back to positions in the input.
        '''
        output = []
        indices = []
        for index, x in enumerate(tokens):
            if x not in self.stopwords:
                output.append(self.stem.stem(x))
                indices.append(index)
        return output, indices

    def bulkpreprocess(self, docs, threads):
        '''Bulk multithreaded preprocess function'''
        output = []
        pre = self.preprocess
        with ThreadPoolExecutor(max_workers=threads) as executor:
            futures = [executor.submit(pre, x) for x in docs]
            wait(futures)
            for future in futures:
                output.append(future.result())
        return output

    def addstopword(self, words):
        '''Adds a word or a list of words to the stop word set'''
        # "words" rather than "stopwords" avoids shadowing the imported module
        if isinstance(words, str):
            self.stopwords.add(words)
        elif isinstance(words, list):
            self.stopwords.update(words)
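
# Illustrative usage (a sketch, not part of the original module; the token
# list is hypothetical). preprocess expects an already tokenised, lowercased
# list:
#
#   nlp = NaturalLP("english")
#   nlp.addstopword(["etc", "ie"])
#   stems, indices = nlp.preprocess(["the", "cats", "were", "running", "etc"])
#   # stems   -> ["cat", "run"]
#   # indices -> [1, 3]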

def shingle(seq, k):
    '''Shingles the input into its k-length n-grams (one shingle per offset)'''
    if k < 2:
        return seq
    output = []
    for index in range(len(seq) - k + 1):
        output.append(seq[index:index + k])
    return output

def shin_matches(shin1, shin2):
    '''Returns a list of (i, j) tuples where shingle i of shin1 equals shingle j of shin2'''
    output = []
    for i, x in enumerate(shin1):
        for j, y in enumerate(shin2):
            if x == y:
                output.append((i, j))
    return output
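
# Illustrative usage (a sketch, not part of the original module): shingling
# two token lists and collecting the coordinates of the shared 2-grams.
#
#   a = shingle(["a", "b", "c", "d"], 2)   # [["a","b"], ["b","c"], ["c","d"]]
#   b = shingle(["x", "b", "c", "d"], 2)   # [["x","b"], ["b","c"], ["c","d"]]
#   shin_matches(a, b)                     # [(1, 1), (2, 2)]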

def cluster_old(matches, gap, miin):
    '''Clusters matches by Chebyshev distance, with gap as the maximum
    distance and miin as the minimum cluster size (superseded by cluster below)'''
    if not matches:
        return []
    # Initial clustering: append each point to every cluster that already
    # contains a point within gap, or open a new cluster if none does
    temp = [[matches[0]]]
    for x in matches[1:]:
        placed = False
        for y in temp:
            for z in y:
                if max(abs(x[0] - z[0]), abs(x[1] - z[1])) < gap:
                    y.append(x)
                    placed = True
                    break
        if not placed:
            temp.append([x])
    # Cluster duplicate check: clusters sharing a point must be merged
    merges = []
    for i, x in enumerate(temp):
        for y in x:
            for j, z in enumerate(temp[i+1:], i+1):
                if (i, j) not in merges and y in z:
                    merges.append((i, j))
    # Cluster merging (extend mutates in place and returns None, so the
    # merged cluster itself is what gets collected)
    output = []
    exclude = []
    for i, j in merges:
        temp[i].extend([p for p in temp[j] if p not in temp[i]])
        output.append(temp[i])
        exclude.extend([i, j])
    output.extend([x for i, x in enumerate(temp) if i not in exclude])
    return [x for x in output if len(x) >= miin]

def cluster(matches, gap, miin):
    '''Much improved version of the clustering in cluster_old'''
    clusters = []
    for x in matches:  # For every matching point
        merge = None   # Index of the cluster the point landed in; None means not
                       # placed yet (False would collide with cluster index 0)
        for i, y in enumerate(clusters):  # For every cluster
            for z in y:  # For every point in that cluster
                if max(abs(x[0] - z[0]), abs(x[1] - z[1])) <= gap:  # Check if the distance is small enough
                    if merge is None:  # The point has not been placed yet
                        y.append(x)    # Add the point to that cluster
                        merge = i      # Remember the cluster; later close clusters merge into it
                    else:              # The point already sits in an earlier cluster
                        clusters[merge].extend(k for k in y if k not in clusters[merge])  # Fold the non-repeating values into the first cluster
                        y.clear()      # Empty this cluster in place (rebinding y would leave it intact)
                    break              # Go to the next cluster
        if merge is None:  # No cluster was within gap
            clusters.append([x])  # Create a new cluster with just this point
    return [x for x in clusters if len(x) >= miin]  # Return clusters of minimum size
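
# Illustrative usage (a sketch, not part of the original module): two diagonal
# runs of match coordinates collapse into one cluster each; the stray point is
# dropped by the minimum-size filter.
#
#   pts = [(0, 0), (1, 1), (2, 2), (40, 40), (10, 30), (11, 31)]
#   cluster(pts, gap=2, miin=2)  # [[(0,0), (1,1), (2,2)], [(10,30), (11,31)]]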

def get_dist(matches):
    '''Gets the Chebyshev distance from every point to its closest neighbour, per cluster'''
    output = []
    for x in matches:
        dists = []
        for y in x:
            min_dist = 255  # Upper bound acting as a sentinel; kept for lone points
            for z in x:
                d = max(abs(y[0] - z[0]), abs(y[1] - z[1]))
                if y != z and d < min_dist:
                    min_dist = d
            dists.append(min_dist)
        output.append(dists)
    return output
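
# Minimal end-to-end sketch of the pure-Python part of the pipeline (added for
# illustration; the sample sentences are hypothetical). Shingle two token
# lists, collect the coordinates of shared shingles, cluster them, and report
# each point's nearest-neighbour distance.
if __name__ == "__main__":
    doc1 = "the quick brown fox jumps over the lazy dog".split()
    doc2 = "one quick brown fox leapt over a lazy dog today".split()
    pairs = shin_matches(shingle(doc1, 2), shingle(doc2, 2))
    groups = cluster(pairs, gap=2, miin=2)
    print("matches:", pairs)
    print("clusters:", groups)
    print("distances:", get_dist(groups))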