PCQRSCANER/venv/Lib/site-packages/Hooke/order.py

168 lines
5.0 KiB
Python
Raw Normal View History

2019-12-22 21:51:47 +01:00
class Match:
    '''Main class of the match

    Variables
        start   = Start of the match
        end     = End of the match
        density = A depth map of the distance of the match
        s_start = The start of the match from the source
        s_end   = The end of the match from the source
        source  = Source of the Match
        text1   = Part of the text (filled in by find_text)
        text2   = Match in other text (filled in by find_text)

    Methods
        mean      = Find average distance of the match
        find_text = Cut the matching excerpts out of both texts
        print     = Print both excerpts
    '''
    density = None
    s_start = None
    s_end = None
    source = None
    text1 = None
    text2 = None

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def mean(self):
        '''Return the average of ``density``.

        Prints a warning and returns None when ``density`` is unset or empty.
        '''
        if not self.density:
            print("Warning: No density specified, Skipping...")
            return None
        return sum(self.density) / len(self.density)

    @staticmethod
    def _excerpt(text, start, end, extra):
        '''Return text[start-extra:end+extra] with the lower bound clamped to 0.

        Falls back to the unpadded slice text[start:end] when a bound is not
        a number (e.g. still None), mirroring the original best-effort path.
        '''
        try:
            # A negative lower bound would wrap around and slice from the end
            # of the text, so it is clamped to the beginning instead.
            low = max(0, start - extra)
            high = end + extra
        except TypeError:
            return text[start:end]
        return text[low:high]

    def find_text(self, text1, text2, extra = 5):
        '''Store the matched excerpts of both texts in text1 and text2.

        text1 is sliced with start/end and text2 with s_start/s_end; each
        slice is widened by ``extra`` positions on both sides where possible.
        '''
        self.text1 = self._excerpt(text1, self.start, self.end, extra)
        self.text2 = self._excerpt(text2, self.s_start, self.s_end, extra)

    def print(self):
        '''Print the document excerpt followed by the source excerpt.'''
        print(f"Document:\n {self.text1}\n\nSource:\n{self.text2}\n\n\n\n")
def match_elements(matches):
    '''Flatten (match, source, text) triples into plain lists.

    Every triple becomes [match.dist, text, source, match.start, match.end];
    the results are returned in input order.
    '''
    return [
        [found.dist, text, source, found.start, found.end]
        for found, source, text in matches
    ]
def source_sort(matches, leng):
    '''Organize matches by source.

    Returns a list of ``leng`` lists; each match is placed in the list whose
    position equals the match's element at index 2, keeping input order.
    '''
    buckets = [list() for _ in range(leng)]
    for entry in matches:
        source_index = entry[2]
        buckets[source_index].append(entry)
    return buckets
def check_merges(matches):
    '''Label the matches of every source with a merge-group number.

    For each source the result is a list of (match, group) tuples. The first
    match gets group 0; the group number goes up by one whenever a match's
    element 3 is greater than element 4 of the match before it. A source
    that raises IndexError (for instance an empty one) yields None.
    '''
    labelled_sources = []
    for source in matches:
        try:
            group = 0
            labelled = [(source[0], group)]
            for current in source[1:]:
                begins = current[3]
                previous_end = labelled[-1][0][4]
                if begins > previous_end:
                    group += 1
                labelled.append((current, group))
        except IndexError:
            labelled = None
        labelled_sources.append(labelled)
    return labelled_sources
def print_matches(matches, searchurls, ex=()):
    '''Prints Matches

    For every source position not listed in ``ex``, prints how many matches
    came from the URL at the same position in ``searchurls``.

    matches    = One collection of matches per source (entries may be None)
    searchurls = URLs, looked up by the position of the source
    ex         = Positions to skip

    Returns the list of positions that were printed. Sources without a length
    (e.g. None) or without a corresponding URL are skipped silently.
    '''
    exclude = []
    for count, source in enumerate(matches):
        if count in ex:
            continue
        try:
            amount = len(source)
            url = searchurls[count]
        except (TypeError, LookupError):
            # No usable source or no URL for this position: nothing to report.
            continue
        print(amount, "match (es) from", url)
        exclude.append(count)
    return exclude
def separate_matches(matches):
    '''Separates matches based on last classification

    Each source is expected to be a sequence of (match, group) pairs. The
    group number of the last pair decides how many groups are created; every
    match is then appended to the group named by its own number.

    Returns one list of groups per source, or None for a source that is
    None or empty.
    '''
    output = []
    for source in matches:
        try:
            groups = [[] for _ in range(source[-1][-1] + 1)]
            for pair in source:
                groups[pair[1]].append(pair[0])
        except (TypeError, IndexError):
            # TypeError: the source is None; IndexError: it is empty or malformed.
            groups = None
        output.append(groups)
    return output
def join_matches(matches):
    '''Joins each array of the same match, putting the texts together and averaging its distance

    Every source is a list of groups and every group a list of matches, read
    as: element 0 = distance, element 1 = text (a list), element 3 = start,
    element 4 = end.

    A group becomes [average distance, joined text, start of the first
    match, end of the last match]. The joined text is the first text without
    its last item, followed by the last item of every text in the group.

    Returns one list of joined groups per source, or None for a source that
    is None or contains an empty or malformed group.
    '''
    output = []
    for source in matches:
        joined = []
        try:
            for group in source:
                # The slice copies, so the first match's text is not modified.
                text = group[0][1][:-1]
                total = 0
                for item in group:
                    total += item[0]
                    text.append(item[1][-1])
                joined.append([total / len(group), text, group[0][3], group[-1][4]])
        except (TypeError, IndexError):
            # TypeError: the source is None; IndexError: empty or short entries.
            joined = None
        output.append(joined)
    return output
def de_preprocess(matches, dic1, dic2, dist):
    '''Takes the preprocessed match and makes it normal, and takes both dictionaries

    matches = Clusters of (document index, source index) pairs
    dic1    = Lookup applied to the document index of every pair
    dic2    = Lookup applied to the source index of the first and last pair
    dist    = dist[i][j] is the distance of pair j in cluster i

    Returns one tuple (start, end, mstart, mend, dens) per cluster, where
    dens has one slot per position from start to end; slots without a known
    distance stay None.

    NOTE(review): presumably dic1/dic2 map preprocessed positions back to
    positions in the original texts - confirm against the caller.
    '''
    output = []
    for i, cluster in enumerate(matches):
        # Look up each pair's own document index. The enumerate position was
        # used here before, which ignored the cluster's contents entirely.
        newcluster = [dic1[pair[0]] for pair in cluster]
        start = newcluster[0]
        end = newcluster[-1]
        mstart = dic2[cluster[0][1]]
        mend = dic2[cluster[-1][1]]
        dens = [None] * (1 + end - start)  # Where None means not yet known
        for j, position in enumerate(newcluster):
            dens[position - start] = dist[i][j]
        output.append((start, end, mstart, mend, dens))
    return output
def bilinear(dens):
    '''Linearly interpolates the match accuracy for positions between known ones

    dens = List of lists in which None marks a position that is not yet known

    Every None lying between two known values is replaced, in place, by the
    rounded value on the straight line joining them. None entries before the
    first or after the last known value are left untouched.

    Returns the same ``dens`` object.
    '''
    for row in dens:
        known = [i for i, value in enumerate(row) if value is not None]
        # Walk over neighbouring known positions and fill the gap between them.
        for left, right in zip(known, known[1:]):
            gap = right - left
            step = (row[right] - row[left]) / gap
            value = row[left]
            for offset in range(1, gap):
                value += step
                row[left + offset] = round(value)
    return dens
def shingle_final(input, dens):
    '''Turns the raw match entries into Match objects

    Each entry supplies start (index 0), end (index 1), s_start (index 2)
    and s_end (index 3). Every Match receives the same ``dens`` object as
    its density.

    NOTE(review): the whole ``dens`` is assigned to every match rather than
    one entry per match - confirm this is what the caller expects.
    '''
    results = []
    for entry in input:
        found = Match(start=entry[0], end=entry[1])
        found.s_start, found.s_end = entry[2], entry[3]
        found.density = dens
        results.append(found)
    return results