168 lines
5.0 KiB
Python
168 lines
5.0 KiB
Python
|
class Match:
    '''Main class of the match

    Variables
        start = Start of the match
        end = End of the match
        density = A depth map of the distance of the match
        s_start = The start of the match from the source
        s_end = The end of the match from the source
        source = Source of the Match
        text1 = Part of the text (filled in by find_text)
        text2 = Match in other text (filled in by find_text)

    Methods
        mean = Find average distance of the match
        find_text = Cut the matched spans (plus some context) out of both texts
        print = Print both text snippets
    '''
    # Class-level defaults; instances overwrite them by plain attribute
    # assignment once the values are known.
    density = None
    s_start = None
    s_end = None
    source = None
    text1 = None
    text2 = None

    def __init__(self, start, end):
        '''Store the start and end of the match.'''
        self.start = start
        self.end = end

    def mean(self):
        '''Return the arithmetic mean of density.

        Prints a warning and returns None when density is unset or empty.
        '''
        if not self.density:
            print("Warning: No density specified, Skipping...")
            return None
        return sum(self.density) / len(self.density)

    @staticmethod
    def _window(text, start, end, extra):
        '''Return text[start-extra:end+extra] with the lower bound clamped at 0.

        A negative lower bound would make the slice count from the END of
        the text and return the wrong (usually empty) snippet, so it is
        clamped.  When start or end is not a number yet (e.g. still None)
        the plain text[start:end] slice is used instead.
        '''
        try:
            lower = max(0, start - extra)
            upper = end + extra
        except TypeError:
            return text[start:end]
        return text[lower:upper]

    def find_text(self, text1, text2, extra = 5):
        '''Fill text1 and text2 with the matched spans plus surrounding context.

        text1 is cut using start/end and text2 using s_start/s_end; each
        span is widened by extra positions on both sides where available.
        '''
        self.text1 = self._window(text1, self.start, self.end, extra)
        self.text2 = self._window(text2, self.s_start, self.s_end, extra)

    def print(self):
        '''Print the document snippet followed by the source snippet.'''
        print(f"Document:\n {self.text1}\n\nSource:\n{self.text2}\n\n\n\n")
|
||
|
|
||
|
def match_elements(matches):
    '''Flatten (match, source, text) triples into plain lists.

    Every entry of matches is unpacked into exactly three items.  The
    result holds one list per entry, laid out as
    [match.dist, text, source, match.start, match.end].
    '''
    return [
        [found.dist, words, origin, found.start, found.end]
        for found, origin, words in matches
    ]
|
||
|
|
||
|
def source_sort(matches, leng):
    '''Organize elements by source

    Builds leng empty buckets and drops every element of matches into
    the bucket selected by its item at index 2.  Returns the buckets.
    '''
    buckets = []
    for _ in range(leng):
        buckets.append([])
    for element in matches:
        source_index = element[2]
        buckets[source_index].append(element)
    return buckets
|
||
|
|
||
|
def check_merges(matches):
    '''Labels the elements of every source with a group number

    Each source becomes a list of (element, group) tuples.  The group
    number starts at 0 and goes up by one whenever an element's item 3
    is greater than item 4 of the element right before it; otherwise the
    element keeps the group of its predecessor.  A source that raises
    IndexError (for example an empty one) is reported as None.
    '''
    output = []
    for source in matches:
        try:
            group = 0
            previous = source[0]
            labelled = [(previous, group)]
            for current in source[1:]:
                if current[3] > previous[4]:
                    group += 1
                labelled.append((current, group))
                previous = current
        except IndexError:
            output.append(None)
        else:
            output.append(labelled)
    return output
|
||
|
|
||
|
def print_matches(matches, searchurls, ex = None):
    '''Prints how many matches every source produced

    matches    = one entry per source; its len() is the number printed
    searchurls = looked up with the same position as the source
    ex         = positions to leave out (optional)

    Returns the positions that were actually printed.  Sources that have
    no length (e.g. None) or no entry in searchurls are skipped silently.
    '''
    # None instead of a shared mutable [] default.
    skipped = () if ex is None else ex
    printed = []
    for index, source in enumerate(matches):
        if index in skipped:
            continue
        try:
            amount = len(source)
            url = searchurls[index]
        except (TypeError, IndexError, KeyError):
            # Best effort: a source that cannot be reported is ignored.
            continue
        print(amount, "match (es) from", url)
        printed.append(index)
    return printed
|
||
|
|
||
|
def separate_matches(matches):
    '''Separates matches based on last classification

    Every source is a sequence of (element, group) pairs.  The group of
    the LAST pair decides how many groups are created; each element is
    then appended to the list of its own group.  A source that cannot be
    indexed (None) or is empty is reported as None.
    '''
    output = []
    for source in matches:
        try:
            groups = [[] for _ in range(source[-1][-1] + 1)]
            for pair in source:
                groups[pair[1]].append(pair[0])
        except (TypeError, IndexError):
            # Only the failures of a missing/empty/malformed source are
            # absorbed; KeyboardInterrupt and friends are not swallowed.
            output.append(None)
        else:
            output.append(groups)
    return output
|
||
|
|
||
|
|
||
|
def join_matches(matches):
    '''Joins each array of the same match, putting the texts together and averaging its distance

    Every source is a sequence of groups and every group a sequence of
    elements.  A group is collapsed into
    [mean of item 0, joined text, item 3 of the first element,
     item 4 of the last element].
    The joined text is the first element's text (item 1) without its last
    entry, followed by the last entry of every element's text.
    A source that cannot be processed (None, an empty group, ...) is
    reported as None.
    '''
    output = []
    for source in matches:
        joined = []
        try:
            for group in source:
                first, last = group[0], group[-1]
                # Slicing copies, so the input text is left untouched.
                words = first[1][:-1]
                words.extend(element[1][-1] for element in group)
                distance = sum(element[0] for element in group) / len(group)
                joined.append([distance, words, first[3], last[4]])
        except (TypeError, IndexError, AttributeError):
            # Only data-shape failures are absorbed, not every exception.
            joined = None
        output.append(joined)
    return output
|
||
|
|
||
|
def de_preprocess(matches, dic1, dic2, dist):
    '''Takes the preprocessed match and makes it normal, and takes both dictionaries

    matches = clusters; every cluster is a sequence of index pairs
              (presumably (document index, source index) -- confirm
              against the caller)
    dic1    = maps item 0 of a pair to its de-preprocessed position
    dic2    = maps item 1 of a pair to its de-preprocessed position
    dist    = dist[i][j] is the distance of pair j in cluster i

    Returns one (start, end, mstart, mend, dens) tuple per cluster where
    dens has a slot for every position from start to end.
    '''
    output = []
    for i, cluster in enumerate(matches):
        # Look up the pair's own first index.  Iterating enumerate(cluster)
        # and taking x[0] would look up the loop counter 0, 1, 2, ...
        # instead, unlike the dic2 lookups below.
        positions = [dic1[pair[0]] for pair in cluster]
        start = positions[0]
        end = positions[-1]
        mstart = dic2[cluster[0][1]]
        mend = dic2[cluster[-1][1]]
        dens = [None] * (1 + end - start)  # Where None means not yet known
        for j, position in enumerate(positions):
            dens[position - start] = dist[i][j]
        output.append((start, end, mstart, mend, dens))
    return output
|
||
|
|
||
|
def bilinear(dens):
    '''Linearly interpolates the match accuracy for positions between existing ones

    dens is a sequence of lists in which None marks an unknown value.
    Every run of None that lies between two known values is filled with
    rounded, evenly stepped values.  None entries before the first or
    after the last known value are left alone.  The lists are changed in
    place and dens itself is returned.
    '''
    for row in dens:
        known = [i for i, value in enumerate(row) if value is not None]
        # Neighbouring known positions; only the slots strictly between
        # them are written, so the known values never change.
        for left, right in zip(known, known[1:]):
            gap = right - left
            step = (row[right] - row[left]) / gap
            running = row[left]
            for offset in range(1, gap):
                running += step
                row[left + offset] = round(running)
    return dens
|
||
|
|
||
|
def shingle_final(input, dens):
    '''Turns every entry of input into a Match object

    An entry supplies start (item 0), end (item 1), s_start (item 2) and
    s_end (item 3).  Returns the Match objects in input order.
    '''
    def build(bounds):
        found = Match(start = bounds[0], end = bounds[1])
        found.s_start = bounds[2]
        # NOTE(review): every Match receives the very same dens object;
        # confirm whether a per-match density was intended here.
        found.density = dens
        found.s_end = bounds[3]
        return found

    return [build(bounds) for bounds in input]
|