class Match:
    '''A matched span between a document and a source text.

    Attributes:
        start: Start index of the match in the document.
        end: End index of the match in the document.
        density: Sequence of per-position distances for the match.
        s_start: Start index of the match in the source text.
        s_end: End index of the match in the source text.
        source: Source the match belongs to.
        text1: Excerpt of the document around the match (set by find_text).
        text2: Excerpt of the source around the match (set by find_text).
    '''

    density = None
    s_start = None
    s_end = None
    source = None
    text1 = None
    text2 = None

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def mean(self):
        '''Return the average of ``density``.

        Prints a warning and returns None when no density is set (or it is
        empty).
        '''
        if not self.density:
            print("Warning: No density specified, Skipping...")
            return None
        return sum(self.density) / len(self.density)

    @staticmethod
    def _context(text, start, end, extra):
        '''Return ``text[start:end]`` widened by ``extra`` on both sides.'''
        try:
            # Clamp at 0: a negative lower bound would make the slice wrap
            # around from the end of the text instead of starting at the
            # beginning (slices never raise for out-of-range bounds).
            lower = max(0, start - extra)
            upper = end + extra
        except TypeError:
            # Bounds are not numeric (e.g. still None): use them unpadded.
            lower, upper = start, end
        return text[lower:upper]

    def find_text(self, text1, text2, extra = 5):
        '''Store the matched excerpts of both texts in text1 / text2.

        Args:
            text1: The document; sliced with start / end.
            text2: The source; sliced with s_start / s_end.
            extra: Number of additional items of context kept on each side.
        '''
        self.text1 = self._context(text1, self.start, self.end, extra)
        self.text2 = self._context(text2, self.s_start, self.s_end, extra)

    def print(self):
        '''Print the document excerpt followed by the source excerpt.'''
        print(f"Document:\n {self.text1}\n\nSource:\n{self.text2}\n\n\n\n")


def match_elements(matches):
    '''Extract match elements.

    Each item of ``matches`` is unpacked as ``(match, source, text)`` and
    turned into ``[match.dist, text, source, match.start, match.end]``.
    '''
    return [
        [match.dist, text, source, match.start, match.end]
        for match, source, text in matches
    ]


def source_sort(matches, leng):
    '''Organize matches by source.

    Returns a list of ``leng`` buckets; every match is appended to the
    bucket selected by its element at index 2.
    '''
    buckets = [[] for _ in range(leng)]
    for match in matches:
        buckets[match[2]].append(match)
    return buckets


def check_merges(matches):
    '''Label the matches of every source with a running group number.

    A match whose element 3 is greater than element 4 of the previously
    labelled match starts a new group. The result holds, per source, a list
    of ``(match, group)`` tuples, or None when an IndexError occurs (for
    example on an empty source).
    '''
    output = []
    for source in matches:
        try:
            labelled = [(source[0], 0)]
            group = 0
            for match in source[1:]:
                if match[3] > labelled[-1][0][4]:
                    group += 1
                labelled.append((match, group))
        except IndexError:
            output.append(None)
        else:
            output.append(labelled)
    return output


def print_matches(matches, searchurls, ex = ()):
    '''Print the number of matches per source.

    Args:
        matches: One collection of matches per source.
        searchurls: Label printed for each source, looked up by position.
        ex: Positions that must not be printed.

    Returns:
        The positions that were printed by this call.
    '''
    printed = []
    for index, source in enumerate(matches):
        if index in ex:
            continue
        try:
            print(len(source),"match (es) from", searchurls[index])
        except (TypeError, IndexError, KeyError):
            # Best effort: skip sources without a length or without a label.
            continue
        printed.append(index)
    return printed


def separate_matches(matches):
    '''Separate matches based on last classification.

    Takes ``(match, group)`` lists and returns, per source, a list of groups
    where group ``g`` holds the matches labelled ``g``. The group count is
    taken from the label of the last entry. Sources that cannot be processed
    (None, empty, malformed) yield None.
    '''
    output = []
    for source in matches:
        try:
            groups = [[] for _ in range(source[-1][-1] + 1)]
            for entry in source:
                groups[entry[1]].append(entry[0])
        except (TypeError, IndexError):
            output.append(None)
        else:
            output.append(groups)
    return output


def join_matches(matches):
    '''Join each group of matches into a single match.

    For every group the result is ``[average of element 0, joined text,
    element 3 of the first match, element 4 of the last match]``. The joined
    text is the first match's text without its last item, followed by the
    last text item of every match in the group. Sources that cannot be
    processed yield None.
    '''
    output = []
    for source in matches:
        joined = []
        try:
            for group in source:
                text = group[0][1][:-1]
                total = 0
                for member in group:
                    total += member[0]
                    text.append(member[1][-1])
                joined.append(
                    [total / len(group), text, group[0][3], group[-1][4]]
                )
        except (TypeError, IndexError, AttributeError):
            joined = None
        output.append(joined)
    return output


def de_preprocess(matches, dic1, dic2, dist):
    '''Map preprocessed clusters back through both dictionaries.

    Returns one ``(start, end, mstart, mend, dens)`` tuple per cluster, where
    ``dens`` spans ``start..end`` and holds the values of ``dist[i]`` at the
    mapped positions and None everywhere else.
    '''
    output = []
    for i, cluster in enumerate(matches):
        # NOTE(review): dic1 is looked up with the position inside the
        # cluster, not with the cluster element's own first value, while dic2
        # below uses the element's second value. This looks unintended -
        # confirm against the caller before changing it.
        positions = [dic1[index] for index, _ in enumerate(cluster)]
        start = positions[0]
        end = positions[-1]
        mstart = dic2[cluster[0][1]]
        mend = dic2[cluster[-1][1]]
        dens = [None] * (1 + end - start)  # None means not yet known
        for j, position in enumerate(positions):
            dens[position - start] = dist[i][j]
        output.append((start, end, mstart, mend, dens))
    return output


def bilinear(dens):
    '''Fill unknown densities by linear interpolation, in place.

    For every row, each run of None values enclosed by two known values is
    replaced with rounded, evenly stepped values between them. None values
    before the first or after the last known value are left untouched.
    Returns ``dens`` itself.
    '''
    for row in dens:
        known = [i for i, value in enumerate(row) if value is not None]
        for left, right in zip(known, known[1:]):
            gap = right - left
            step = (row[right] - row[left]) / gap
            running = row[left]
            for offset in range(1, gap):
                running += step
                row[left + offset] = round(running)
    return dens


def shingle_final(input, dens):
    '''Turn ``(start, end, s_start, s_end, ...)`` records into Match objects.'''
    output = []
    for record in input:
        match = Match(start = record[0], end = record[1])
        match.s_start = record[2]
        # NOTE(review): every Match receives the complete ``dens`` argument,
        # not a per-record entry - confirm this is what callers expect.
        match.density = dens
        match.s_end = record[3]
        output.append(match)
    return output