import time
from concurrent.futures import ThreadPoolExecutor, wait

from .compare import NaturalLP
from . import search, extract, order, compare

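# Pipeline overview: read and tokenize the input, split it into search queries,
# download the candidate sources, then compare the texts (Levenshtein distance
# in Textual, shingled n-grams in Shingled) and order/print the matches.
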
def tim(times=None):
    '''Record a timestamp.

    Appends the current time to the given list, creating the list first if
    none is supplied, and returns it.
    '''
    if times is None:
        times = []
    times.append(time.time())
    return times

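# Illustrative usage of tim() together with print_time() (defined further
# below); any stage of the pipeline can be timed this way:
#
#     times = tim()               # start the clock
#     sources = search_texts(q)   # ... some pipeline stage ...
#     times = tim(times)          # record a checkpoint
#     print_time(times)           # per-stage deltas plus the total
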
def read_file(file):
    '''Read and tokenize a file using textract.

    Takes a file as input and returns the raw and normalized token lists.
    If reading fails, it simply falls back to plain-text handling
    (see read_text).
    '''
    read = search.read(file)
    norread = extract.normalize(read)
    return read, norread

def read_text(text):
    '''Read and tokenize text.

    Takes a text as input and returns the raw and normalized token lists.
    '''
    read = text.split()
    norread = extract.normalize(read)
    return read, norread

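# Shape sketch (assuming extract.normalize returns one normalized token per
# input token):
#
#     read, norread = read_text("The Cat sat")
#     # read    -> ["The", "Cat", "sat"]   (raw whitespace tokens)
#     # norread -> the normalized counterparts of those tokens
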
def divide(read):
    '''Divide the text into a list of search queries.'''
    queries = search.div(read)
    return queries

def search_texts(queries):
    '''Search Google for each query; returns a list of source URLs.'''
    sources = search.search(queries)
    return sources

def download_texts(sources, threads=10, max_time=30, timeout=10, pdfsupport=False):
    '''Download texts from the search results using multithreading.

    Returns the normalized set of texts; indices match their source.
    '''
    nortexts = []
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(extract.doall, url, timeout, pdfsupport)
                   for url in sources]
        wait(futures, timeout=max_time)
        for future in futures:
            try:
                # timeout=0: do not block again on downloads that missed max_time
                nortexts.append(future.result(timeout=0))
            except Exception:
                # keep indices aligned with sources even when a download fails
                nortexts.append([])
    return nortexts

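# Minimal usage sketch (the URL is a placeholder): fetch each source returned
# by search_texts() with a 10-second per-page timeout and no PDF extraction:
#
#     nortexts = download_texts(["https://example.com/article"],
#                               threads=4, timeout=10, pdfsupport=False)
#     # nortexts[0] is the normalized text of that single source
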
def levenshtein_compare(norread, nortexts, length=30, threshold=7):
    '''Compare the downloaded texts with the input using Levenshtein distance.

    Returns an unordered set of matches.
    '''
    norcom = compare.compare(norread, nortexts, threshold=threshold, length=length)
    return norcom

def order_results(norcom, sources):
    '''Order the array of matches.

    Returns the matches ordered and grouped per source.
    '''
    m2 = order.source_sort(order.match_elements(norcom), len(sources))
    m3 = order.check_merges(m2)
    m4 = order.separate_matches(m3)
    matches = order.join_matches(m4)
    return matches

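# How the two stages chain together (this is exactly what Textual() does below):
#
#     norcom  = levenshtein_compare(norread, nortexts)
#     matches = order_results(norcom, sources)   # one group of matches per source
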
def print_matches(matches, sources=None, used=None):
    '''Print the matches.

    For lists of order.Match objects the matches are printed per source.
    Otherwise printing is delegated to order.print_matches and a list of
    indices of the used sources is returned.
    '''
    is_match_type = False
    for x in matches:
        try:
            if isinstance(x[0], order.Match):
                is_match_type = True
                break
        except Exception:
            # empty or non-indexable entry; keep looking
            pass

    if is_match_type:
        print("\nMatches:")
        for x in matches:
            if len(x) > 0:
                print(f"{len(x)} matches from {x[0].source}")
                for y in x:
                    y.print()
        return

    if used is None:
        used = []
    print("\nMatches:")
    used = order.print_matches(matches, sources, used)
    return used

def Textual(input, verbose=True, length=20, threshold=5, threads=15, max_time=30, timeout=10, pdfsupport=True):
    '''Run a textual (Levenshtein-based) search of the input.'''
    read, norread = read_file(input)
    queries = divide(read)
    sources = search_texts(queries)
    nortexts = download_texts(sources, threads=threads, max_time=max_time,
                              timeout=timeout, pdfsupport=pdfsupport)
    norcom = levenshtein_compare(norread, nortexts, length=length, threshold=threshold)
    matches = order_results(norcom, sources)
    if verbose:
        print_matches(matches, sources)
    return matches

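# Usage sketch ("essay.txt" is a placeholder path or text):
#
#     matches = Textual("essay.txt", length=20, threshold=5)
#     # prints the matches per source (verbose=True) and returns them
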
def print_time(times):
    '''Print the time taken between consecutive checkpoints and the total.'''
    print("\nTime taken:")
    for x in range(len(times) - 1):
        print(times[x + 1] - times[x])
    print("Total:", times[-1] - times[0])

def shingle(input, k):
    '''Shingle the input into k-length n-grams.'''
    output = compare.shingle(input, k)
    return output

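# Illustrative example (the exact container type depends on compare.shingle):
#
#     shingle(["the", "cat", "sat", "down"], 2)
#     # -> the bigrams ("the", "cat"), ("cat", "sat"), ("sat", "down")
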
def pre_search(norread, stopwords=None):
    '''Run a Google search of the text with stop words removed.'''
    if not stopwords:
        # default to NaturalLP's English stop word list when none is supplied
        stopwords = NaturalLP("english").stopwords
    norread = [x for x in norread if x not in stopwords]
    output = " ".join(norread)
    output = divide(output)
    return search_texts(output)

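# Usage sketch: search again with stop words stripped, reusing NaturalLP's stop
# word list (this is how Shingled() calls it below):
#
#     extra_sources = pre_search(norread, NaturalLP("english").stopwords)
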
def full_shin_comparison(input1, input2, dic1, dic2, shingle_size, gap, miin):
    '''Run the whole shingle comparison between two preprocessed texts.'''
    input1 = compare.shingle(input1, shingle_size)
    input2 = compare.shingle(input2, shingle_size)
    matches = compare.shin_matches(input1, input2)
    matches = compare.cluster(matches, gap, miin)
    dist = compare.get_dist(matches)
    matches = order.de_preprocess(matches, dic1, dic2, dist)
    dist = order.bilinear(dist)
    output = order.shingle_final(matches, dist)
    return output

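# Each call compares one downloaded text against the input; Shingled() below
# fans these calls out over a thread pool, one per preprocessed source:
#
#     result = full_shin_comparison(preread[0], pretext[0], preread[1], pretext[1],
#                                   shingle_size=2, gap=4, miin=20)
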
def Shingled(input, lang="english", miin=20, gap=4, shingle_size=2, threads=10, pdfsupport=True, max_time=100, verbose=True):
    '''Run a complete search of the input using NLP preprocessing and shingles.'''
    nnlp = NaturalLP(lang)
    read, norread = read_file(input)
    sources = search_texts(divide(read))
    # add any new sources found by the stop-word-free pre-search
    sources.extend(x for x in pre_search(norread, nnlp.stopwords) if x not in sources)
    nortexts = download_texts(sources, threads=threads, pdfsupport=pdfsupport)
    nnlp.addstopword("hello")
    # preread and each entry of pretexts are (preprocessed tokens, dictionary) pairs
    preread = nnlp.preprocess(norread)
    pretexts = nnlp.bulkpreprocess(nortexts, threads=threads)
    output = []
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures = []
        for text in pretexts:
            futures.append(executor.submit(full_shin_comparison, preread[0], text[0],
                                           preread[1], text[1], shingle_size, gap, miin))
        wait(futures, timeout=max_time)
        for x in futures:
            output.append(x.result())
    for i, x in enumerate(output):
        for y in x:
            y.source = sources[i]
            y.find_text(norread, nortexts[i])
    if verbose:
        print_matches(output)
    return output

if __name__ == "__main__":
    # Textual("In information theory, linguistics and computer science, the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other")
    Shingled("In information theory, linguistics and computer science, the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other",
             lang="english", miin=3, gap=3, shingle_size=2)