from .compare import NaturalLP
from . import search, extract, order, compare
import time
from concurrent.futures import ThreadPoolExecutor, wait


def tim(times=None):
    '''Appends the current time to a list of timing marks.

    Creates the list on the first call; pass the returned list back in
    to record further marks.
    '''
    if not times:
        times = []
    times.append(time.time())
    return times


def read_file(file):
    '''Reads and tokenizes a file using textract.

    Takes a file as input and returns the raw and normalized texts.
    If the file cannot be read, the input is treated as plain text
    (see read_text).
    '''
    read = search.read(file)
    norread = extract.normalize(read)
    return read, norread


def read_text(text):
    '''Reads and tokenizes a text string.

    Takes a text as input and returns the raw and normalized texts.
    '''
    read = text.split()
    norread = extract.normalize(read)
    return read, norread


def divide(read):
    '''Divides the text into a list of search queries'''
    queries = search.div(read)
    return queries


def search_texts(queries):
    '''Searches for the queries using Google'''
    sources = search.search(queries)
    return sources


def download_texts(sources, threads=10, max_time=30, timeout=10, pdfsupport=False):
    '''Downloads the texts behind the search results using multithreading.

    Returns a normalized set of texts; the indices match their sources.
    '''
    nortexts = []
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(extract.doall, url, timeout, pdfsupport)
                   for url in sources]
        wait(futures, timeout=max_time)
        for future in futures:
            nortexts.append(future.result())
    return nortexts


def levenshtein_compare(norread, nortexts, length=30, threshold=7):
    '''Compares the downloaded texts with the input.

    Returns an unordered set of matches.
    '''
    norcom = compare.compare(norread, nortexts, threshold=threshold, length=length)
    return norcom


def order_results(norcom, sources):
    '''Orders the array of matches.

    Returns the matches grouped and ordered by source.
    '''
    m2 = order.source_sort(order.match_elements(norcom), len(sources))
    m3 = order.check_merges(m2)
    m4 = order.separate_matches(m3)
    matches = order.join_matches(m4)
    return matches


def print_matches(matches, sources=None, used=None):
    '''Prints the matches.

    Returns a list of indices of the sources that were used.
    '''
    is_match_type = False
    for x in matches:
        try:
            if isinstance(x[0], order.Match):
                is_match_type = True
                break
        except (IndexError, TypeError):
            pass
    if is_match_type:
        print("\nMatches:")
        for x in matches:
            if len(x) > 0:
                print(f"{len(x)} matches from {x[0].source}")
                for y in x:
                    y.print()
        return
    if not used:
        used = []
    print("\nMatches:")
    used = order.print_matches(matches, sources, used)
    return used


def Textual(input, verbose=True, length=20, threshold=5, threads=15,
            max_time=30, timeout=10, pdfsupport=True):
    '''Does a textual search of the input'''
    read, norread = read_file(input)
    queries = divide(read)
    sources = search_texts(queries)
    nortexts = download_texts(sources, threads=threads, max_time=max_time,
                              timeout=timeout, pdfsupport=pdfsupport)
    norcom = levenshtein_compare(norread, nortexts, length=length,
                                 threshold=threshold)
    matches = order_results(norcom, sources)
    if verbose:
        print_matches(matches, sources)
    return matches


def print_time(times):
    '''Prints the time elapsed between consecutive marks and the total'''
    print("\nTime taken:")
    for x in range(len(times) - 1):
        print(times[x + 1] - times[x])
    print("Total:", times[-1] - times[0])


def shingle(input, k):
    '''Shingles the input into k-length ngrams'''
    output = compare.shingle(input, k)
    return output


def pre_search(norread, stopwords=None):
    '''Makes a Google search of the text with stop words removed'''
    if not stopwords:
        # Default to the English stop-word list from the NLP helper
        stopwords = NaturalLP("english").stopwords
    norread = [x for x in norread if x not in stopwords]
    output = " ".join(norread)
    output = divide(output)
    return search_texts(output)
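
# A minimal usage sketch of the step-by-step pipeline above, roughly
# equivalent to calling Textual() but keeping the intermediate values.
# The sample string is illustrative only; real results depend on the
# search backend and on what extract.doall can download:
#
#   read, norread = read_text("text to check against online sources")
#   queries = divide(read)
#   sources = search_texts(queries)
#   nortexts = download_texts(sources, threads=10, max_time=30, timeout=10)
#   norcom = levenshtein_compare(norread, nortexts, length=30, threshold=7)
#   matches = order_results(norcom, sources)
#   used = print_matches(matches, sources)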

def full_shin_comparison(input1, input2, dic1, dic2, shingle_size, gap, miin):
    '''Runs the whole shingle-comparison process for one pair of texts'''
    input1 = compare.shingle(input1, shingle_size)
    input2 = compare.shingle(input2, shingle_size)
    matches = compare.shin_matches(input1, input2)
    matches = compare.cluster(matches, gap, miin)
    dist = compare.get_dist(matches)
    matches = order.de_preprocess(matches, dic1, dic2, dist)
    dist = order.bilinear(dist)
    output = order.shingle_final(matches, dist)
    return output


def Shingled(input, lang="english", miin=20, gap=4, shingle_size=2, threads=10,
             pdfsupport=True, max_time=100, verbose=True):
    '''Does a complete search of the input using NLP'''
    nnlp = NaturalLP(lang)
    read, norread = read_file(input)
    sources = search_texts(divide(read))
    # Extend (not append) so every element of sources stays a single URL
    sources.extend(x for x in pre_search(norread, nnlp.stopwords)
                   if x not in sources)
    nortexts = download_texts(sources, threads=threads, pdfsupport=pdfsupport)
    preread = nnlp.preprocess(norread)
    pretexts = nnlp.bulkpreprocess(nortexts, threads=threads)
    output = []
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(full_shin_comparison, preread[0], text[0],
                                   preread[1], text[1], shingle_size, gap, miin)
                   for text in pretexts]
        wait(futures, timeout=max_time)
        for future in futures:
            output.append(future.result())
    for i, x in enumerate(output):
        for y in x:
            y.source = sources[i]
            y.find_text(norread, nortexts[i])
    if verbose:
        print_matches(output)
    return output


if __name__ == "__main__":
    # Textual("In information theory, linguistics and computer science, the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other")
    Shingled("In information theory, linguistics and computer science, the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other",
             "english", 3, 3, 2)
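
# Timing sketch for the entry points above, using the tim()/print_time()
# helpers from this module (indicative only; wall-clock results depend on
# network speed and on how many sources the search returns):
#
#   times = tim()
#   matches = Textual("text to check against online sources")
#   tim(times)
#   print_time(times)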