From be6a8023384b00f8816c08801fad181d0980a03a Mon Sep 17 00:00:00 2001 From: Maria Marchwicka Date: Sat, 7 Sep 2019 09:28:28 +0000 Subject: [PATCH] script that compare effectiveness of two different algorithms that calculate the genus of an embedded graph --- compare_algorithms.py | 364 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 364 insertions(+) create mode 100644 compare_algorithms.py diff --git a/compare_algorithms.py b/compare_algorithms.py new file mode 100644 index 0000000..ffb1398 --- /dev/null +++ b/compare_algorithms.py @@ -0,0 +1,364 @@ +#!/usr/bin/env python + +import sys +import os +import numpy as np +import pandas as pd +# from collections import defaultdict +import ntpath +# import time +from timeit import default_timer +from pycallgraph import PyCallGraph +from pycallgraph.output import GraphvizOutput + + +def time_usage(func): + def wrapper(*args, **kwargs): + best_times = [] + for _ in range(3): + beg_ts = default_timer() + retval = func(*args, **kwargs) + end_ts = default_timer() + best_times.append(end_ts - beg_ts) + best_times.sort() + best_times = best_times[:3:] + print(func.__name__, end=" ") + print("time:", sum(best_times)/3) + return retval + return wrapper + +try: + profile +except NameError: + profile = lambda x: x + +# import cProfile +# from memory_profiler import profile + + +class MySettings(object): + def __init__(self): + self.new_version = True + self.old_version = True + # self.old_version = False + # self.new_version = False + # self.dir_from = os.path.join(os.getcwd(), "../mapsTEST/") + self.dir_from = os.path.join(os.getcwd(), "../mapsSMOG/") + # self.dir_from = os.path.join(os.getcwd(), "../mapsCHIMERA/") + # self.dir_to = os.path.join(os.getcwd(), "../test/") + self.dir_to = os.path.join(os.getcwd(), "../dataGENUS/") + self.overwrite = True + # self.overwrite = False + self.save_ext = ".genus" + self.read_ext = ".smog" + # self.read_ext = ".test" + # self.read_ext = ".chimera" + # self.dir_rap = os.path.join(os.getcwd(), "./outGENUS/") + # self.log_rap = os.path.join(self.dir_rap, "log.out") + # self.log_gen = os.path.join(self.dir_rap, "genus.out") + # self.misMaps = os.path.join(self.dir_rap, "missingMaps.txt") + + +class MyProtein(object): + def __init__(self, read_path, save_path, old_version=False): + self.save_path = save_path + self.genus_change = [] + # self.contacts_dict = dict(int) + # self.contacts_dict = defaultdict(int) + self.contacts_dict = {} + self.old_version = old_version + try: + # original_map = np.loadtxt(read_path, int) + original_map = pd.read_csv(read_path, skip_blank_lines=True, + header=None, delim_whitespace=True) + original_map = np.array(original_map) + except ValueError: + # print("ValueError while reading the file %s.\n" % (read_path)) + self.original_map = np.array([]) + return None + + original_map.sort() + idx = np.lexsort(original_map.T) # T means transpose + original_map = np.take(original_map, idx, axis=0) + self.maxelement = np.amax(original_map) + row_mask = np.append(True, np.diff(original_map, axis=0).any(axis=1)) + original_map = original_map[row_mask] + # skip contacts between neighbour vertices + row_mask = (original_map[:, 1] - original_map[:, 0] > 1) + self.original_map = original_map[row_mask] + self.modified_map = np.copy(self.original_map) + return None + + # @profile + def solve_bifurcations(self, modified_map, axis=0): + if axis: + frequency = np.unique(modified_map, return_counts=True) + else: + frequency = np.unique(modified_map[:, 0], return_counts=True) + frequency = np.column_stack((frequency[0], frequency[1])) + frequency = frequency[frequency[:, 1] > 1] + for row in frequency[::-1]: + to_add = row[1] - 1 + np.putmask(modified_map, modified_map > row[0], + modified_map + to_add) + if axis: + duplicate = np.where(modified_map[:, 0] == row[0]) + if duplicate[0]: + modified_map[duplicate[0][0]][0] += to_add + to_add -= 1 + + duplicate = np.where(modified_map[:, axis] == row[0]) + for dup in duplicate[0]: + modified_map[dup][axis] += to_add + to_add -= 1 + if not axis: + self.solve_bifurcations(modified_map, 1) + + def simplify_map(self): + self.solve_bifurcations(self.modified_map) + self.modified_map = self.remove_unused(self.modified_map) + ind = np.lexsort((self.modified_map[:, 0], self.modified_map[:, 1])) + self.original_map = self.original_map[ind] + self.modified_map = self.modified_map[ind] + + def remove_unused(self, in_map): + ind = np.argsort(np.argsort(in_map.flatten())) + m_map = np.array(range(in_map.size))[ind] + m_map.shape = (-1, 2) + return m_map + + @profile + # @time_usage + def old_genus_function(self): + genus_change = [] # genus value along a chain + chain_positions = [] + length = 1 + genus = 0 + for num in range(self.maxelement): + row_mask = (self.original_map[:, 1] <= num + 1) + partial_map = self.modified_map[row_mask] + if partial_map.shape[0] > length: + partial_map = self.remove_unused(partial_map) + genus = self.old_get_genus(partial_map) + length = partial_map.shape[0] + genus_change.append(genus) + chain_positions.append(num + 1) + return np.column_stack((chain_positions, genus_change)) + + # ######################## GENUS ALONG CHAIN ############## + @profile + def old_get_genus(self, partial_map): + remaining_chain = np.ones(4 * partial_map.shape[0], dtype=np.int) + remaining_chain.shape = (-1, 2, 2) + boundary_components = 0 + p_max = np.amax(partial_map) + while remaining_chain.any(): + boundary_components += 1 + x, y, z = np.unravel_index(np.argmax(remaining_chain == 1), + remaining_chain.shape) + old_x, old_y = x, y + while True: + remaining_chain[x][y][z] = 0 + remaining_chain[x][1-y][1-z] = 0 + value = partial_map[x][1-y] + value += 2*z - 1 + value %= (p_max + 1) + x, y = np.unravel_index(np.argmax(partial_map == value), + partial_map.shape) + if x == old_x and y == old_y: + break + + genus = int((1 + partial_map.shape[0] - boundary_components)/2) + return genus + + # @time_usage + @profile + def get_genus_function(self): + genus_change = [] # genus value along a chain + chain_positions = [] + genus = 0 # current genus value + old_position = 0 # needed for vertices with bifurcations + # get_genus_change() is recalled for each row in modified_map. + # If a result is positive, the current genus value increases by 1. + # The current position in chain is taken from original_map. + for row_id in range(self.modified_map.shape[0]): + new_position = self.original_map[row_id][1] + for i in range(new_position - old_position): + chain_positions.append(old_position + i + 1) + genus_change.append(genus) + genus += self.get_genus_change(row_id) + genus_change[new_position-1] = genus + old_position = new_position + for i in range(self.maxelement - old_position): + chain_positions.append(old_position + i + 1) + genus_change.append(genus) + # Genus function is calculated and two lists are saved into a table: + return np.column_stack((chain_positions, genus_change)) + + +# ####### GENUS CHANGE FOR AN EDGE ###################### + @profile + def get_genus_change(self, row_id): + # updating dictionary + last_end = self.modified_map[row_id][1] # end of added edge + last_start = self.modified_map[row_id][0] # start of added edge + self.contacts_dict[last_end] = last_start + self.contacts_dict[last_start] = last_end + # Each step in the following loop consists of two phases: + # "footsteps" along vertices without "extra-edges" and + # a "jump" along an edge if such one is found in the dictionary + current = last_start + 1 + # start by "footstep" from the vertex last_start + path_begin = last_end # used for memoization + empty_path = False + # used for memoization - there is no extra + while current != last_start: + if current not in self.contacts_dict: + # a vertex without extra edge + if not empty_path: + # used for memoization - memoization should be finalized + self.contacts_dict[path_begin] = current - 1 + # the dictionary is updated - in next recalls of + # the function new "shortcut" + # instead of full walk will be used + empty_path = True + current += 1 # "footstep" + continue + if current == last_end: + # STOP - the new edge has two boundaries + return 0 # genus doesn't change + if empty_path: # used for memoization + # Set "shortcut" begin (path_begin) if + # the last one was just erased. + path_begin = current + empty_path = False + current = self.contacts_dict[current] + 1 + # a "jump" (along edge from dictionary) + # and a "footstep" (+1) + return 1 + # the new edge has only one boundary - genus increases by one + +############################################################# + + # @profile + def save(self): + np.savetxt(self.save_path, self.genus_change, fmt='%i') + + # @profile + def calculate(self): + if self.original_map.shape[0] < 2: + lst = np.column_stack((list(range(1, self.maxelement + 1)), + self.maxelement * [0])) + self.genus_change = lst + else: + self.simplify_map() + # print("**************** Genus change ***********") + # print(self.save_path) + # print(self.genus_change) + if self.old_version: + self.genus_change = self.old_genus_function() + else: + self.genus_change = self.get_genus_function() + # if self.genus_change.shape[0]: + # print( + # self.genus_change[self.genus_change.shape[0] - 1][1], "\t", + # self.genus_change[self.genus_change.shape[0] - 1][0], "\t", + # ntpath.basename(self.save_path) + # ) + # else: + # print(self.genus_change, "\t", ntpath.basename(self.save_path)) + + +@time_usage +def new_version(read_path, save_path): + with PyCallGraph(output=GraphvizOutput(), ): + my_protein = MyProtein(read_path, save_path) + if not my_protein.original_map.any(): + # if my_protein.genus_change: + # print("Genus zero") + # else: + # # print(my_protein.genus_change, "\t", + # # ntpath.basename(my_protein.save_path)) + # print("Error: %s. Empty map." % read_path) + return None + my_protein.calculate() + # my_protein.save() + return my_protein.genus_change + + +@time_usage +def old_version(read_path, save_path): + my_protein = MyProtein(read_path, save_path, old_version=True) + if not my_protein.original_map.any(): + # print(old_protein.genus_change, "\t", + # ntpath.basename(old_protein.save_path)) + # print("Error: %s. Empty map." % read_path) + return None + my_protein.calculate() + # my_protein.save() + return my_protein.genus_change + + +def main(argv): + settings = MySettings() + + if not os.path.isdir(settings.dir_from): + print ("""Error: {}: no such directory. Please change settings in {}""" + .format(settings.dir_from, __file__)) + sys.exit() + + os.makedirs(settings.dir_to, exist_ok=True) + # os.makedirs(settings.dir_rap, exist_ok=True) + if len(argv) != 2: + my_list = [] + # print ("Usage: $ %s 'chain_list.txt'" % __file__) + # exit() + settings.dir_from = os.path.join(os.getcwd(), "../mapsTEST/") + settings.dir_to = os.path.join(os.getcwd(), "../test/") + settings.read_ext = ".test" + all_files = list(filter(lambda x: x.endswith(settings.read_ext), + os.listdir(settings.dir_from))) + for f in all_files[:5]: + # print(f) + idn = ntpath.basename(f).split(".")[0] + my_list.append(idn) + else: + chain_list = argv[1] + if not os.path.isfile(chain_list): + print ("Error: {}: no such file.".format(chain_list)) + sys.exit() + with open(chain_list, "r") as f: + my_list = f.read().splitlines() + # misMaps = "" + for my_id in my_list: + # pdb_id = line[:4].upper() + # chain = line[4:].strip() + # my_id = pdb_id + chain + save_path = os.path.join(settings.dir_to, my_id + settings.save_ext) + if os.path.isfile(save_path) and not settings.overwrite: + print("File ", save_path, " already exists.") + continue + read_path = os.path.join(settings.dir_from, my_id + settings.read_ext) + if not os.path.isfile(read_path): + # print("Error: %s. No such file." % read_path) + # misMaps += my_id + "\n" # to be changed + continue + print(read_path) + print("Chain: " + my_id) + if settings.new_version: + genus_change = new_version(read_path, save_path) + if genus_change is None: + continue + if settings.old_version: + old_genus_change = old_version(read_path, save_path) + if settings.new_version: + if np.array_equal(old_genus_change, genus_change): + print("OK") + print(genus_change[-1]) + else: + print("\n!!!\n!!!Results are different!!!\n!!!") + # with open(settings.misMaps, 'w') as f: + # f.write(misMaps) + +if __name__ == "__main__": + main(sys.argv)