script that compare effectiveness of two different algorithms that calculate the genus of an embedded graph

2019-09-07 09:28:28 +00:00 · 2019-09-07 09:28:28 +00:00 · be6a802338
commit be6a802338
parent ca555c5709
1 changed files with 364 additions and 0 deletions
--- a/compare_algorithms.py
+++ b/compare_algorithms.py
@ -0,0 +1,364 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import numpy as np
+import pandas as pd
+# from collections import defaultdict
+import ntpath
+# import time
+from timeit import default_timer
+from pycallgraph import PyCallGraph
+from pycallgraph.output import GraphvizOutput
+
+
+def time_usage(func):
+    def wrapper(*args, **kwargs):
+        best_times = []
+        for _ in range(3):
+            beg_ts = default_timer()
+            retval = func(*args, **kwargs)
+            end_ts = default_timer()
+            best_times.append(end_ts - beg_ts)
+            best_times.sort()
+            best_times = best_times[:3:]
+        print(func.__name__, end=" ")
+        print("time:", sum(best_times)/3)
+        return retval
+    return wrapper
+
+try:
+    profile
+except NameError:
+    profile = lambda x: x
+
+# import cProfile
+# from memory_profiler import profile
+
+
+class MySettings(object):
+    def __init__(self):
+        self.new_version = True
+        self.old_version = True
+        # self.old_version = False
+        # self.new_version = False
+        # self.dir_from = os.path.join(os.getcwd(), "../mapsTEST/")
+        self.dir_from = os.path.join(os.getcwd(), "../mapsSMOG/")
+        # self.dir_from = os.path.join(os.getcwd(), "../mapsCHIMERA/")
+        # self.dir_to = os.path.join(os.getcwd(), "../test/")
+        self.dir_to = os.path.join(os.getcwd(), "../dataGENUS/")
+        self.overwrite = True
+        # self.overwrite = False
+        self.save_ext = ".genus"
+        self.read_ext = ".smog"
+        # self.read_ext = ".test"
+        # self.read_ext = ".chimera"
+        # self.dir_rap = os.path.join(os.getcwd(), "./outGENUS/")
+        # self.log_rap = os.path.join(self.dir_rap, "log.out")
+        # self.log_gen = os.path.join(self.dir_rap, "genus.out")
+        # self.misMaps = os.path.join(self.dir_rap, "missingMaps.txt")
+
+
+class MyProtein(object):
+    def __init__(self, read_path, save_path, old_version=False):
+        self.save_path = save_path
+        self.genus_change = []
+        # self.contacts_dict = dict(int)
+        # self.contacts_dict = defaultdict(int)
+        self.contacts_dict = {}
+        self.old_version = old_version
+        try:
+            # original_map = np.loadtxt(read_path, int)
+            original_map = pd.read_csv(read_path, skip_blank_lines=True,
+                                       header=None, delim_whitespace=True)
+            original_map = np.array(original_map)
+        except ValueError:
+            # print("ValueError while reading the file %s.\n" % (read_path))
+            self.original_map = np.array([])
+            return None
+
+        original_map.sort()
+        idx = np.lexsort(original_map.T)  # T means transpose
+        original_map = np.take(original_map, idx, axis=0)
+        self.maxelement = np.amax(original_map)
+        row_mask = np.append(True, np.diff(original_map, axis=0).any(axis=1))
+        original_map = original_map[row_mask]
+        # skip contacts between neighbour vertices
+        row_mask = (original_map[:, 1] - original_map[:, 0] > 1)
+        self.original_map = original_map[row_mask]
+        self.modified_map = np.copy(self.original_map)
+        return None
+
+    # @profile
+    def solve_bifurcations(self, modified_map, axis=0):
+        if axis:
+            frequency = np.unique(modified_map, return_counts=True)
+        else:
+            frequency = np.unique(modified_map[:, 0], return_counts=True)
+        frequency = np.column_stack((frequency[0], frequency[1]))
+        frequency = frequency[frequency[:, 1] > 1]
+        for row in frequency[::-1]:
+            to_add = row[1] - 1
+            np.putmask(modified_map, modified_map > row[0],
+                       modified_map + to_add)
+            if axis:
+                duplicate = np.where(modified_map[:, 0] == row[0])
+                if duplicate[0]:
+                    modified_map[duplicate[0][0]][0] += to_add
+                    to_add -= 1
+
+            duplicate = np.where(modified_map[:, axis] == row[0])
+            for dup in duplicate[0]:
+                modified_map[dup][axis] += to_add
+                to_add -= 1
+        if not axis:
+            self.solve_bifurcations(modified_map, 1)
+
+    def simplify_map(self):
+        self.solve_bifurcations(self.modified_map)
+        self.modified_map = self.remove_unused(self.modified_map)
+        ind = np.lexsort((self.modified_map[:, 0], self.modified_map[:, 1]))
+        self.original_map = self.original_map[ind]
+        self.modified_map = self.modified_map[ind]
+
+    def remove_unused(self, in_map):
+        ind = np.argsort(np.argsort(in_map.flatten()))
+        m_map = np.array(range(in_map.size))[ind]
+        m_map.shape = (-1, 2)
+        return m_map
+
+    @profile
+    # @time_usage
+    def old_genus_function(self):
+        genus_change = []  # genus value along a chain
+        chain_positions = []
+        length = 1
+        genus = 0
+        for num in range(self.maxelement):
+            row_mask = (self.original_map[:, 1] <= num + 1)
+            partial_map = self.modified_map[row_mask]
+            if partial_map.shape[0] > length:
+                partial_map = self.remove_unused(partial_map)
+                genus = self.old_get_genus(partial_map)
+                length = partial_map.shape[0]
+            genus_change.append(genus)
+            chain_positions.append(num + 1)
+        return np.column_stack((chain_positions, genus_change))
+
+    # ######################## GENUS ALONG CHAIN ##############
+    @profile
+    def old_get_genus(self, partial_map):
+        remaining_chain = np.ones(4 * partial_map.shape[0], dtype=np.int)
+        remaining_chain.shape = (-1, 2, 2)
+        boundary_components = 0
+        p_max = np.amax(partial_map)
+        while remaining_chain.any():
+            boundary_components += 1
+            x, y, z = np.unravel_index(np.argmax(remaining_chain == 1),
+                                       remaining_chain.shape)
+            old_x, old_y = x, y
+            while True:
+                remaining_chain[x][y][z] = 0
+                remaining_chain[x][1-y][1-z] = 0
+                value = partial_map[x][1-y]
+                value += 2*z - 1
+                value %= (p_max + 1)
+                x, y = np.unravel_index(np.argmax(partial_map == value),
+                                        partial_map.shape)
+                if x == old_x and y == old_y:
+                    break
+
+        genus = int((1 + partial_map.shape[0] - boundary_components)/2)
+        return genus
+
+    # @time_usage
+    @profile
+    def get_genus_function(self):
+        genus_change = []  # genus value along a chain
+        chain_positions = []
+        genus = 0  # current genus value
+        old_position = 0  # needed for vertices with bifurcations
+        # get_genus_change() is recalled for each row in modified_map.
+        # If a result is positive, the current genus value increases by 1.
+        # The current position in chain is taken from original_map.
+        for row_id in range(self.modified_map.shape[0]):
+            new_position = self.original_map[row_id][1]
+            for i in range(new_position - old_position):
+                chain_positions.append(old_position + i + 1)
+                genus_change.append(genus)
+            genus += self.get_genus_change(row_id)
+            genus_change[new_position-1] = genus
+            old_position = new_position
+        for i in range(self.maxelement - old_position):
+            chain_positions.append(old_position + i + 1)
+            genus_change.append(genus)
+        # Genus function is calculated and two lists are saved into a table:
+        return np.column_stack((chain_positions, genus_change))
+
+
+# ####### GENUS CHANGE FOR AN EDGE ######################
+    @profile
+    def get_genus_change(self, row_id):
+        # updating dictionary
+        last_end = self.modified_map[row_id][1]  # end of added edge
+        last_start = self.modified_map[row_id][0]  # start of added edge
+        self.contacts_dict[last_end] = last_start
+        self.contacts_dict[last_start] = last_end
+        # Each step in the following loop consists of two phases:
+        # "footsteps" along vertices without "extra-edges" and
+        # a "jump" along an edge if such one is found in the dictionary
+        current = last_start + 1
+        # start by "footstep" from the vertex last_start
+        path_begin = last_end  # used for memoization
+        empty_path = False
+        # used for memoization - there is no extra
+        while current != last_start:
+            if current not in self.contacts_dict:
+                # a vertex without extra edge
+                if not empty_path:
+                    # used for memoization - memoization should be finalized
+                    self.contacts_dict[path_begin] = current - 1
+                    # the dictionary is updated - in next recalls of
+                    # the function new "shortcut"
+                    # instead of full walk will be used
+                    empty_path = True
+                current += 1  # "footstep"
+                continue
+            if current == last_end:
+                # STOP - the new edge has two boundaries
+                return 0  # genus doesn't change
+            if empty_path:  # used for memoization
+                # Set "shortcut" begin (path_begin) if
+                # the last one was just erased.
+                path_begin = current
+                empty_path = False
+            current = self.contacts_dict[current] + 1
+            # a "jump" (along edge from dictionary)
+            # and a "footstep" (+1)
+        return 1
+    # the new edge has only one boundary - genus increases by one
+
+#############################################################
+
+    # @profile
+    def save(self):
+        np.savetxt(self.save_path, self.genus_change, fmt='%i')
+
+    # @profile
+    def calculate(self):
+        if self.original_map.shape[0] < 2:
+            lst = np.column_stack((list(range(1, self.maxelement + 1)),
+                                   self.maxelement * [0]))
+            self.genus_change = lst
+        else:
+            self.simplify_map()
+            # print("**************** Genus change ***********")
+            # print(self.save_path)
+            # print(self.genus_change)
+            if self.old_version:
+                self.genus_change = self.old_genus_function()
+            else:
+                self.genus_change = self.get_genus_function()
+        # if self.genus_change.shape[0]:
+        #     print(
+        #           self.genus_change[self.genus_change.shape[0] - 1][1], "\t",
+        #           self.genus_change[self.genus_change.shape[0] - 1][0], "\t",
+        #           ntpath.basename(self.save_path)
+        #           )
+        # else:
+        #     print(self.genus_change, "\t", ntpath.basename(self.save_path))
+
+
+@time_usage
+def new_version(read_path, save_path):
+    with PyCallGraph(output=GraphvizOutput(), ):
+        my_protein = MyProtein(read_path, save_path)
+        if not my_protein.original_map.any():
+            # if my_protein.genus_change:
+            #     print("Genus zero")
+            # else:
+            #     # print(my_protein.genus_change, "\t",
+            #     #      ntpath.basename(my_protein.save_path))
+            #     print("Error: %s. Empty map." % read_path)
+            return None
+        my_protein.calculate()
+    # my_protein.save()
+    return my_protein.genus_change
+
+
+@time_usage
+def old_version(read_path, save_path):
+    my_protein = MyProtein(read_path, save_path, old_version=True)
+    if not my_protein.original_map.any():
+        # print(old_protein.genus_change, "\t",
+        #      ntpath.basename(old_protein.save_path))
+        # print("Error: %s. Empty map." % read_path)
+        return None
+    my_protein.calculate()
+    # my_protein.save()
+    return my_protein.genus_change
+
+
+def main(argv):
+    settings = MySettings()
+
+    if not os.path.isdir(settings.dir_from):
+        print ("""Error: {}: no such directory. Please change settings in {}"""
+               .format(settings.dir_from, __file__))
+        sys.exit()
+
+    os.makedirs(settings.dir_to, exist_ok=True)
+    # os.makedirs(settings.dir_rap, exist_ok=True)
+    if len(argv) != 2:
+        my_list = []
+        # print ("Usage: $ %s 'chain_list.txt'" % __file__)
+        # exit()
+        settings.dir_from = os.path.join(os.getcwd(), "../mapsTEST/")
+        settings.dir_to = os.path.join(os.getcwd(), "../test/")
+        settings.read_ext = ".test"
+        all_files = list(filter(lambda x: x.endswith(settings.read_ext),
+                                os.listdir(settings.dir_from)))
+        for f in all_files[:5]:
+            # print(f)
+            idn = ntpath.basename(f).split(".")[0]
+            my_list.append(idn)
+    else:
+        chain_list = argv[1]
+        if not os.path.isfile(chain_list):
+            print ("Error: {}: no such file.".format(chain_list))
+            sys.exit()
+        with open(chain_list, "r") as f:
+            my_list = f.read().splitlines()
+            # misMaps = ""
+    for my_id in my_list:
+        # pdb_id = line[:4].upper()
+        # chain = line[4:].strip()
+        # my_id = pdb_id + chain
+        save_path = os.path.join(settings.dir_to, my_id + settings.save_ext)
+        if os.path.isfile(save_path) and not settings.overwrite:
+            print("File ", save_path, " already exists.")
+            continue
+        read_path = os.path.join(settings.dir_from, my_id + settings.read_ext)
+        if not os.path.isfile(read_path):
+            # print("Error: %s. No such file." % read_path)
+            # misMaps += my_id + "\n" # to be changed
+            continue
+        print(read_path)
+        print("Chain: " + my_id)
+        if settings.new_version:
+            genus_change = new_version(read_path, save_path)
+            if genus_change is None:
+                continue
+        if settings.old_version:
+            old_genus_change = old_version(read_path, save_path)
+            if settings.new_version:
+                if np.array_equal(old_genus_change, genus_change):
+                    print("OK")
+                    print(genus_change[-1])
+                else:
+                    print("\n!!!\n!!!Results are different!!!\n!!!")
+        # with open(settings.misMaps, 'w') as f:
+        #    f.write(misMaps)
+
+if __name__ == "__main__":
+    main(sys.argv)