""" Graph utilities and algorithms Graphs are represented with their adjacency matrices, preferably using sparse matrices. """ # Authors: Aric Hagberg # Gael Varoquaux # Jake Vanderplas # License: BSD 3 clause import numpy as np from scipy import sparse from ..metrics.pairwise import pairwise_distances ############################################################################### # Path and connected component analysis. # Code adapted from networkx def single_source_shortest_path_length(graph, source, *, cutoff=None): """Return the length of the shortest path from source to all reachable nodes. Parameters ---------- graph : {sparse matrix, ndarray} of shape (n_nodes, n_nodes) Adjacency matrix of the graph. Sparse matrix of format LIL is preferred. source : int Start node for path. cutoff : int, default=None Depth to stop the search - only paths of length <= cutoff are returned. Returns ------- paths : dict Reachable end nodes mapped to length of path from source, i.e. `{end: path_length}`. Examples -------- >>> from sklearn.utils.graph import single_source_shortest_path_length >>> import numpy as np >>> graph = np.array([[ 0, 1, 0, 0], ... [ 1, 0, 1, 0], ... [ 0, 1, 0, 0], ... [ 0, 0, 0, 0]]) >>> single_source_shortest_path_length(graph, 0) {0: 0, 1: 1, 2: 2} >>> graph = np.ones((6, 6)) >>> sorted(single_source_shortest_path_length(graph, 2).items()) [(0, 1), (1, 1), (2, 0), (3, 1), (4, 1), (5, 1)] """ if sparse.isspmatrix(graph): graph = graph.tolil() else: graph = sparse.lil_matrix(graph) seen = {} # level (number of hops) when seen in BFS level = 0 # the current level next_level = [source] # dict of nodes to check at next level while next_level: this_level = next_level # advance to next level next_level = set() # and start a new list (fringe) for v in this_level: if v not in seen: seen[v] = level # set the level of vertex v next_level.update(graph.rows[v]) if cutoff is not None and cutoff <= level: break level += 1 return seen # return all path lengths as dictionary def _fix_connected_components( X, graph, n_connected_components, component_labels, mode="distance", metric="euclidean", **kwargs, ): """Add connections to sparse graph to connect unconnected components. For each pair of unconnected components, compute all pairwise distances from one component to the other, and add a connection on the closest pair of samples. This is a hacky way to get a graph with a single connected component, which is necessary for example to compute a shortest path between all pairs of samples in the graph. Parameters ---------- X : array of shape (n_samples, n_features) or (n_samples, n_samples) Features to compute the pairwise distances. If `metric = "precomputed"`, X is the matrix of pairwise distances. graph : sparse matrix of shape (n_samples, n_samples) Graph of connection between samples. n_connected_components : int Number of connected components, as computed by `scipy.sparse.csgraph.connected_components`. component_labels : array of shape (n_samples) Labels of connected components, as computed by `scipy.sparse.csgraph.connected_components`. mode : {'connectivity', 'distance'}, default='distance' Type of graph matrix: 'connectivity' corresponds to the connectivity matrix with ones and zeros, and 'distance' corresponds to the distances between neighbors according to the given metric. metric : str Metric used in `sklearn.metrics.pairwise.pairwise_distances`. kwargs : kwargs Keyword arguments passed to `sklearn.metrics.pairwise.pairwise_distances`. Returns ------- graph : sparse matrix of shape (n_samples, n_samples) Graph of connection between samples, with a single connected component. """ if metric == "precomputed" and sparse.issparse(X): raise RuntimeError( "_fix_connected_components with metric='precomputed' requires the " "full distance matrix in X, and does not work with a sparse " "neighbors graph." ) for i in range(n_connected_components): idx_i = np.flatnonzero(component_labels == i) Xi = X[idx_i] for j in range(i): idx_j = np.flatnonzero(component_labels == j) Xj = X[idx_j] if metric == "precomputed": D = X[np.ix_(idx_i, idx_j)] else: D = pairwise_distances(Xi, Xj, metric=metric, **kwargs) ii, jj = np.unravel_index(D.argmin(axis=None), D.shape) if mode == "connectivity": graph[idx_i[ii], idx_j[jj]] = 1 graph[idx_j[jj], idx_i[ii]] = 1 elif mode == "distance": graph[idx_i[ii], idx_j[jj]] = D[ii, jj] graph[idx_j[jj], idx_i[ii]] = D[ii, jj] else: raise ValueError( "Unknown mode=%r, should be one of ['connectivity', 'distance']." % mode ) return graph