11. Spacery losowe po grafach: algorytm wyszukiwania klastrów

Dla dużych grafów istotną informacją jest wykrycie podgrafów, które są silnie ze sobą powiązane. Za pomocą spacerów losowych po grafach zaprojektuj algorytm, który odkrywa strukturę klastrów w grafie (clustering algorithm). Wykorzystaj swój algorytm do wskazania krytycznych wierzchołków, tj. wierzchołków, których usunięcie rozspójnia graf. Przeanalizuj wariant algorytmu dla grafów skierowanych i grafów nieskierowanych.

import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

Graf skierowany z wagami

# Small balanced tree used as the running example graph.
G = nx.balanced_tree(2, 2)
pos = nx.spring_layout(G)  # overwritten below once the directed graph exists

A = nx.adjacency_matrix(G)
x, y = A.shape

# Build a weighted directed graph: each unit entry (row, col) of the adjacency
# matrix becomes an arc row -> col weighted by 1 / degree(row), rounded to two
# decimals for display.
new_G = nx.DiGraph()
for current_node, adjacent_node in np.ndindex(x, y):
    if A[current_node, adjacent_node] != 1:
        continue
    p = round(1 / G.degree(current_node), 2)
    new_G.add_edge(current_node, adjacent_node, weight=p)

pos = nx.spring_layout(new_G)
nx.draw(new_G, pos, with_labels=True)
nx.get_edge_attributes(new_G, 'weight')

# DiGraph example
# nx.draw(new_G, pos, with_labels=True, arrows=True, connectionstyle='arc3, rad = 0.1')
# labels = nx.get_edge_attributes(new_G, 'weight')
# nx.draw_networkx_edge_labels(new_G, pos, edge_labels=labels)

{(0, 1): 0.5,
 (0, 2): 0.5,
 (1, 0): 0.33,
 (1, 3): 0.33,
 (1, 4): 0.33,
 (2, 0): 0.33,
 (2, 5): 0.33,
 (2, 6): 0.33,
 (3, 1): 1.0,
 (4, 1): 1.0,
 (5, 2): 1.0,
 (6, 2): 1.0}

Eksperymenty z rozkładem stacjonarnym (stationary distribution)


def stationary_distrib_1(P):
    """Return the stationary distribution of the transition matrix *P*.

    The distribution is taken from an eigenvector of ``P.T`` whose eigenvalue
    is numerically close to 1, rescaled so that its entries sum to 1.

    Args:
        P: Square transition matrix as a NumPy array (rows are expected to
            sum to 1).

    Returns:
        1-D real-valued array with the stationary probabilities.

    Raises:
        IndexError: If no eigenvalue of ``P.T`` is close to 1.
    """
    eigenvals, eigenvects = np.linalg.eig(P.T)
    close_to_1_idx = np.isclose(eigenvals, 1)
    # If several eigenvalues are close to 1, the first matching column is used.
    target_eigenvect = eigenvects[:, close_to_1_idx][:, 0]

    # Normalise first, then drop the imaginary part: ``eig`` returns a complex
    # dtype whenever *any* eigenvalue is complex, and normalising beforehand
    # cancels any common complex scale factor of the eigenvector.
    distribution = target_eigenvect / target_eigenvect.sum()
    return np.real(distribution)

def stationary_distrib_2(G, P, k=100, starting_node=0):
    """Approximate the stationary distribution by iterating ``pi <- P.T . pi``.

    The walk starts with all probability mass on *starting_node* and stops
    early as soon as two consecutive iterates agree (``np.allclose``).

    Note that for a periodic chain (e.g. a random walk on a bipartite graph)
    the iterates oscillate instead of converging, so the result is simply the
    distribution after *k* steps.

    Args:
        G: Unused; kept so existing callers keep working. The number of
            states is taken from ``P``.
        P: Square transition matrix as a NumPy array.
        k: Maximum number of iterations.
        starting_node: Index of the state that initially holds all the mass.

    Returns:
        1-D array of length ``P.shape[0]`` (the same shape whether or not the
        iteration converged early).
    """
    nodes_num = P.shape[0]
    pi = np.zeros((nodes_num, 1))
    pi[starting_node] = 1
    for _ in range(k):
        new_pi = np.dot(P.T, pi)
        if np.allclose(pi, new_pi):
            break
        pi = new_pi
    # Single exit point so both the converged and the exhausted case return
    # the same flat shape.
    return pi.reshape(1, -1).flatten()

def pagerank_test(G):
    """Print the PageRank scores of *G*, then its nodes by descending score."""
    print("PageRank")
    scores = nx.pagerank(G)
    print(scores)

    # Stable sort: nodes with equal score keep their original relative order.
    ranking = sorted(scores, key=scores.get, reverse=True)
    print("".join(str(node) + ", " for node in ranking), end="")
    print("\n")

Funkcje – macierz przejścia (transition matrix) i macierz sąsiedztwa (adjacency matrix)

def adj_matrix(G):
    """Return the adjacency matrix of *G* as a dense float64 NumPy array."""
    sparse_adjacency = nx.adjacency_matrix(G)
    return sparse_adjacency.toarray().astype(np.float64)

def transition_matrix_1(A):
    """Return the random-walk transition matrix ``D^-1 . A``.

    ``D`` is the diagonal matrix of row sums (out-degrees) of *A*, so every
    row of the result sums to 1. Row sums are used rather than column sums so
    that the result is also correct for directed (non-symmetric) adjacency
    matrices and agrees with ``transition_matrix_2``.

    Args:
        A: Square adjacency matrix as a NumPy array.

    Returns:
        Array of the same shape as *A* with row-normalised entries.

    Raises:
        numpy.linalg.LinAlgError: If some row of *A* sums to 0, which makes
            ``D`` singular.
    """
    D = np.diag(np.sum(A, axis=1))
    return np.dot(np.linalg.inv(D), A)
    
def transition_matrix_2(A):
    """Return *A* with every row divided by its own sum."""
    # keepdims keeps the sums as a column so they broadcast across each row.
    row_totals = np.sum(A, axis=1, keepdims=True)
    return np.divide(A, row_totals)

Testy funkcji

A = adj_matrix(G)
P1 = transition_matrix_1(A)
P2 = transition_matrix_2(A)

# The two implementations reach the result through different floating-point
# operations (matrix inverse vs. division), so compare with a tolerance
# instead of demanding bit-for-bit equality.
assert np.allclose(P1, P2)
P = P1

# print("Distrib1")
# S1 = stationary_distrib_1(P)
# S1_dict = {i: S1[i] for i in range(len(S1))}
# p_sort = sorted(S1_dict.items(), key=lambda x: x[1], reverse=True) 
# print(S1_dict)
# for i in p_sort: 
#     print(i[0], end=", ")
# print("\n")

print("Distrib2 - random walk")
# Flatten so that every entry is a scalar probability regardless of the shape
# the iteration happened to return.
S2 = stationary_distrib_2(G, P, k=104, starting_node=0).reshape(-1)
S2_dict = dict(enumerate(S2))
p_sort = sorted(S2_dict.items(), key=lambda x: x[1], reverse=True)
print(S2_dict)
# Node indices ordered from most to least probable.
for i in p_sort:
    print(i[0], end=", ")
print("\n")


pagerank_test(G)

Distrib2 - random walk
{0: 0.3333333333333333, 1: 0.0, 2: 0.0, 3: 0.16666666666666666, 4: 0.16666666666666666, 5: 0.16666666666666666, 6: 0.16666666666666666}
0, 3, 4, 5, 6, 1, 2, 

PageRank
{0: 0.1581719960780273, 1: 0.2413134344543876, 2: 0.2413134344543876, 3: 0.08980028375329936, 4: 0.08980028375329936, 5: 0.08980028375329936, 6: 0.08980028375329936}
1, 2, 0, 3, 4, 5, 6,

Cluster detection

def pk_visit(P, k):
    """Return the 1- to k-step transition matrices of *P* stacked in one array.

    Entry ``[i - 1]`` of the result is the matrix power ``P^i`` (the i-step
    transition probabilities). The powers are true matrix products; the
    ``**`` operator on a NumPy array would only raise each entry to the i-th
    power element-wise.

    Args:
        P: Square transition matrix (array-like of shape ``(n, n)``).
        k: Number of steps; must be non-negative.

    Returns:
        float64 array of shape ``(k, n, n)``.
    """
    P = np.asarray(P, dtype=np.float64)
    n_rows, n_cols = P.shape
    pk = np.empty((k, n_rows, n_cols), dtype=np.float64)

    # Build P^1, P^2, ... incrementally: one matrix product per step.
    walk = np.eye(n_rows)
    for step in range(k):
        walk = np.dot(walk, P)
        pk[step] = walk
    return pk

def pk_sum(pk, k, node):
    """Return row *node* of ``pk[0] + pk[1] + ... + pk[k - 1]``.

    Only the requested row is accumulated, into a fresh array, so *pk* is
    left untouched and callers do not need to pass a defensive copy.

    Args:
        pk: Sequence of step matrices, e.g. the ``(k, n, n)`` array produced
            by ``pk_visit``.
        k: Number of leading matrices to add up. For ``k <= 1`` only
            ``pk[0]`` contributes.
        node: Row index to extract.

    Returns:
        1-D array holding the summed row.

    Raises:
        IndexError: If *pk* has fewer than ``max(k, 1)`` matrices or *node*
            is out of range.
    """
    # ``+ 0`` forces a new array instead of a view into ``pk``.
    total = pk[0][node] + 0
    for i in range(1, k):
        total = total + pk[i][node]
    return total

def l1_norm(v1, v2):
    """Return the L1 distance between *v1* and *v2* (sum of absolute differences)."""
    difference = v1 - v2
    return np.abs(difference).sum()

def sim_k(k, v1, v2):
    """Return the similarity ``exp(2k - ||v1 - v2||_1) - 1`` of two vectors.

    The smaller the L1 distance between *v1* and *v2*, the larger the result.
    """
    distance = l1_norm(v1, v2)
    exponent = 2 * k - distance
    return np.exp(exponent) - 1

def sim_mat(G, P, k, threshold = 1):
    """Score every edge of *G* by random-walk similarity and tag it for drawing.

    For each edge ``(frm, to)`` the rows ``frm`` and ``to`` of
    ``P^1 + ... + P^k`` (as produced by ``pk_visit``) are compared with
    ``sim_k``. Edges whose similarity exceeds *threshold* are stored as
    ``color='green'`` / ``weights=2``; all others as ``color='blue'`` /
    ``weights=0.5``. The attributes are written onto *G* in place.

    Node labels are used directly as row indices, so they are expected to be
    the integers ``0 .. n-1`` in the same order as the rows of *P*.

    Args:
        G: networkx graph whose edges are annotated.
        P: Transition matrix of *G*.
        k: Number of random-walk steps taken into account (expected >= 1).
        threshold: Similarity above which an edge is marked green.

    Returns:
        1-D float array with one similarity per edge, in ``G.edges()`` order.
    """
    # The summed step matrices are the same for every edge, so compute them
    # once instead of copying and re-summing the whole tensor per endpoint.
    visit_sums = pk_visit(P, k).sum(axis=0)

    similarities = []
    for frm, to in G.edges():
        s = sim_k(k, visit_sums[frm], visit_sums[to])
        if s <= threshold:
            G[frm][to]['color'] = 'blue'
            G[frm][to]['weights'] = 0.5
        else:
            G[frm][to]['color'] = 'green'
            G[frm][to]['weights'] = 2
        similarities.append(s)
    return np.array(similarities, dtype=np.float64)

def mark_clusters(G, k=1, threshold=1):
    """Annotate the edges of *G* in place with the styling computed by ``sim_mat``.

    The transition matrix is derived from the adjacency matrix of *G*; the
    similarity values returned by ``sim_mat`` are discarded.
    """
    transition = transition_matrix_2(adj_matrix(G))
    sim_mat(G, transition, k, threshold=threshold)

def draw(G, pos):
    """Draw *G* at layout *pos*, styling each edge from its stored attributes.

    Every edge must already carry a 'color' and a 'weights' attribute.
    """
    edge_colors = []
    weights = []
    for u, v in G.edges():
        attributes = G[u][v]
        edge_colors.append(attributes['color'])
        weights.append(attributes["weights"])

    nx.draw(
        G,
        pos,
        with_labels=True,
        edge_color=edge_colors,
        node_color="green",
        width=weights,
    )

Tests

# Benchmark graphs with an obvious cluster structure.
Gs = [
    nx.balanced_tree(1, 3),
    nx.balanced_tree(2, 3),
    nx.connected_caveman_graph(4, 10),
    nx.ring_of_cliques(5, 5),
    nx.ring_of_cliques(10, 10),
    nx.connected_caveman_graph(7, 7),
]

for G in Gs:
    # Exactly one new 4x4 figure per graph. Creating a figure and then
    # selecting one by number can activate a different, already existing
    # figure and draw over it.
    plt.figure(figsize=(4, 4))

    pos = nx.spring_layout(G)
    mark_clusters(G, k=1, threshold=1)
    draw(G, pos)

import random

# Generate synthetic graph with clusters
def generate_clustered_graph(p_in=0.8, n_links=10, seed=None):
    """Generate a random undirected graph made of six 10-node clusters.

    Nodes ``0-9``, ``10-19``, ..., ``50-59`` form the clusters. Inside a
    cluster every ordered pair ``(i, j)`` with ``i != j`` is linked with
    probability *p_in*; because each unordered pair is therefore tried twice,
    the chance that two nodes of a cluster end up connected is
    ``1 - (1 - p_in) ** 2``. Afterwards *n_links* rounds of random bridges
    are added between a fixed set of cluster pairs.

    Args:
        p_in: Per-trial probability of an intra-cluster edge.
        n_links: Number of rounds of inter-cluster edges; each round adds one
            edge per connected cluster pair (duplicates collapse).
        seed: Optional seed for a private random generator. With ``None`` the
            module-level ``random`` functions are used, as before.

    Returns:
        networkx.Graph containing the generated edges.
    """
    rng = random if seed is None else random.Random(seed)
    G = nx.Graph()

    # Materialise the node lists once; they are reused for every random pick.
    members = [list(range(start, start + 10)) for start in range(0, 60, 10)]

    # Dense wiring inside each cluster.
    for nodes in members:
        for i in nodes:
            for j in nodes:
                # The short-circuit keeps the generator untouched for i == j.
                if i != j and rng.random() < p_in:
                    G.add_edge(i, j)

    # Sparse wiring between selected cluster pairs (cluster 2 acts as a hub).
    bridges = [(0, 1), (1, 2), (0, 2), (3, 2), (4, 2), (5, 1)]
    for _ in range(n_links):
        for first, second in bridges:
            G.add_edge(rng.choice(members[first]), rng.choice(members[second]))

    return G

# Create the clustered graph
G = generate_clustered_graph()

# Large canvas so the 60-node graph stays readable.
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G)

# Tag the edges with colour/width first, then render them at the fixed layout.
mark_clusters(G, k=1, threshold=1)
draw(G, pos)

632 KiB Raw Blame History