Source code for directedstructure.core

# Wrappers for the core functionality

from __future__ import annotations

import numpy as np
from numpy.typing import ArrayLike
import networkx as nx
import pandas as pd

from .input_output import _check_input_validity, _UNSET, _set_model_parameters
from ._graph_cache import (
    _store_wins_graph,
    _get_wins_graph,
    _store_ties_graph,
    _get_ties_graph,
)
from .sample import _samples_cached
from .clusterings import consensus_clustering


def samples(
    G: nx.Graph,
    *,
    groups_model: str = "general_canonical",
    hierarchy_model: str = _UNSET,
    interaction: str = "coupled",
    assortative: bool = _UNSET,
    mixing_variation: str = _UNSET,
    variation_in: float | None = _UNSET,
    variation_out: float | None = _UNSET,
    degree_correction: float | None = _UNSET,
    mean_degree_scaling: float | None = _UNSET,
    individual_depth: float | None = _UNSET,
    group_depth: float | None = _UNSET,
    ties_parameter: float | None = _UNSET,
    num_groups: int | None = _UNSET,
    initial_partition: list[int] | None = None,
    initial_scores: list[float] | None = None,
    seed: int | None = None,
    num_samples: int = 1000,
    sweeps_per_sample: int = 10,
    merge_split_enabled: bool = True,
    beta: float = 1.0,
    num_tempering_chains: int = 1,
    no_cache: bool = False,
    timeout: float = 60.0,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Draw posterior samples of group and hierarchy structure for a network.

    Results are cached: calling again with the same parameters returns the
    previously computed samples. Provide a new ``seed`` to generate an
    independent set of samples.

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of. Edges whose ``"type"``
        attribute is ``"dominant"`` are used as directed wins and edges whose
        ``"type"`` is ``"neutral"`` as undirected ties; edges with any other
        type are not used.
    groups_model : str, optional
        Type of group model to fit. Each choice fixes parameters to a special
        case of interest; any parameter passed explicitly overrides the value
        implied by the model. Options and corresponding parameter values:

        - 'general_canonical': all parameters inferred except
          mean_degree_scaling, which is set to 0 (canonical model) [default]
        - 'traditional_SBM': assortative = False, mixing_variation = 'simple',
          degree_correction = 0, mean_degree_scaling = 0
        - 'traditional_DCSBM': assortative = False,
          mixing_variation = 'simple', degree_correction = 0.5,
          mean_degree_scaling = 0 (traditional degree-corrected stochastic
          block model)
        - 'traditional_GDCSBM': assortative = False,
          mixing_variation = 'simple', degree_correction = None,
          mean_degree_scaling = 0 (infer degree correction, generalized)
        - 'simple_ASBM': assortative = True, mixing_variation = 'simple',
          degree_correction = 0, mean_degree_scaling = 0 (assortative with
          simple mixing variation)
        - 'hybrid_ASBM': assortative = True, mixing_variation = 'internal',
          degree_correction = 0, mean_degree_scaling = 0 (assortative with
          internal mixing variation, Zhang and Peixoto 2020)
        - 'planted_partition': assortative = True, mixing_variation = 'none',
          degree_correction = 0, mean_degree_scaling = 0
        - 'general_ASBM': assortative = True, mixing_variation = 'none',
          degree_correction = 0, mean_degree_scaling = 0
        - 'microcanonical_SBM': assortative = False,
          mixing_variation = 'simple', degree_correction = 0,
          mean_degree_scaling = -1 (microcanonical SBM)
        - 'microcanonical_DCSBM': assortative = False,
          mixing_variation = 'simple', degree_correction = 0.5,
          mean_degree_scaling = -1 (microcanonical degree-corrected SBM)
        - 'general_unified': all parameters inferred including
          mean_degree_scaling (generalizes all models considered)

        Defaults to 'general_canonical'.
    hierarchy_model : str, optional
        Type of hierarchy model to fit. Options:

        - 'bradley_terry': Bradley-Terry model for pairwise comparisons
        - 'bradley_terry_ties': Bradley-Terry model with ties

        Defaults to 'bradley_terry_ties' when neutral interactions are
        present and 'bradley_terry' otherwise. If neutral interactions are
        present and 'bradley_terry' is chosen, only dominant interactions are
        used.
    interaction : str, optional
        Type of coupling between groups and hierarchy. Options:

        - 'coupled': groups and hierarchy are coupled (default)
        - 'independent': groups and hierarchy are inferred independently
    assortative : bool, optional
        Whether to allow the model to consider generically assortative group
        structure. If False, the overall in- and out-group densities are
        fixed to be equal: rho_in = rho_out. Defaults to True.
    mixing_variation : str, optional
        Type of variation in mixing weights; controls variation_in and
        variation_out. Options:

        - 'general': infers variation_in and variation_out
        - 'simple': fixes variation_in = variation_out = 0.5
        - 'none': fixes variation_in = variation_out = 0
        - 'internal': fixes variation_in = 0.5, variation_out = 0
        - 'external': fixes variation_in = 0, variation_out = 0.5

        Defaults to 'general'.
    variation_in : float, optional
        Intra-group mixing variation, from 0 to 1. A value of 0 yields no
        variation in internal mixing weights (as in the planted partition
        model); 0.5 corresponds to the typical variation of the traditional
        SBM. None lets the parameter vary freely. Defaults to None.
    variation_out : float, optional
        Inter-group mixing variation, from 0 to 1. A value of 0 yields no
        variation in external mixing weights (as in the planted partition
        model); 0.5 corresponds to the typical variation of the traditional
        SBM. None lets the parameter vary freely. Defaults to None.
    degree_correction : float, optional
        In-group degree inequality, from 0 to 1. A value of 0 means no degree
        correction and 0.5 typical degree correction. None lets the parameter
        vary freely. Defaults to None.
    mean_degree_scaling : float, optional
        The gamma parameter: the mean degree of nodes in a group of size n
        scales as n^mean_degree_scaling. Traditional canonical models have
        gamma = 0 and microcanonical models gamma = -1. None lets the
        parameter vary freely. Defaults to 0.
    individual_depth : float, optional
        Spread of individual scores within groups. Must be positive. None
        lets the parameter vary freely. Defaults to 1.0.
    group_depth : float, optional
        Spread of group scores. Must be non-negative. None lets the parameter
        vary freely. Defaults to 0.0, which corresponds to independent groups
        and hierarchy.
    ties_parameter : float, optional
        Controls the frequency of ties in the hierarchy model. None lets the
        parameter vary freely. Defaults to 0.0 (no ties, Bradley-Terry
        model).
    num_groups : int, optional
        Number of communities (q) in the graph. If None, it is inferred from
        the data. Defaults to None.
    initial_partition : list[int], optional
        Initial partition of the graph. If None, a modularity maximized
        partition is used. Defaults to None.
    initial_scores : list[float], optional
        Initial node scores for the hierarchy model. If None, win percentages
        are used. Defaults to None.
    seed : int, optional
        Random seed for reproducibility. Defaults to None.
    num_samples : int, optional
        Number of samples to take. Defaults to 1000.
    sweeps_per_sample : int, optional
        Number of MCMC sweeps to perform between samples. Defaults to 10.
    merge_split_enabled : bool, optional
        Whether to use merge-split moves in the MCMC sampling. Defaults to
        True.
    beta : float, optional
        Inverse temperature. Defaults to 1.0.
    num_tempering_chains : int, optional
        Number of parallel tempering chains. Defaults to 1 (no parallel
        tempering). If greater than 1, samples include their value of beta,
        ranging from 0 to the provided beta.
    no_cache : bool, optional
        If True, do not use cached samples. Defaults to False.
    timeout : float, optional
        Maximum time in seconds to spend sampling before timing out.
        Defaults to 60.
    verbose : bool, optional
        Whether to print progress and debug information. Defaults to False.

    Returns
    -------
    pd.DataFrame
        DataFrame of posterior samples. Each row is a sample, and each column
        is a variable.
    """
    # Reject bad arguments up front, before any graph work is done.
    _check_input_validity(
        G=G,
        groups_model=groups_model,
        hierarchy_model=hierarchy_model,
        interaction=interaction,
        assortative=assortative,
        mixing_variation=mixing_variation,
        variation_in=variation_in,
        variation_out=variation_out,
        degree_correction=degree_correction,
        mean_degree_scaling=mean_degree_scaling,
        individual_depth=individual_depth,
        group_depth=group_depth,
        ties_parameter=ties_parameter,
        num_groups=num_groups,
        initial_partition=initial_partition,
        initial_scores=initial_scores,
        seed=seed,
        num_samples=num_samples,
        merge_split_enabled=merge_split_enabled,
        beta=beta,
        num_tempering_chains=num_tempering_chains,
        no_cache=no_cache,
        timeout=timeout,
        verbose=verbose,
    )

    # Both derived graphs carry the full node set (with attributes) of G;
    # wins are directed, ties are undirected.
    wins_graph = nx.DiGraph()
    ties_graph = nx.Graph()
    wins_graph.add_nodes_from(G.nodes(data=True))
    ties_graph.add_nodes_from(G.nodes(data=True))

    for source, target, attrs in G.edges(data=True):
        edge_type = attrs.get("type")
        if edge_type == "dominant":
            wins_graph.add_edge(source, target)
        elif edge_type == "neutral" and hierarchy_model != "bradley_terry":
            # The plain Bradley-Terry model drops ties, so neutral edges are
            # only recorded for every other hierarchy model setting.
            ties_graph.add_edge(source, target)

    if verbose:
        print(
            f"Graph has {G.number_of_nodes()} nodes, "
            f"{wins_graph.number_of_edges()} dominant, "
            f"and {ties_graph.number_of_edges()} neutral edges."
        )

    # The cached sampler is keyed on graph hashes rather than graph objects.
    wins_hash = _store_wins_graph(wins_graph)
    ties_hash = _store_ties_graph(ties_graph)

    # Lists are unhashable; convert them to tuples so they can be cache keys.
    partition_key = None if initial_partition is None else tuple(initial_partition)
    scores_key = None if initial_scores is None else tuple(initial_scores)

    # Resolve the concrete model parameters implied by the chosen models,
    # honoring any explicitly supplied values.
    resolved = _set_model_parameters(
        groups_model,
        hierarchy_model,
        interaction,
        assortative,
        mixing_variation,
        variation_in,
        variation_out,
        degree_correction,
        mean_degree_scaling,
        individual_depth,
        group_depth,
        ties_parameter,
        num_groups,
    )
    (
        assortative,
        mixing_variation,  # resolved but not forwarded to the sampler
        variation_in,
        variation_out,
        degree_correction,
        mean_degree_scaling,
        individual_depth,
        group_depth,
        ties_parameter,
        num_groups,
    ) = resolved

    return _samples_cached(
        wins_hash,
        ties_hash,
        assortative,
        variation_in,
        variation_out,
        degree_correction,
        mean_degree_scaling,
        individual_depth,
        group_depth,
        ties_parameter,
        num_groups,
        partition_key,
        scores_key,
        seed,
        num_samples,
        sweeps_per_sample,
        merge_split_enabled,
        beta,
        num_tempering_chains,
        no_cache,
        timeout,
        verbose,
    )
def node_groups(
    G: nx.Graph,
    *,
    model_name: str = "general_canonical",
    assortative: bool = _UNSET,
    mixing_variation: str = _UNSET,
    variation_in: float | None = _UNSET,
    variation_out: float | None = _UNSET,
    degree_correction: float | None = _UNSET,
    mean_degree_scaling: float | None = _UNSET,
    num_groups: int | None = _UNSET,
    verbose: bool = False,
    **kwargs,
) -> pd.DataFrame:
    """
    Find groups of nodes in the network using a stochastic block model with
    various possible configurations.

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of.
    model_name : str, optional
        Type of model to fit; forwarded to :func:`samples` as its
        ``groups_model`` argument. Choices fix parameters to special cases of
        interest, but can be overridden by providing specific parameter
        values. Options and corresponding parameter values:

        - 'general_canonical': all parameters inferred except
          mean_degree_scaling, which is set to 0 (canonical model)
        - 'general_unified': all parameters inferred including
          mean_degree_scaling (generalizes all models considered)
        - 'traditional_SBM': assortative = False, mixing_variation = 'simple',
          degree_correction = 0, mean_degree_scaling = 0
        - 'traditional_DCSBM': assortative = False,
          mixing_variation = 'simple', degree_correction = 0.5,
          mean_degree_scaling = 0 (traditional degree-corrected stochastic
          block model)
        - 'traditional_GDCSBM': assortative = False,
          mixing_variation = 'simple', degree_correction = None,
          mean_degree_scaling = 0 (infer degree correction, generalized)
        - 'simple_ASBM': assortative = True, mixing_variation = 'simple',
          degree_correction = 0, mean_degree_scaling = 0 (assortative with
          simple mixing variation)
        - 'hybrid_ASBM': assortative = True, mixing_variation = 'internal',
          degree_correction = 0, mean_degree_scaling = 0 (assortative with
          internal mixing variation, Zhang and Peixoto 2020)
        - 'planted_partition': assortative = True, mixing_variation = 'none',
          degree_correction = 0, mean_degree_scaling = 0

        Defaults to 'general_canonical'. If a different model_name is
        provided, the corresponding parameters are set unless explicitly
        overridden.
    assortative : bool, optional
        Whether to allow the model to consider generically assortative group
        structure. If False, the overall in- and out-group densities are
        fixed to be equal: rho_in = rho_out. Defaults to True.
    mixing_variation : str, optional
        Type of variation in mixing weights; controls variation_in and
        variation_out. Options:

        - 'general': infers variation_in and variation_out
        - 'simple': fixes variation_in = variation_out = 0.5
        - 'none': fixes variation_in = variation_out = 0
        - 'internal': fixes variation_in = 0.5, variation_out = 0
        - 'external': fixes variation_in = 0, variation_out = 0.5

        Defaults to 'general'.
    variation_in : float, optional
        Intra-group mixing variation, from 0 to 1. A value of 0 yields no
        variation in internal mixing weights (as in the planted partition
        model); 0.5 corresponds to the typical variation of the traditional
        SBM. None lets the parameter vary freely. Defaults to None.
    variation_out : float, optional
        Inter-group mixing variation, from 0 to 1. A value of 0 yields no
        variation in external mixing weights (as in the planted partition
        model); 0.5 corresponds to the typical variation of the traditional
        SBM. None lets the parameter vary freely. Defaults to None.
    degree_correction : float, optional
        In-group degree inequality, from 0 to 1. A value of 0 means no degree
        correction and 0.5 typical degree correction. None lets the parameter
        vary freely. Defaults to None.
    mean_degree_scaling : float, optional
        The gamma parameter: the mean degree of nodes in a group of size n
        scales as n^mean_degree_scaling. Traditional canonical models have
        gamma = 0 and microcanonical models gamma = -1. None lets the
        parameter vary freely. Defaults to 0.
    num_groups : int, optional
        Number of communities (q) in the graph. If None, it is inferred from
        the data. Defaults to None.
    verbose : bool, optional
        Whether to print progress and debug information. Defaults to False.
    kwargs : optional
        Additional keyword arguments to specify the sampling procedure. See
        :func:`samples` for details.

    Returns
    -------
    pd.DataFrame
        One row per node of ``G`` with columns ``node`` (the node name) and
        ``consensus_group`` (the group assigned to it by the consensus over
        the sampled clusterings).
    """
    # `samples` names this option `groups_model`; passing `model_name`
    # through unchanged would be rejected as an unexpected keyword.
    _check_input_validity(
        G=G,
        groups_model=model_name,
        assortative=assortative,
        mixing_variation=mixing_variation,
        variation_in=variation_in,
        variation_out=variation_out,
        degree_correction=degree_correction,
        mean_degree_scaling=mean_degree_scaling,
        num_groups=num_groups,
        verbose=verbose,
        **kwargs,
    )

    # Get the samples for this model configuration (likely cached).
    samples_df = samples(
        G,
        groups_model=model_name,
        assortative=assortative,
        mixing_variation=mixing_variation,
        variation_in=variation_in,
        variation_out=variation_out,
        degree_correction=degree_correction,
        mean_degree_scaling=mean_degree_scaling,
        num_groups=num_groups,
        verbose=verbose,
        **kwargs,
    )

    # Per-node group assignments live in "<node>_group" columns. Select them
    # by exact name: a substring filter on "group_" would instead pick up
    # parameter columns such as "group_depth".
    node_names = list(G.nodes())
    group_columns = [f"{node}_group" for node in node_names]
    clusterings = samples_df[group_columns].to_numpy()

    # Reduce the sampled clusterings (one row per sample) to a single one.
    consensus_groups = consensus_clustering(clusterings)

    return pd.DataFrame(
        {
            "node": node_names,
            "consensus_group": consensus_groups,
        }
    )
def node_properties(
    G: nx.Graph, consensus_clustering_metric="L1", **kwargs
) -> pd.DataFrame:
    """
    Infer properties of individual nodes in the network.

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of.
    consensus_clustering_metric : str, optional
        Metric passed to ``consensus_clustering`` when reducing the sampled
        clusterings to one. Options: {'L1', 'L2'}. Defaults to 'L1'.
    kwargs : optional
        Additional keyword arguments to specify the model configuration or
        sampling. See :func:`samples` for full options.

    Returns
    -------
    pd.DataFrame
        One row per node of ``G`` with columns:

        - ``node``: node name
        - ``consensus_group``: consensus group of the node
        - ``score_mean`` / ``score_std``: mean and standard deviation of the
          node's score across samples
        - ``group_score_mean`` / ``group_score_std``: mean and standard
          deviation, across samples, of the per-sample average score of the
          members of the node's consensus group
    """
    samples_df = samples(G, **kwargs)
    node_names = list(G.nodes())

    # Stack the "<node>_group" columns into an array with one row per sample
    # and one column per node.
    group_columns = [f"{node}_group" for node in node_names]
    clusterings = np.array(
        [samples_df[column].values for column in group_columns]
    ).T

    # Honor the caller's metric instead of always using "L1".
    consensus_groups = consensus_clustering(
        clusterings, metric=consensus_clustering_metric
    )

    # Per-node score summaries across samples.
    score_columns = [f"{node}_score" for node in node_names]
    scores_mean = samples_df[score_columns].mean().to_numpy()
    scores_std = samples_df[score_columns].std().to_numpy()

    # Collect the positions (indices into node_names) of each group's
    # members; working with positions avoids a linear list search per node.
    member_positions = {}
    for position, group in enumerate(consensus_groups):
        member_positions.setdefault(group, []).append(position)

    group_score_means = np.zeros(len(consensus_groups))
    group_score_stds = np.zeros(len(consensus_groups))
    for positions in member_positions.values():
        member_score_columns = [score_columns[position] for position in positions]
        # Average the members' scores within each sample first, then
        # summarize that per-sample group average across samples.
        per_sample_group_score = samples_df[member_score_columns].mean(axis=1)
        group_score_means[positions] = per_sample_group_score.mean()
        group_score_stds[positions] = per_sample_group_score.std()

    return pd.DataFrame(
        {
            "node": node_names,
            "consensus_group": consensus_groups,
            "score_mean": scores_mean,
            "score_std": scores_std,
            "group_score_mean": group_score_means,
            "group_score_std": group_score_stds,
        }
    )
def network_properties(G: nx.Graph, **kwargs) -> pd.DataFrame:
    """
    Summarize the inferred global structure of the network.

    Draws posterior samples via :func:`samples` and reports, for each global
    parameter (including those fixed by the model specification), its mean
    and standard deviation across the samples. The parameters are:

    - num_groups: number of groups in the network
    - density_in: average density of edges within groups
    - density_out: average density of edges between groups
    - variation_in: variation in the internal mixing weights of nodes within
      groups
    - variation_out: variation in the external mixing weights of nodes within
      groups
    - degree_correction: level of degree correction, or in-group degree
      inequality
    - mean_degree_scaling: scaling of mean degree with group size, where the
      mean degree of nodes in a group of size n scales as
      n^mean_degree_scaling
    - individual_depth: spread of individual scores within groups
    - group_depth: spread of group scores
    - ties_parameter: frequency of ties in the hierarchy model

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of.
    kwargs : optional
        Additional keyword arguments to specify the model configuration or
        sampling. See :func:`samples` for full options.

    Returns
    -------
    pd.DataFrame
        One row per parameter, with columns ``parameter``, ``mean`` and
        ``std``.
    """
    posterior = samples(G, **kwargs)

    summarized = (
        "num_groups",
        "density_in",
        "density_out",
        "variation_in",
        "variation_out",
        "degree_correction",
        "mean_degree_scaling",
        "individual_depth",
        "group_depth",
        "ties_parameter",
    )

    # Walk the parameter columns once, collecting both summary statistics.
    means = []
    stds = []
    for name in summarized:
        column = posterior[name]
        means.append(column.mean())
        stds.append(column.std())

    return pd.DataFrame(
        {
            "parameter": list(summarized),
            "mean": means,
            "std": stds,
        }
    )
def parameters(
    G: nx.Graph,
    **kwargs,
) -> pd.DataFrame:
    """
    Infer global parameters of the stochastic block model fitted to the
    network.

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of.
    kwargs : optional
        Additional keyword arguments to specify the model configuration or
        sampling. See :func:`node_groups` and :func:`samples` for details.

    Returns
    -------
    pd.DataFrame
        The summary produced by :func:`network_properties`: the mean and
        standard deviation of each global parameter across the posterior
        samples, including parameters fixed by the chosen model.
    """
    # Check input validity
    _check_input_validity(G=G, **kwargs)

    # The mean/std summary of the global parameters is exactly what
    # `network_properties` computes from the (likely cached) samples, so
    # delegate instead of sampling here and discarding the result.
    return network_properties(G, **kwargs)


def mixing_matrix(
    G: nx.Graph,
    node_groups: pd.DataFrame,
    **kwargs,
) -> pd.DataFrame:
    """
    Estimate the probability of connection between each pair of inferred
    groups in the network.

    Intended to use globally inferred parameters to adjust observed edge
    densities.

    NOTE: the mixing-matrix computation itself is not implemented yet. The
    function currently validates its inputs, fetches the global parameters
    and returns None.

    TODO: We could fix this to actually just sample the hyperparameters with
    these fixed group assignments.

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of.
    node_groups : pd.DataFrame
        DataFrame containing the node groups along which to compute the
        mixing matrix.
    kwargs : optional
        Additional keyword arguments to specify the model configuration or
        sampling. See :func:`node_groups` and :func:`samples` for details.
        ``num_groups`` is ignored (with a ``UserWarning``) because the groups
        are determined by ``node_groups``.
    """
    # Check input validity
    _check_input_validity(G=G, **kwargs)

    # num_groups is determined by node_groups: warn, then actually drop it so
    # it does not influence the sampling behind `parameters`.
    if "num_groups" in kwargs:
        import warnings

        warnings.warn(
            "num_groups parameter is ignored in mixing_matrix since groups are determined by node_groups.",
            UserWarning,
            stacklevel=2,  # attribute the warning to the caller's line
        )
        kwargs = {
            key: value for key, value in kwargs.items() if key != "num_groups"
        }

    # Get the parameters for this model configuration (likely cached).
    # Currently unused: it is the input for the computation still to be
    # written below.
    parameters_df = parameters(
        G,
        **kwargs,
    )

    # TODO: Count the number of edges between each pair of groups.
    # This should be implemented in C++ since it will be needed by the
    # predictive portion anyway.
    return None