# Source code for directedstructure.core

# Wrappers for the core functionality

from __future__ import annotations

import numpy as np
from numpy.typing import ArrayLike
import networkx as nx
import pandas as pd

from .input_output import _check_input_validity, _UNSET, _set_model_parameters
from ._graph_cache import (
    _store_wins_graph,
    _get_wins_graph,
    _store_ties_graph,
    _get_ties_graph,
)
from .sample import _samples_cached
from .clusterings import consensus_clustering



# [docs]
def samples(
    G: nx.Graph,
    *,
    groups_model: str = "general_canonical",
    hierarchy_model: str = _UNSET,
    interaction: str = "coupled",
    assortative: bool = _UNSET,
    mixing_variation: str = _UNSET,
    variation_in: float | None = _UNSET,
    variation_out: float | None = _UNSET,
    degree_correction: float | None = _UNSET,
    mean_degree_scaling: float | None = _UNSET,
    individual_depth: float | None = _UNSET,
    group_depth: float | None = _UNSET,
    ties_parameter: float | None = _UNSET,
    num_groups: int | None = _UNSET,
    initial_partition: list[int] | None = None,
    initial_scores: list[float] | None = None,
    seed: int | None = None,
    num_samples: int = 1000,
    sweeps_per_sample: int = 10,
    merge_split_enabled: bool = True,
    beta: float = 1.0,
    num_tempering_chains: int = 1,
    no_cache: bool = False,
    timeout: float = 60.0,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Draw posterior samples for the provided network and return them as a pandas DataFrame.

    Samples are cached: calling again with the same parameters returns the cached samples.
    Provide a new seed to generate independent samples.

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of. Edges whose ``"type"`` attribute is
        ``"dominant"`` are treated as wins and edges whose ``"type"`` attribute is
        ``"neutral"`` are treated as ties; edges with any other (or no) type are not used.
    groups_model : str, optional
        Type of model to fit. Choices fix parameters to special cases of interest, but can be
        overridden by providing specific parameter values.
        Options and corresponding parameter values:
            - 'general_canonical': All parameters inferred except mean_degree_scaling, which is set to 0 (canonical model) [Default]
            - 'traditional_SBM': assortative = False, mixing_variation = 'simple', degree_correction = 0, mean_degree_scaling = 0
            - 'traditional_DCSBM': assortative = False, mixing_variation = 'simple', degree_correction = 0.5, mean_degree_scaling = 0 (traditional degree-corrected stochastic block model)
            - 'traditional_GDCSBM': assortative = False, mixing_variation = 'simple', degree_correction = None, mean_degree_scaling = 0 (infer degree correction, generalized)
            - 'simple_ASBM': assortative = True, mixing_variation = 'simple', degree_correction = 0, mean_degree_scaling = 0 (assortative with simple mixing variation)
            - 'hybrid_ASBM': assortative = True, mixing_variation = 'internal', degree_correction = 0, mean_degree_scaling = 0 (assortative with internal mixing variation, Zhang and Peixoto 2020)
            - 'planted_partition': assortative = True, mixing_variation = 'none', degree_correction = 0, mean_degree_scaling = 0
            - 'general_ASBM': assortative = True, mixing_variation = 'none', degree_correction = 0, mean_degree_scaling = 0
            - 'microcanonical_SBM': assortative = False, mixing_variation = 'simple', degree_correction = 0, mean_degree_scaling = -1 (microcanonical SBM)
            - 'microcanonical_DCSBM': assortative = False, mixing_variation = 'simple', degree_correction = 0.5, mean_degree_scaling = -1 (microcanonical degree-corrected SBM)
            - 'general_unified': All parameters inferred including mean_degree_scaling (generalizes all models considered)
        Defaults to 'general_canonical'. If a different groups_model is provided, the
        corresponding parameters are set unless explicitly overridden.
    hierarchy_model : str, optional
        Type of hierarchy model to fit. Options:
            - 'bradley_terry': Bradley-Terry model for pairwise comparisons
            - 'bradley_terry_ties': Bradley-Terry model with ties
        Defaults to 'bradley_terry_ties' when neutral interactions are present and
        'bradley_terry' otherwise. If neutral interactions are present and 'bradley_terry'
        is chosen, only dominant interactions are used.
    interaction : str, optional
        Type of coupling between groups and hierarchy to use. Options:
            - 'coupled': groups and hierarchy are coupled (default)
            - 'independent': groups and hierarchy are inferred independently
    assortative : bool, optional
        Whether to allow the model to consider generically assortative group structure.
        If assortative = False, fix the overall in and out group densities to be equal:
        rho_in = rho_out. Defaults to True.
    mixing_variation : str, optional
        Type of variation in mixing weights to use. Controls the variation_in and
        variation_out parameters. Options:
            - 'general': infers variation_in and variation_out
            - 'simple': fixes variation_in = variation_out = 0.5
            - 'none': fixes variation_in = variation_out = 0
            - 'internal': fixes variation_in = 0.5, variation_out = 0
            - 'external': fixes variation_in = 0, variation_out = 0.5
        Defaults to 'general'.
    variation_in : float, optional
        Value of the variation_in (intra-group mixing variation) parameter. Must be from 0 to 1.
        variation_in = 0 yields no variation in internal mixing weights, as found in the
        planted partition model, while variation_in = 0.5 corresponds to typical variation
        as found in the traditional SBM.
        A value of None allows the parameter to vary freely. Defaults to None.
    variation_out : float, optional
        Value of the variation_out (inter-group mixing variation) parameter. Must be from 0 to 1.
        variation_out = 0 yields no variation in external mixing weights, as found in the
        planted partition model, while variation_out = 0.5 corresponds to typical variation
        as found in the traditional SBM.
        A value of None allows the parameter to vary freely. Defaults to None.
    degree_correction : float, optional
        Value of the degree_correction (in-group degree inequality) parameter. Must be from 0 to 1.
        A value of 0 corresponds to no degree correction and a value of 0.5 corresponds to
        typical degree correction.
        A value of None allows the parameter to vary freely. Defaults to None.
    mean_degree_scaling : float, optional
        Value of the mean_degree_scaling (gamma) parameter. Mean degree of nodes in a group of
        size n scales as n^mean_degree_scaling. Traditional canonical models have gamma = 0,
        while microcanonical models have gamma = -1.
        A value of None allows the parameter to vary freely. Defaults to 0.
    individual_depth : float, optional
        Value of the individual_depth parameter controlling the spread of individual scores
        within groups. Must be positive.
        A value of None allows the parameter to vary freely. Defaults to 1.0.
    group_depth : float, optional
        Value of the group_depth parameter controlling the spread of group scores. Must be
        non-negative. A value of None allows the parameter to vary freely.
        Defaults to 0.0, which corresponds to independent groups and hierarchy.
    ties_parameter : float, optional
        Value of the ties_parameter controlling the frequency of ties in the hierarchy model.
        A value of None allows the parameter to vary freely.
        Defaults to 0.0 (no ties, Bradley-Terry model).
    num_groups : int, optional
        Number of communities (q) in the graph. If None, the number of communities is
        inferred from the data. Defaults to None.
    initial_partition : list[int], optional
        Initial partition of the graph. If None, a modularity maximized partition is used.
        Defaults to None.
    initial_scores : list[float], optional
        Initial scores of the nodes for the hierarchy model. If None, win percentages are
        used. Defaults to None.
    seed : int, optional
        Random seed for reproducibility. Defaults to None.
    num_samples : int, optional
        Number of samples to take. Defaults to 1000.
    sweeps_per_sample : int, optional
        Number of MCMC sweeps to perform between each sample. Defaults to 10.
    merge_split_enabled : bool, optional
        Whether to use merge-split moves in the MCMC sampling. Defaults to True.
    beta : float, optional
        Inverse temperature. Defaults to 1.0.
    num_tempering_chains : int, optional
        Number of parallel tempering chains to use. Defaults to 1, no parallel tempering.
        If greater than 1, samples will include their value of beta ranging from 0 to the
        provided beta.
    no_cache : bool, optional
        If True, do not use cached samples. Defaults to False.
    timeout : float, optional
        Maximum time in seconds to take samples for before timing out. Defaults to 60.
    verbose : bool, optional
        Whether to print progress and debug information. Defaults to False.

    Returns
    -------
    pd.DataFrame
        DataFrame of posterior samples. Each row is a sample, and each column is a variable.
    """

    # Validate the raw user input before any work is done.
    # NOTE(review): sweeps_per_sample is not forwarded to the validator -- confirm intended.
    validation_args = dict(
        G=G,
        groups_model=groups_model,
        hierarchy_model=hierarchy_model,
        interaction=interaction,
        assortative=assortative,
        mixing_variation=mixing_variation,
        variation_in=variation_in,
        variation_out=variation_out,
        degree_correction=degree_correction,
        mean_degree_scaling=mean_degree_scaling,
        individual_depth=individual_depth,
        group_depth=group_depth,
        ties_parameter=ties_parameter,
        num_groups=num_groups,
        initial_partition=initial_partition,
        initial_scores=initial_scores,
        seed=seed,
        num_samples=num_samples,
        merge_split_enabled=merge_split_enabled,
        beta=beta,
        num_tempering_chains=num_tempering_chains,
        no_cache=no_cache,
        timeout=timeout,
        verbose=verbose,
    )
    _check_input_validity(**validation_args)

    # Build a directed graph of wins and an undirected graph of ties, both
    # carrying every node (and its attributes) of the input graph.
    wins_graph = nx.DiGraph()
    ties_graph = nx.Graph()
    for graph in (wins_graph, ties_graph):
        graph.add_nodes_from(G.nodes(data=True))

    # The plain Bradley-Terry model drops ties, so neutral edges are skipped for it.
    keep_ties = hierarchy_model != "bradley_terry"
    for source, target, attributes in G.edges(data=True):
        edge_type = attributes.get("type")
        if edge_type == "dominant":
            wins_graph.add_edge(source, target)
        elif edge_type == "neutral" and keep_ties:
            ties_graph.add_edge(source, target)

    if verbose:
        print(
            f"Graph has {G.number_of_nodes()} nodes, {wins_graph.number_of_edges()} dominant, and {ties_graph.number_of_edges()} neutral edges."
        )

    # Store both graphs in the graph cache; the returned hashes stand in for
    # the graphs in the call to the cached sampler below.
    wins_hash = _store_wins_graph(wins_graph)
    ties_hash = _store_ties_graph(ties_graph)

    # Lists are unhashable, so convert the optional initial state to tuples.
    partition_key = None if initial_partition is None else tuple(initial_partition)
    scores_key = None if initial_scores is None else tuple(initial_scores)

    # Resolve the concrete model parameters implied by the chosen models and
    # any explicit overrides. The resolved mixing_variation is not needed
    # afterwards: only variation_in / variation_out are passed to the sampler.
    (
        assortative,
        _resolved_mixing_variation,
        variation_in,
        variation_out,
        degree_correction,
        mean_degree_scaling,
        individual_depth,
        group_depth,
        ties_parameter,
        num_groups,
    ) = _set_model_parameters(
        groups_model,
        hierarchy_model,
        interaction,
        assortative,
        mixing_variation,
        variation_in,
        variation_out,
        degree_correction,
        mean_degree_scaling,
        individual_depth,
        group_depth,
        ties_parameter,
        num_groups,
    )

    # Positional order matters here: it is the call signature of the cached sampler.
    sampler_args = (
        wins_hash,
        ties_hash,
        assortative,
        variation_in,
        variation_out,
        degree_correction,
        mean_degree_scaling,
        individual_depth,
        group_depth,
        ties_parameter,
        num_groups,
        partition_key,
        scores_key,
        seed,
        num_samples,
        sweeps_per_sample,
        merge_split_enabled,
        beta,
        num_tempering_chains,
        no_cache,
        timeout,
        verbose,
    )
    return _samples_cached(*sampler_args)



def node_groups(
    G: nx.Graph,
    *,
    model_name: str = "general_canonical",
    assortative: bool = _UNSET,
    mixing_variation: str = _UNSET,
    variation_in: float | None = _UNSET,
    variation_out: float | None = _UNSET,
    degree_correction: float | None = _UNSET,
    mean_degree_scaling: float | None = _UNSET,
    num_groups: int | None = _UNSET,
    verbose: bool = False,
    **kwargs,
) -> pd.DataFrame:
    """
    Find groups of nodes in the network using a stochastic block model with various possible configurations.

    Posterior samples are drawn with :func:`samples` and the per-sample group
    assignments are reduced to a single consensus clustering.

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of.
    model_name : str, optional
        Type of model to fit; forwarded to :func:`samples` as ``groups_model``.
        Choices fix parameters to special cases of interest, but can be overridden by providing specific parameter values.
        Options and corresponding parameter values:
            - 'general_canonical': All parameters inferred except mean_degree_scaling, which is set to 0 (canonical model)
            - 'general_unified': All parameters inferred including mean_degree_scaling (generalizes all models considered)
            - 'traditional_SBM': assortative = False, mixing_variation = 'simple', degree_correction = 0, mean_degree_scaling = 0
            - 'traditional_DCSBM': assortative = False, mixing_variation = 'simple', degree_correction = 0.5, mean_degree_scaling = 0 (traditional degree-corrected stochastic block model)
            - 'traditional_GDCSBM': assortative = False, mixing_variation = 'simple', degree_correction = None, mean_degree_scaling = 0 (infer degree correction, generalized)
            - 'simple_ASBM': assortative = True, mixing_variation = 'simple', degree_correction = 0, mean_degree_scaling = 0 (assortative with simple mixing variation)
            - 'hybrid_ASBM': assortative = True, mixing_variation = 'internal', degree_correction = 0, mean_degree_scaling = 0 (assortative with internal mixing variation, Zhang and Peixoto 2020)
            - 'planted_partition': assortative = True, mixing_variation = 'none', degree_correction = 0, mean_degree_scaling = 0
        Defaults to 'general_canonical'. If a different model_name is provided, the corresponding parameters are set unless explicitly overridden.
    assortative : bool, optional
        Whether to allow the model to consider generically assortative group structure.
        If assortative = False, fix the overall in and out group densities to be equal: rho_in = rho_out.
        Defaults to True.
    mixing_variation : str, optional
        Type of variation in mixing weights to use. Controls variation_in, variation_out parameters.
        Options:
            - 'general': infers variation_in and variation_out
            - 'simple': fixes variation_in = variation_out = 0.5
            - 'none': fixes variation_in = variation_out = 0
            - 'internal': fixes variation_in = 0.5, variation_out = 0
            - 'external': fixes variation_in = 0, variation_out = 0.5
        Defaults to 'general'.
    variation_in : float, optional
        Value of the variation_in (intra-group mixing variation) parameter. Must be from 0 to 1.
        variation_in = 0 yields no variation in internal mixing weights, as found in the planted partition model, while
        variation_in = 0.5 corresponds to typical variation as found in the traditional SBM.
        A value of None allows the parameter to vary freely. Defaults to None.
    variation_out : float, optional
        Value of the variation_out (inter-group mixing variation) parameter. Must be from 0 to 1.
        variation_out = 0 yields no variation in external mixing weights, as found in the planted partition model, while
        variation_out = 0.5 corresponds to typical variation as found in the traditional SBM.
        A value of None allows the parameter to vary freely. Defaults to None.
    degree_correction : float, optional
        Value of the degree_correction (in-group degree inequality) parameter. Must be from 0 to 1.
        A value of 0 corresponds to no degree correction and a value of 0.5 corresponds to typical degree correction.
        A value of None allows the parameter to vary freely. Defaults to None.
    mean_degree_scaling : float, optional
        Value of the mean_degree_scaling (gamma) parameter. Mean degree of nodes in a group of size n scales as n^mean_degree_scaling.
        Traditional canonical models have gamma = 0, while microcanonical models have gamma = -1.
        A value of None allows the parameter to vary freely. Defaults to 0.
    num_groups : int, optional
        Number of communities (q) in the graph. If None, the number of communities is inferred from the data. Defaults to None.
    verbose : bool, optional
        Whether to print progress and debug information. Defaults to False.
    kwargs : optional
        Additional keyword arguments to specify the sampling procedure. See :func:`samples` for details.

    Returns
    -------
    pd.DataFrame
        One row per node of ``G`` with columns ``"node"`` (the node name) and
        ``"consensus_group"`` (the node's group in the consensus clustering).
    """

    # Check input validity. The model name is passed under the keyword that
    # `samples` itself uses when it calls the validator (`groups_model`).
    _check_input_validity(
        G=G,
        groups_model=model_name,
        assortative=assortative,
        mixing_variation=mixing_variation,
        variation_in=variation_in,
        variation_out=variation_out,
        degree_correction=degree_correction,
        mean_degree_scaling=mean_degree_scaling,
        num_groups=num_groups,
        verbose=verbose,
        **kwargs,
    )

    # Get the samples for this model configuration (likely cached).
    # `samples` names this option `groups_model`; it has no `model_name` keyword.
    samples_df = samples(
        G,
        groups_model=model_name,
        assortative=assortative,
        mixing_variation=mixing_variation,
        variation_in=variation_in,
        variation_out=variation_out,
        degree_correction=degree_correction,
        mean_degree_scaling=mean_degree_scaling,
        num_groups=num_groups,
        verbose=verbose,
        **kwargs,
    )

    # Per-node group assignments are read from the "<node>_group" columns, the
    # same naming `node_properties` relies on. Selecting them explicitly avoids
    # picking up unrelated columns such as "group_depth".
    node_names = list(G.nodes())
    group_columns = [f"{node}_group" for node in node_names]
    clusterings = samples_df[group_columns].to_numpy()

    if verbose:
        print(samples_df)
        print(clusterings)

    # Find the consensus out of the returned clusterings
    consensus_groups = consensus_clustering(clusterings)

    return pd.DataFrame(
        {
            "node": node_names,
            "consensus_group": consensus_groups,
        }
    )



# [docs]
def node_properties(
    G: nx.Graph, consensus_clustering_metric: str = "L1", **kwargs
) -> pd.DataFrame:
    """
    Infer properties of individual nodes in the network.
    Returns a pandas DataFrame with the consensus group of each node, its score within the hierarchy, and the average status of their group.

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of.
    consensus_clustering_metric : str, optional
        Metric to use for finding consensus clustering. Options: {'L1', 'L2'}. Defaults to 'L1'.
    kwargs : optional
        Additional keyword arguments to specify the model configuration or sampling. See :func:`samples` for full options.

    Returns
    -------
    pd.DataFrame
        One row per node of ``G`` with the columns:
            - ``node``: node name
            - ``consensus_group``: group of the node in the consensus clustering
            - ``score_mean`` / ``score_std``: mean and standard deviation of the node's score across samples
            - ``group_score_mean`` / ``group_score_std``: mean and standard deviation, across samples,
              of the per-sample average score of the members of the node's consensus group
    """
    samples_df = samples(G, **kwargs)

    # Per-node columns in the samples DataFrame are named "<node>_group" and "<node>_score".
    node_names = list(G.nodes())
    group_columns = [f"{node}_group" for node in node_names]
    score_columns = [f"{node}_score" for node in node_names]

    # One row per sample, one column per node.
    clusterings = samples_df[group_columns].to_numpy()

    # Honour the caller's metric instead of always using "L1".
    consensus_groups = consensus_clustering(
        clusterings, metric=consensus_clustering_metric
    )

    # Mean and standard deviation of each node's score across samples.
    node_scores = samples_df[score_columns]
    scores_mean = node_scores.mean().to_numpy()
    scores_std = node_scores.std().to_numpy()

    # Collect, for each consensus group, the positions of its members in
    # node_names. Working with positions avoids a linear `list.index` lookup
    # for every node.
    member_positions = {}
    for position, group in enumerate(consensus_groups):
        member_positions.setdefault(group, []).append(position)

    # For each group, average the members' scores within every sample, then
    # summarise that per-sample group average across samples. Every member of
    # the group receives the same two numbers.
    group_score_means = np.zeros(len(consensus_groups))
    group_score_stds = np.zeros(len(consensus_groups))
    for positions in member_positions.values():
        member_score_columns = [score_columns[i] for i in positions]
        per_sample_group_score = samples_df[member_score_columns].mean(axis=1)
        group_score_means[positions] = per_sample_group_score.mean()
        group_score_stds[positions] = per_sample_group_score.std()

    return pd.DataFrame(
        {
            "node": node_names,
            "consensus_group": consensus_groups,
            "score_mean": scores_mean,
            "score_std": scores_std,
            "group_score_mean": group_score_means,
            "group_score_std": group_score_stds,
        }
    )




# [docs]
def network_properties(G: nx.Graph, **kwargs) -> pd.DataFrame:
    """
    Summarise the inferred properties of the overall network structure.

    Draws posterior samples via :func:`samples` and reports, for each global
    parameter, its mean and standard deviation across the samples. Parameters
    fixed by the model specification are reported as well.

    The summarised parameters are:
        - num_groups: number of groups in the network
        - density_in: average density of edges within groups
        - density_out: average density of edges between groups
        - variation_in: variation in the internal mixing weights of nodes within groups
        - variation_out: variation in the external mixing weights of nodes within groups
        - degree_correction: level of degree correction, or in-group degree inequality
        - mean_degree_scaling: scaling of mean degree with group size, where mean degree of nodes in a group of size n scales as n^mean_degree_scaling
        - individual_depth: spread of individual scores within groups
        - group_depth: spread of group scores
        - ties_parameter: frequency of ties in the hierarchy model

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of.
    kwargs : optional
        Additional keyword arguments to specify the model configuration or sampling. See :func:`samples` for full options.

    Returns
    -------
    pd.DataFrame
        One row per parameter, with columns ``"parameter"``, ``"mean"`` and ``"std"``.
    """
    posterior = samples(G, **kwargs)

    # Names of the sample columns to summarise, in output row order.
    tracked = (
        "num_groups",
        "density_in",
        "density_out",
        "variation_in",
        "variation_out",
        "degree_correction",
        "mean_degree_scaling",
        "individual_depth",
        "group_depth",
        "ties_parameter",
    )

    # Look each column up once, then reduce it to its two summary statistics.
    columns = [posterior[name] for name in tracked]
    summary = {
        "parameter": list(tracked),
        "mean": [column.mean() for column in columns],
        "std": [column.std() for column in columns],
    }
    return pd.DataFrame(summary)



def parameters(
    G: nx.Graph,
    **kwargs,
) -> pd.DataFrame:
    """
    Infer global parameters of the stochastic block model fitted to the network.
    Returns a pandas DataFrame with the inferred parameters and their values, including those fixed by the chosen model.

    This validates the input and then delegates to :func:`network_properties`,
    which computes the mean and standard deviation of each global parameter
    across the posterior samples.

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of.
    kwargs : optional
        Additional keyword arguments to specify the model configuration or sampling. See :func:`samples` for details.

    Returns
    -------
    pd.DataFrame
        One row per parameter, with columns ``"parameter"``, ``"mean"`` and ``"std"``.
    """
    # Check input validity
    _check_input_validity(G=G, **kwargs)

    # network_properties draws the samples for this configuration (likely
    # cached) and reduces them to per-parameter means and standard deviations.
    return network_properties(G, **kwargs)


def mixing_matrix(
    G: nx.Graph,
    node_groups: pd.DataFrame,
    **kwargs,
) -> pd.DataFrame:
    """
    Estimate the probability of connection between each pair of inferred groups in the network.
    Uses globally inferred parameters to adjust observed edge densities.

    NOTE: this function is not implemented yet. It currently validates the
    input, warns about an ignored ``num_groups`` argument, requests the global
    parameters, and returns None.

    TODO: We could fix this to actually just sample the hyperparameters with these fixed group assignments.

    Parameters
    ----------
    G : nx.Graph
        NetworkX graph to find group structure of.
    node_groups : pd.DataFrame
        DataFrame containing the node groups along which to compute the mixing matrix.
    kwargs : optional
        Additional keyword arguments to specify the model configuration or sampling. See `node_groups` and `samples` for details.
    """
    # Validate the graph together with any model/sampling options.
    _check_input_validity(G=G, **kwargs)

    # The grouping comes from node_groups, so a user-supplied num_groups is
    # flagged to the caller.
    num_groups_supplied = "num_groups" in kwargs
    if num_groups_supplied:
        import warnings

        message = "num_groups parameter is ignored in mixing_matrix since groups are determined by node_groups."
        warnings.warn(message, UserWarning)

    # Fetch the global parameters for this model configuration (likely cached).
    # The result is not used yet.
    _global_parameters = parameters(G, **kwargs)

    # TODO: count the number of edges between each pair of groups.
    # This should be implemented in C++ since it will be needed by the predictive portion anyway.
    return None