Source code for KGBN.BMatrix

## Created by Tazein on 1/29/24
"""
Resources for loading boolean networks from files (strings)
"""

import numpy as np
import pandas as pd
import re
from itertools import product
import os

################## equations for simulation variables ##################

BUILT_INS = {'0', '1', 'True', 'False'}


[docs]
def get_equations(file = None, string = None):
    if file is not None:
        with open(file, 'r') as file:
            string = file.readlines()
    elif string is not None:
        string = string.split('\n')
    else:
        raise ValueError("Either file or string must be provided")
    seen_genes = set()
    duplicate_genes = []
    equations = []
    for equation in string:
        equation = equation.strip()
        if len(equation) > 0 and not equation.startswith('#'):
            # Remove inline comments
            if '#' in equation:
                equation = equation.split('#')[0].strip()
            if len(equation) > 0 and '=' in equation:
                parts = equation.split('=')
                gene = parts[0].strip()
                if gene and gene not in seen_genes:
                    seen_genes.add(gene)
                    equations.append(equation)
                elif gene and gene in seen_genes:
                    duplicate_genes.append(gene)
    if len(duplicate_genes) > 0:
        print(f"Duplicate genes found: {duplicate_genes}\nUsing the first occurrence of each gene.")
    return equations




[docs]
def get_gene_dict(equations):
    left_side = []
    for equation in equations:
        parts = equation.split('=')
        value = parts[0].strip()
        if value not in left_side:
            left_side.append(value)
    
    genes = left_side
    
    # making a dictionary for the genes starting from 0
    gene_dict = {gene: i for i, gene in enumerate(genes)}
    return(gene_dict)



[docs]
def get_upstream_genes(equations): 
    "Returns a string of gene names, space-separated, for each of the equations."
    #get only the right side of the equations
    right_side = []
    for equation in equations:
        parts = equation.split('=')
        value = parts[1].strip()
        right_side.append(value)
    functions = right_side

    #getting rid of the Boolean characters ! | & and ()
    characters_to_remove = "!|&()"
    values = []
    for function in functions:
        translation_table = str.maketrans({c: ' ' for c in characters_to_remove})
        cleaned_expression = function.translate(translation_table) 
        tokens = list(set(cleaned_expression.split()))
        tokens = [x for x in tokens if x not in BUILT_INS]
        cleaned_expression = ' '.join(tokens)
        values.append(cleaned_expression)
    upstream_genes = values

    return(upstream_genes)

    

[docs]
def get_connectivity_matrix(equations,upstream_genes,gene_dict):    
    #now we actually make the connectivity matrix
    result_list = []

    for function in upstream_genes:
        genes = function.split()
        values = tuple([gene_dict[gene] for gene in genes])
        result_list.append(values)
    result_array = np.array(result_list, dtype=object) #they are not all the same length
    
    #now we fix the length by adding the -1 (aka the padding) 
    max_length = max(len(t) for t in result_array)
    connectivity_matrix = [tuple(np.pad(t, (0, max_length - len(t)), constant_values=-1)) for t in result_array]
    connectivity_matrix = np.array(connectivity_matrix, dtype=int)
    
    return(connectivity_matrix)



[docs]
def get_truth_table(equations,upstream_genes,show_functions=None):
    
    if show_functions is None: 
        show_functions = False

    #get only the right side of the equations
    right_side = []
    for equation in equations:
        parts = equation.split('=')
        value = parts[1].strip()
        right_side.append(value)
    functions = right_side

    functions = [function.replace('!', ' not ').replace('|', ' or ').replace('&',' and ') for function in functions]
        
    truth = []
    var1 = []
    i = 1

    for i in range(len(equations)):
        function = functions[i]   
        if show_functions is False:
            pass
        else:
            print(function)
            
        variables = [upstream_genes[i]] #get the genes in the expression (ex: FLT3)
        variables = variables[0].split()
        combinations = product([0, 1], repeat=len(variables)) #gets all possiblities
        i += 1
    
        for combo in combinations:
            values = dict(zip(variables, combo))
        
            if len(variables) == 1 and variables[0] == function: #if the node is equal to itself (aka FLT3=FLT3)
                output = values[variables[0]]

            else:
                output = (int(eval(function, values))) #evaluates the equations

            var1.append(output) #adds the output to var1
        
        truth.append(tuple(var1))
        var1 = []

    truth_table = np.array(truth, dtype=object) #they are not all the same length

    #now we fix the length by adding the -1 (aka the padding) 
    max_length = max(len(t) for t in truth_table)
    truth_table = [tuple(np.pad(t, (0, max_length - len(t)), constant_values=-1)) for t in truth_table]
    truth_table = np.array(truth_table)
    
    return(truth_table)


################## knocking in/out genes after creating variables but before simulation ##################

##can also use mutation_dict for perturbed_dict, just replace the file 

[docs]
def get_mutation_dict(file):
    mutation_dict = {}
    
    with open(file) as f:
        for line in f:
            if bool(re.search('=', line)) == False: #there is no = sign
                print('There is a formatting error: ' + str(line) + '\nMake sure that it is formatted with an equal sign. For example: FLT3 = 1')
                return
            
            key, val = line.split("=")
            mutation_dict[key.strip()] = int(val.strip())
    
    return(mutation_dict)



[docs]
def get_knocking_genes(profile, mutation_dict, connectivity_matrix, gene_dict, perturbed_genes=None, perturbed_dict=None):
    ngenes = len(gene_dict)
    mutated_connectivity_matrix = connectivity_matrix.copy()  # Create a copy of connectivity_matrix for each iteration 
    x0 = np.random.randint(2, size=ngenes)  # Random initial state resets with every profile

    
    if perturbed_genes is None: 
        perturbed_genes = []
        
    if perturbed_dict is None:
        perturbed_dict = {}
        
    if profile is not None: #if there is a profile
        mutation_profile = list(set(profile.split(',')))  # Removes any repeat values 
        
    if profile is not None and mutation_dict is None: #there are no mutation_dict (aka no mutations) 
        mutation_profile = ''
            
    if perturbed_genes is not None: #if there are perturbed genes
        perturbed_genes = perturbed_genes if isinstance(perturbed_genes, list) else perturbed_genes.split(',')
        perturbed_genes = list(set(perturbed_genes))  # Removes any repeat values

        
    # Setting that gene's value to wild value
    for gene in mutation_dict:
        if gene == '' or gene == 'NA':
            print('no_mutation')
        else:
            if ( mutation_dict[gene] > 0  ) :
                x0[gene_dict[gene]] = 0 
            else :
                x0[gene_dict[gene]] = 1

    # Make the mutated_connectivity_matrix rows in mutation_profile all -1 
    for gene in mutation_profile:
        if gene == '' or gene == 'NA':
            print('no_mutation')
        else:
            mutated_connectivity_matrix[[gene_dict[gene]], :] = -1  # Knock the connectivity_matrix to -1
            x0[gene_dict[gene]] = mutation_dict.get(gene, 0)  # Setting that gene's value to mutation value
            
    for gene in perturbed_genes:
        if len(gene) == 0:
            print('no perturbed genes in the simulation')
        else:
            mutated_connectivity_matrix[[gene_dict[gene]], :] = -1  # Knock the connectivity_matrix to -1
            x0[gene_dict[gene]] = perturbed_dict.get(gene, 0)  # Setting that gene's value to mutation value
            
    return(mutated_connectivity_matrix,x0)



################## equations for calculating phenotype and network ##################

#getting the equations uses the same function (equations(file))


[docs]
def get_cal_upstream_genes(equations):
    right_side = []
    for equation in equations:
        parts = equation.split('=')
        value = parts[1].strip()
        right_side.append(value)
    functions = right_side

    #getting rid of the Boolean characters ! | & and ()
    characters_to_remove = "!|&()"
    values = []
    for function in functions:
        translation_table = str.maketrans("", "", characters_to_remove)
        cleaned_expression = function.translate(translation_table)  
        cleaned_expression = ' '.join(list(set(cleaned_expression.split())))
        values.append(cleaned_expression)
    cal_upstream_genes = values
    
    for i in range(len(cal_upstream_genes)):
        cal_upstream_genes[i] = cal_upstream_genes[i].split()
    
    return cal_upstream_genes



[docs]
def get_cal_functions(equations):
    right_side = []
    for equation in equations:
        parts = equation.split('=')
        value = parts[1].strip()
        right_side.append(value)
    cal_functions = right_side

    cal_functions = [function.replace('!', '-').replace('|', '+').replace('&','+') for function in cal_functions]

    characters_to_remove = "()"
    values = []
    for function in cal_functions:
        translation_table = str.maketrans("", "", characters_to_remove)
        cleaned_expression = function.translate(translation_table)  
        values.append(cleaned_expression)
        cal_functions = values
    
    #cleaning up the cal_functions format so it can be eval()
    cal_functions = [function.replace(' - ', ' -') for function in cal_functions]
    cal_functions = [function.replace('- ', '-') for function in cal_functions]
    cal_functions = [function.replace('- ', '-') for function in cal_functions]
    cal_functions = [function.replace('  ', ' ') for function in cal_functions]

    values = []
    for function in cal_functions:
        new_func = re.sub(r'(-\w+)', r'(\1)', function)
        values.append(new_func)
        cal_functions = values
        
    return(cal_functions)


#assumes that cal_functions == len(scores_dict)

[docs]
def get_calculating_scores(network_traj, cal_functions, cal_upstream_genes, gene_dict, cal_range=None, scores_dict=None, title=None):
    if scores_dict is None:
        scores_dict = {"Apoptosis": [], "Differentiation": [], "Proliferation": [], "Network": []}
        
    if cal_range is None:
        cal_range = network_traj[-100000:]
    
    if title is None:
        title = ["Apoptosis", "Differentiation", "Proliferation", "Network"]
    
    for i in range(len(cal_functions)):
        score_function = cal_functions[i]
        variables = cal_upstream_genes[i]
        scores = []  # List to store scores for this iteration

        for row in cal_range:
            gene_values = []  # Clear gene_values for each row
            for gene in variables:
                value = row[gene_dict[gene]]
                gene_values.append(value)

            values = dict(zip(variables, gene_values))
            output = int(eval(score_function, values))

            scores.append(output)  # Append the score to the list for this iteration

        scores_dict[title[i]] = scores
        
    # Calculate the 'Network' scores
    scores = []
    Apoptosis = np.mean( scores_dict['Apoptosis'] )
    Differentiation = np.mean( scores_dict['Differentiation'] )
    Proliferation = np.mean( scores_dict['Proliferation'] )
    
    final_scores_dict = {} 
    
    final_scores_dict['Apoptosis']  = Apoptosis
    final_scores_dict['Differentiation'] = Differentiation
    final_scores_dict['Proliferation'] = Proliferation
    #for i in range(len(cal_range)): 
    #output = Proliferation[i] - (Differentiation[i] + Apoptosis[i])
    #scores.append(output)
    
    final_score = Proliferation - Differentiation - Apoptosis
    final_scores_dict['Network'] = final_score #scores
    #final_score = np.mean(scores_dict['Network'])
    
    return (final_scores_dict,final_score)


################## load BN/PBN from files/strings ##################

[docs]
def load_network_from_file(filename, initial_state=None):
    """
    Given a file representing a boolean network, this generates a BooleanNetwork object.

    Formatting:
   
        gene = equation
   
    - all genes must have their own equation in a line (sometimes the equation is just A = A)
    - each equation must have an equal sign and a space before and after it
    - If the equation is a constant value (0 or 1), meaning that the gene is set as mutated/perturbed
    
    Parameters:
    -----------
    filename : str
        Path to the file containing the network definition
    initial_state : array-like or dict, optional
        Initial values for each node. If array-like, order matches gene order in file.
        If dict, keys are gene names and values are initial states (0 or 1).
        If None, random initial values are used.
    """
    from .booleanNetwork import BooleanNetwork
    equations = get_equations(filename)
    ngenes = len(equations)
    gene_dict = get_gene_dict(equations)
    upstream_genes = get_upstream_genes(equations)
    connectivity_matrix = get_connectivity_matrix(equations, upstream_genes, gene_dict)
    truth_table = get_truth_table(equations, upstream_genes)
    
    # Initialize x0 array
    if initial_state is None:
        print('No initial state provided, using a random initial state')
        x0 = np.random.randint(2, size=ngenes) #random inital state 
    elif isinstance(initial_state, dict):
        # Dictionary input: keys are gene names, values are initial states
        x0 = np.zeros(ngenes, dtype=int)
        for gene, value in initial_state.items():
            if gene in gene_dict:
                x0[gene_dict[gene]] = int(value)
        print(f'Initial state set from dictionary. Genes not specified default to 0.')
    else:
        x0 = np.array(initial_state)
    
    # Handle constant values (0 or 1) in equations
    for equation in equations:
        parts = equation.split('=')
        gene = parts[0].strip()
        value = parts[1].strip()
        
        # Check if the equation is a constant value (0 or 1)
        if value == '0' or value == '1':
            gene_index = gene_dict[gene]
            # Set the gene's initial state to the constant value
            x0[gene_index] = int(value)
            # Set the connectivity matrix row to -1 to indicate it's a constant/perturbed gene
            connectivity_matrix[gene_index, :] = -1
        
    # create a Boolean network object
    network = BooleanNetwork(ngenes, connectivity_matrix, truth_table, x0,
            nodeDict=gene_dict, equations=equations)
    print(f"Network loaded successfully. There are {ngenes} genes in the network.")
    return network



[docs]
def load_network_from_string(network_string, initial_state=None):
    """
    Given a string representing a boolean network, this generates a BooleanNetwork object.

    Formatting:

    - all genes must have their own equation (sometimes the equation is just A = A)
    - each equation must have an equal sign and a space before and after it
    - If the equation is a constant value (0 or 1), meaning that the gene is set as mutated/perturbed
    
    Parameters:
    -----------
    network_string : str
        String containing the network definition
    initial_state : array-like or dict, optional
        Initial values for each node. If array-like, order matches gene order in string.
        If dict, keys are gene names and values are initial states (0 or 1).
        If None, random initial values are used.
    """
    from .booleanNetwork import BooleanNetwork
    equations = get_equations(string = network_string)

    ngenes = len(equations)
    gene_dict = get_gene_dict(equations)
    upstream_genes = get_upstream_genes(equations)
    connectivity_matrix = get_connectivity_matrix(equations, upstream_genes, gene_dict)
    truth_table = get_truth_table(equations, upstream_genes)
    if initial_state is None:
        print('No initial state provided, using a random initial state')
        x0 = np.random.randint(2, size=ngenes) #random inital state 
    elif isinstance(initial_state, dict):
        # Dictionary input: keys are gene names, values are initial states
        x0 = np.zeros(ngenes, dtype=int)
        for gene, value in initial_state.items():
            if gene in gene_dict:
                x0[gene_dict[gene]] = int(value)
        print(f'Initial state set from dictionary. Genes not specified default to 0.')
    else:
        x0 = np.array(initial_state)
    
    # Handle constant values (0 or 1) in equations
    for equation in equations:
        parts = equation.split('=')
        gene = parts[0].strip()
        value = parts[1].strip()
        
        # Check if the equation is a constant value (0 or 1)
        if value == '0' or value == '1':
            gene_index = gene_dict[gene]
            x0[gene_index] = int(value)
            connectivity_matrix[gene_index, :] = -1
    
    network = BooleanNetwork(ngenes, connectivity_matrix, truth_table, x0,
            nodeDict=gene_dict, equations=equations)
    print(f"Network loaded successfully. There are {ngenes} genes in the network.")
    return network



[docs]
def load_pbn_from_file(filename, initial_state=None):
    """
    Given a file representing a probabilistic boolean network, this generates a ProbabilisticBN object.

    File format example::
    
        x1 = (x1 | x2 | x3) & (!x1 | x2 | x3), 0.6
        x1 = (x1 | x2 | x3) & (x1 | !x2 | !x3) & (!x1 | x2 | x3), 0.4
        x2 = (x1 | x2 | x3) & (x1 | !x2 | !x3) & (!x1 | !x2 | x3)
        x3 = (!x1 & x2 & x3) | (x1 & !x2 & x3) | (x1 & x2 & !x3) | (x1 & x2 & x3), 0.5
        x3 = (x1 & x2 & x3), 0.5
    

    Each line has format: node = boolean_function, probability
    The probability part can be omitted if there is only one function for that gene.
    The boolean_function can also be a constant value (0 or 1), meaning that the gene is set as mutated/perturbed
    
    Parameters:
    -----------
    filename : str
        Path to the file containing the PBN definition
    initial_state : array-like or dict, optional
        Initial values for each node. If array-like, order matches gene order in file.
        If dict, keys are gene names and values are initial states (0 or 1).
        If None, random initial values are used.
        
    Returns:
    --------
    ProbabilisticBN
        A probabilistic boolean network object
    """
    from .PBN import ProbabilisticBN
    with open(filename, 'r') as file:
        lines = file.readlines()
    # Filter out empty lines and comment lines (starting with #)
    # For non-comment lines, remove inline comments (after #)
    processed_lines = []
    for line in lines:
        line = line.strip()
        if len(line) > 0 and not line.startswith('#'):
            # Remove inline comments
            if '#' in line:
                line = line.split('#')[0].strip()
            if len(line) > 0:
                processed_lines.append(line)
    
    lines = processed_lines

    gene_funcs = {}
    gene_probs = {}
    seen_gene_funcs = {}  # Track which gene-function combinations we've seen
    
    for line in lines:
        # Check if line contains probability
        if ',' in line:
            func_part, prob_part = line.rsplit(',', 1)
            probability = float(prob_part.strip())
        else:
            func_part = line
            probability = None  # Will be set to 1.0 later if it's the only function
        
        if '=' not in func_part:
            continue
            
        gene, equation = func_part.split('=', 1)
        gene = gene.strip()
        equation = equation.strip()
        
        # Check for duplicate gene-function pairs
        gene_func_key = (gene, equation, probability)
        if gene_func_key in seen_gene_funcs:
            continue  # Skip duplicate
        seen_gene_funcs[gene_func_key] = True
        
        if gene not in gene_funcs:
            gene_funcs[gene] = []
            gene_probs[gene] = []
            
        gene_funcs[gene].append(equation)
        gene_probs[gene].append(probability)
    
    # Validate and set probabilities
    for gene in gene_funcs:
        if len(gene_funcs[gene]) == 1:
            # If there's only one function, set probability to 1.0 if not specified
            if gene_probs[gene][0] is None:
                gene_probs[gene][0] = 1.0
        else:
            # Check if all probabilities are specified
            if None in gene_probs[gene]:
                raise ValueError(f"Gene {gene} has multiple functions but not all have probabilities specified")
            
            # Check if probabilities sum to 1
            prob_sum = sum(gene_probs[gene])
            if not np.isclose(prob_sum, 1.0, atol=1e-6):
                raise ValueError(f"Probabilities for gene {gene} sum to {prob_sum}, not 1.0")
    
    # Create a mapping of gene names to indices
    gene_dict = {gene: i for i, gene in enumerate(gene_funcs.keys())}
    ngenes = len(gene_dict)
    
    # Create array for number of functions per node
    nf = np.zeros(ngenes, dtype=int)
    for gene, funcs in gene_funcs.items():
        idx = gene_dict[gene]
        nf[idx] = len(funcs)
    
    # Create probability matrix
    max_funcs = max(nf)
    cij = np.full((ngenes, max_funcs), -1.0)
    for gene, probs in gene_probs.items():
        idx = gene_dict[gene]
        for j, prob in enumerate(probs):
            cij[idx, j] = prob
    
    # Process each function to get connectivity and truth tables
    all_equations = []
    for gene in gene_funcs:
        for func in gene_funcs[gene]:
            all_equations.append(f"{gene} = {func}")
    
    upstream_genes = get_upstream_genes(all_equations)
    
    # Create connectivity matrix and truth table
    connectivity_matrix = get_connectivity_matrix(all_equations, upstream_genes, gene_dict)
    truth_table = get_truth_table(all_equations, upstream_genes)
    
    # Set initial state
    if initial_state is None:
        print('No initial state provided, using a random initial state')
        x0 = np.random.randint(2, size=ngenes)
    else:
        x0 = np.array(initial_state)
    
    # Handle constant values (0 or 1) in equations
    for equation in all_equations:
        parts = equation.split('=')
        gene = parts[0].strip()
        value = parts[1].strip()
        
        # Check if the equation is a constant value (0 or 1)
        if value == '0' or value == '1':
            gene_index = gene_dict[gene]
            # Set the gene's initial state to the constant value
            x0[gene_index] = int(value)
            # Set the connectivity matrix row to -1 to indicate it's a constant
            connectivity_matrix[gene_index, :] = -1
    
    # Create and return the PBN
    network = ProbabilisticBN(ngenes, connectivity_matrix, nf, truth_table, cij, x0, nodeDict=gene_dict)
    network.gene_functions = gene_funcs  # Store the function strings
    network.equations = all_equations  # Store the expanded equations
    print(f"PBN loaded successfully. There are {ngenes} genes in the network.")
    return network



[docs]
def load_pbn_from_string(network_string, initial_state=None):
    """
    Given a string representing a probabilistic boolean network, this generates a ProbabilisticBN object.
    
    String format should match the file format expected by load_pbn_from_file.
    
    Parameters:
    -----------
    network_string : str
        String containing the PBN definition
    initial_state : array-like or dict, optional
        Initial values for each node. If array-like, order matches gene order in string.
        If dict, keys are gene names and values are initial states (0 or 1).
        If None, random initial values are used.
        
    Returns:
    --------
    ProbabilisticBN
        A probabilistic boolean network object
    """
    from .PBN import ProbabilisticBN
    
    # Split the string into lines
    lines = [x.strip() for x in network_string.strip().split('\n') if x.strip()]
    
    # Filter out empty lines and comment lines (starting with #)
    # For non-comment lines, remove inline comments (after #)
    processed_lines = []
    for line in lines:
        if len(line) > 0 and not line.startswith('#'):
            # Remove inline comments
            if '#' in line:
                line = line.split('#')[0].strip()
            if len(line) > 0:
                processed_lines.append(line)
    
    lines = processed_lines

    gene_funcs = {}
    gene_probs = {}
    seen_gene_funcs = {}  # Track which gene-function combinations we've seen
    
    for line in lines:
        # Skip empty lines
        if not line:
            continue
            
        # Split by the last comma to separate function and probability
        # Check if line contains probability
        if ',' in line:
            func_part, prob_part = line.rsplit(',', 1)
            probability = float(prob_part.strip())
        else:
            func_part = line
            probability = None  # Will be set to 1.0 later if it's the only function
        
        if '=' not in func_part:
            continue
            
        gene, equation = func_part.split('=', 1)
        gene = gene.strip()
        equation = equation.strip()
        
        # Check for duplicate gene-function pairs
        gene_func_key = (gene, equation, probability)
        if gene_func_key in seen_gene_funcs:
            continue  # Skip duplicate
        seen_gene_funcs[gene_func_key] = True
        
        if gene not in gene_funcs:
            gene_funcs[gene] = []
            gene_probs[gene] = []
            
        gene_funcs[gene].append(equation)
        gene_probs[gene].append(probability)

    # Validate and set probabilities
    for gene in gene_funcs:
        if len(gene_funcs[gene]) == 1:
            # If there's only one function, set probability to 1.0 if not specified
            if gene_probs[gene][0] is None:
                gene_probs[gene][0] = 1.0
        else:
            # Check if all probabilities are specified
            if None in gene_probs[gene]:
                raise ValueError(f"Gene {gene} has multiple functions but not all have probabilities specified")
            
            # Check if probabilities sum to 1
            prob_sum = sum(gene_probs[gene])
            if not np.isclose(prob_sum, 1.0, atol=1e-6):
                raise ValueError(f"Probabilities for gene {gene} sum to {prob_sum}, not 1.0")

    # Create a mapping of gene names to indices
    gene_dict = {gene: i for i, gene in enumerate(gene_funcs.keys())}
    ngenes = len(gene_dict)
    
    # Create array for number of functions per node
    nf = np.zeros(ngenes, dtype=int)
    for gene, funcs in gene_funcs.items():
        idx = gene_dict[gene]
        nf[idx] = len(funcs)
    
    # Create probability matrix
    max_funcs = max(nf)
    cij = np.full((ngenes, max_funcs), -1.0)
    for gene, probs in gene_probs.items():
        idx = gene_dict[gene]
        for j, prob in enumerate(probs):
            cij[idx, j] = prob
    
    # Process each function to get connectivity and truth tables
    all_equations = []
    for gene in gene_funcs:
        for func in gene_funcs[gene]:
            all_equations.append(f"{gene} = {func}")
    
    upstream_genes = get_upstream_genes(all_equations)
    
    # Create connectivity matrix and truth table
    connectivity_matrix = get_connectivity_matrix(all_equations, upstream_genes, gene_dict)
    truth_table = get_truth_table(all_equations, upstream_genes)
    
    # Set initial state
    if initial_state is None:
        print('No initial state provided, using a random initial state')
        x0 = np.random.randint(2, size=ngenes)
    elif isinstance(initial_state, dict):
        # Dictionary input: keys are gene names, values are initial states
        x0 = np.zeros(ngenes, dtype=int)
        for gene, value in initial_state.items():
            if gene in gene_dict:
                x0[gene_dict[gene]] = int(value)
        print(f'Initial state set from dictionary. Genes not specified default to 0.')
    else:
        x0 = np.array(initial_state)
    
    # Handle constant values (0 or 1) in equations
    for equation in all_equations:
        parts = equation.split('=')
        gene = parts[0].strip()
        value = parts[1].strip()
        
        # Check if the equation is a constant value (0 or 1)
        if value == '0' or value == '1':
            gene_index = gene_dict[gene]
            x0[gene_index] = int(value)
            connectivity_matrix[gene_index, :] = -1
    
    # Create and return the PBN
    network = ProbabilisticBN(ngenes, connectivity_matrix, nf, truth_table, cij, x0, nodeDict=gene_dict)
    network.gene_functions = gene_funcs  # Store the function strings
    network.equations = all_equations  # Store the expanded equations
    print(f"PBN loaded successfully. There are {ngenes} genes in the network.")
    return network



[docs]
def load_network(source, initial_state=None, network_type='auto'):
    """
    Unified function to load a Boolean Network or Probabilistic Boolean Network from a file or string.
    
    This function automatically detects whether the input is a file path or a network string,
    and whether it represents a Boolean Network (BN) or Probabilistic Boolean Network (PBN).
    
    Parameters:
    -----------
    source : str
        Either a file path to a network definition file, or a string containing the network definition.
        The function automatically detects which one it is.
        
    initial_state : array-like or dict, optional
        Initial values for each node. If array-like, order matches gene order in the network.
        If dict, keys are gene names and values are initial states (0 or 1).
        If None, random initial values are used.
        
    network_type : str, optional
        Type of network to load. Options are:
        - 'auto' (default): Automatically detect BN vs PBN based on presence of probabilities
        - 'bn': Force loading as Boolean Network
        - 'pbn': Force loading as Probabilistic Boolean Network
    
    Returns:
    --------
    BooleanNetwork or ProbabilisticBN
        A network object of the appropriate type
    
    Examples:
    ---------
    Load a BN from file:
    
    >>> network = load_network('my_network.txt')
    
    Load a BN from string:
    
    >>> network_str = '''
    ... A = A
    ... B = A & C
    ... C = B | A
    ... '''
    >>> network = load_network(network_str)
    
    Load a PBN from file:
    
    >>> pbn = load_network('my_pbn.txt')
    
    Load a PBN from string with initial state:
    
    >>> pbn_str = '''
    ... x1 = (x1 | x2), 0.6
    ... x1 = (!x1 & x2), 0.4
    ... x2 = x1 & x2
    ... '''
    >>> pbn = load_network(pbn_str, initial_state={'x1': 1, 'x2': 0})
    
    Notes:
    ------
    - For BN format: Each line should have format 'gene = boolean_expression'
    - For PBN format: Each line should have format 'gene = boolean_expression, probability'
    - Probabilities can be omitted if there's only one function for a gene
    - Duplicate equations for the same gene are automatically filtered (first occurrence is kept)
    - Comments (lines starting with #) and inline comments (after #) are ignored
    - Constant values (0 or 1) are supported for genes
    """
    # Detect if source is a file or string
    is_file = os.path.isfile(source)
    
    # Get raw lines for detection
    if is_file:
        with open(source, 'r') as f:
            raw_lines = f.readlines()
    else:
        raw_lines = source.strip().split('\n')
    
    # Clean and filter lines for detection
    detection_lines = []
    for line in raw_lines:
        line = line.strip()
        if len(line) > 0 and not line.startswith('#'):
            # Remove inline comments
            if '#' in line:
                line = line.split('#')[0].strip()
            if len(line) > 0 and '=' in line:
                detection_lines.append(line)
    
    # Auto-detect network type if needed
    if network_type == 'auto':
        is_pbn = False
        for line in detection_lines:
            # Check if line contains a comma after the equation (probability indicator)
            if '=' in line:
                parts = line.split('=', 1)
                if len(parts) == 2:
                    right_side = parts[1].strip()
                    # Check if there's a comma that's not inside parentheses
                    paren_count = 0
                    for i, char in enumerate(right_side):
                        if char == '(':
                            paren_count += 1
                        elif char == ')':
                            paren_count -= 1
                        elif char == ',' and paren_count == 0:
                            # Found a comma outside parentheses - this is a probability
                            is_pbn = True
                            break
            if is_pbn:
                break
        
        network_type = 'pbn' if is_pbn else 'bn'
    
    # Load the appropriate network type
    if network_type == 'bn':
        if is_file:
            return load_network_from_file(source, initial_state)
        else:
            return load_network_from_string(source, initial_state)
    elif network_type == 'pbn':
        if is_file:
            return load_pbn_from_file(source, initial_state)
        else:
            return load_pbn_from_string(source, initial_state)
    else:
        raise ValueError(f"Invalid network_type: {network_type}. Must be 'auto', 'bn', or 'pbn'")



[docs]
def rename_nodes(network, mapping, expand_complexes=False):
    """
    Rename nodes in a boolean network based on a mapping.

    Parameters:
    -----------
    network : str
        Network definition as file path or string
    mapping : str or dict
        Excel file path with 'Node' and 'NewName' columns, or dictionary with
        original names as keys and new names as values
    expand_complexes : bool, optional
        Whether to expand complex nodes into their components. Default is False.
    Returns:
    --------
    str
        Updated network string with renamed nodes and optionally expanded complexes
    """
    # Load network from file if needed
    if os.path.isfile(network):
        with open(network, 'r') as f:
            network_string = f.read()
    else:
        network_string = network

    # Process mapping input - either file or dictionary
    node_mapping = {}
    complex_nodes = {}
    single_node_mapping = {}

    if isinstance(mapping, str):
        mapping_df = pd.read_excel(mapping)
        for _, row in mapping_df.iterrows():
            original_node = row['Node'].strip()
            new_name = row['NewName']
            if pd.isnull(new_name):
                new_name = ''
            else:
                new_name = str(new_name).strip()
            if not original_node or not new_name:
                continue
            
            if ',' in new_name:
                if expand_complexes:
                    # Parse components for expansion
                    components = [comp.strip() for comp in new_name.split(',') if comp.strip()]
                    if components:
                        node_mapping[original_node] = components
                        complex_nodes[original_node] = components
                # If expand_complexes=False, skip complex nodes (keep original name)
            else:
                node_mapping[original_node] = [new_name]
                single_node_mapping[original_node] = new_name
    elif isinstance(mapping, dict):
        for original_node, new_name in mapping.items():
            original_node = str(original_node).strip()
            if not original_node:
                continue
            if isinstance(new_name, list):
                if expand_complexes:
                    components = [str(comp).strip() for comp in new_name if str(comp).strip()]
                    if components:
                        node_mapping[original_node] = components
                        complex_nodes[original_node] = components
                # If expand_complexes=False, skip complex nodes (keep original name)
            else:
                new_name = str(new_name).strip()
                if not new_name:
                    continue
                if ',' in new_name:
                    if expand_complexes:
                        components = [comp.strip() for comp in new_name.split(',') if comp.strip()]
                        if components:
                            node_mapping[original_node] = components
                            complex_nodes[original_node] = components
                    # If expand_complexes=False, skip complex nodes (keep original name)
                else:
                    node_mapping[original_node] = [new_name]
                    single_node_mapping[original_node] = new_name
    else:
        raise TypeError("mapping must be either a file path or a dictionary")

    # Parse original network string into equations, ignore blank lines
    original_equations = [line.strip() for line in network_string.strip().split('\n') if line.strip()]
    equation_dict = {}

    for eq in original_equations:
        if '=' in eq:
            left, right = eq.split('=', 1)
            node = left.strip()
            rule = right.strip()
            if not node or not rule:
                continue
            equation_dict[node] = rule

    new_equations = []
    
    if expand_complexes:
        # Track which component equations have been created to avoid duplicates
        processed_components = set()
        
        # 1 Process complex expansions only
        for original_node, rule in equation_dict.items():
            if original_node in node_mapping:
                # This is a complex node that should be expanded
                new_rule = rule
                # Replace complexes with their expanded forms
                for complex_node, components in complex_nodes.items():
                    if not complex_node or not components:
                        continue
                    pattern = r'\b' + re.escape(complex_node) + r'\b'
                    replacement = '(' + ' & '.join(components) + ')'
                    new_rule = re.sub(pattern, replacement, new_rule)
                
                # Replace single nodes
                for old_node, new_node in single_node_mapping.items():
                    if not old_node or not new_node:
                        continue
                    pattern = r'\b' + re.escape(old_node) + r'\b'
                    new_rule = re.sub(pattern, new_node, new_rule)
                
                # Create equations for new components
                new_nodes = node_mapping[original_node]
                for new_node in new_nodes:
                    if not new_node:
                        continue
                    # Only create equation if this component hasn't been processed yet
                    # and doesn't already exist as an original node
                    if new_node not in processed_components and new_node not in equation_dict:
                        new_equations.append(f"{new_node} = {new_rule}")
                        processed_components.add(new_node)
        
        # 2 Process remaining nodes (including renamed original nodes)
        for original_node, rule in equation_dict.items():
            if original_node not in node_mapping:
                # This is not a complex node, process normally
                new_rule = rule
                # Replace complexes with their expanded forms
                for complex_node, components in complex_nodes.items():
                    if not complex_node or not components:
                        continue
                    pattern = r'\b' + re.escape(complex_node) + r'\b'
                    replacement = '(' + ' & '.join(components) + ')'
                    new_rule = re.sub(pattern, replacement, new_rule)
                
                # Replace single nodes
                for old_node, new_node in single_node_mapping.items():
                    if not old_node or not new_node:
                        continue
                    pattern = r'\b' + re.escape(old_node) + r'\b'
                    new_rule = re.sub(pattern, new_node, new_rule)
                
                # Handle left side
                if original_node in single_node_mapping:
                    renamed_node = single_node_mapping[original_node]
                    new_equations.append(f"{renamed_node} = {new_rule}")
                else:
                    new_equations.append(f"{original_node} = {new_rule}")
    else:
        # Simple rename - no complex expansion
        for original_node, rule in equation_dict.items():
            new_rule = rule
            # Only replace single nodes (no complex expansion)
            for old_node, new_node in single_node_mapping.items():
                if not old_node or not new_node:
                    continue
                pattern = r'\b' + re.escape(old_node) + r'\b'
                new_rule = re.sub(pattern, new_node, new_rule)
            
            # Handle left side
            if original_node in single_node_mapping:
                renamed_node = single_node_mapping[original_node]
                new_equations.append(f"{renamed_node} = {new_rule}")
            else:
                new_equations.append(f"{original_node} = {new_rule}")
    
    # reorder the equations alphabetically
    new_equations.sort(key=lambda x: x.split('=')[0])

    return '\n'.join(new_equations)