Merged
41 changes: 27 additions & 14 deletions bioneuralnet/clustering/correlated_louvain.py
@@ -74,16 +74,23 @@ def __init__(
self.tune = tune

self.logger.info(
f"CorrelatedLouvain(k3={self.K3}, k4={self.K4}, "
f"nodes={self.G.number_of_nodes()}, edges={self.G.number_of_edges()}, "
f"features={self.B.shape[1] if self.B is not None else 0})"
)

self.logger.debug(
f"Initialized CorrelatedLouvain with k3 = {self.K3}, k4 = {self.K4}, "
)
if self.B is not None:
self.logger.info(f"Original omics data shape: {self.B.shape}")
self.logger.debug(f"Original omics data shape: {self.B.shape}")

self.logger.info(f"Original graph has {self.G.number_of_nodes()} nodes.")
self.logger.debug(f"Original graph has {self.G.number_of_nodes()} nodes.")

if self.B is not None:
self.logger.info(f"Final omics data shape: {self.B.shape}")
self.logger.info(
self.logger.debug(f"Final omics data shape: {self.B.shape}")

self.logger.debug(
f"Graph has {self.G.number_of_nodes()} nodes and {self.G.number_of_edges()} edges."
)

@@ -95,7 +102,7 @@ def __init__(

self.clusters: dict[Any, Any] = {}
self.device = torch.device("cuda" if gpu and torch.cuda.is_available() else "cpu")
self.logger.info(f"Initialized Correlated Louvain. device={self.device}")
self.logger.debug(f"Initialized Correlated Louvain. device={self.device}")

def _compute_community_cohesion(self, nodes) -> float:
"""Compute average absolute pairwise correlation of omics features within a community.
@@ -131,28 +138,28 @@ def _compute_community_correlation(self, nodes) -> tuple:
Drops columns that are completely zero.
"""
try:
self.logger.info(
self.logger.debug(
f"Computing community correlation for {len(nodes)} nodes..."
)
node_cols = [str(n) for n in nodes if str(n) in self.B.columns]
if not node_cols:
self.logger.info(
self.logger.debug(
"No valid columns found for these nodes; returning (0.0, 1.0)."
)
return 0.0, 1.0
B_sub = self.B.loc[:, node_cols]
zero_mask = (B_sub == 0).all(axis=0)
num_zero_columns = int(zero_mask.sum())
if num_zero_columns > 0:
self.logger.info(
self.logger.debug(
f"WARNING: {num_zero_columns} columns are all zeros in community subset."
)
B_sub = B_sub.loc[:, ~zero_mask]
if B_sub.shape[1] == 0:
self.logger.info("All columns dropped; returning (0.0, 1.0).")
self.logger.debug("All columns dropped; returning (0.0, 1.0).")
return 0.0, 1.0

self.logger.info(
self.logger.debug(
f"B_sub shape: {B_sub.shape}, first few columns: {node_cols[:5]}"
)
scaler = StandardScaler()
@@ -167,7 +174,7 @@ def _compute_community_correlation(self, nodes) -> tuple:
corr, pvalue = pearsonr(pc1, target)
return corr, pvalue
except Exception as e:
self.logger.info(f"Error in _compute_community_correlation: {e}")
self.logger.error(f"Error in _compute_community_correlation: {e}")
raise
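For orientation, the core of _compute_community_correlation can be sketched standalone: standardize the community's omics columns, summarize them (the name pc1 in the diff suggests the first principal component), and correlate that summary with the phenotype. Everything below is an illustrative stand-in, not the PR's code:

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def community_correlation(B_sub: pd.DataFrame, target: np.ndarray) -> tuple:
    # Z-score each feature, take the first principal component as the
    # community summary, then correlate it with the phenotype vector.
    X = StandardScaler().fit_transform(B_sub)
    pc1 = PCA(n_components=1).fit_transform(X).ravel()
    return pearsonr(pc1, target)

rng = np.random.default_rng(0)
B_sub = pd.DataFrame(rng.normal(size=(20, 5)), columns=list("abcde"))
target = rng.normal(size=20)
corr, pvalue = community_correlation(B_sub, target)
print(f"corr={corr:.3f}, p={pvalue:.3f}")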

def _quality_correlated(self, partition) -> float:
@@ -180,7 +187,7 @@ def _quality_correlated(self, partition) -> float:

# Unsupervised mode: Y is None
if self.Y is None:
self.logger.info("Phenotype data not provided; using unsupervised cohesion.")
self.logger.debug("Phenotype data not provided; using unsupervised cohesion.")

if self.B is None:
return Q
@@ -195,14 +202,14 @@ def _quality_correlated(self, partition) -> float:

avg_cohesion = np.mean(community_cohesions) if community_cohesions else 0.0
quality = self.K3 * Q + self.K4 * avg_cohesion
self.logger.info(
self.logger.debug(
f"Computed quality (unsupervised): Q = {Q:.4f}, avg_cohesion = {avg_cohesion:.4f}, combined = {quality:.4f}"
)
return quality

# Supervised mode: Y is provided
if self.B is None:
self.logger.info(
self.logger.debug(
"Omics data not provided; returning standard modularity."
)
return Q
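Numerically, the unsupervised branch blends modularity with cohesion as quality = k3 * Q + k4 * avg_cohesion. A minimal sketch (k3 = 0.2 mirrors HybridLouvain's default further down; k4 and the inputs here are invented for illustration):

import numpy as np

def combined_quality(Q: float, community_cohesions: list, k3: float = 0.2, k4: float = 0.8) -> float:
    # Mirrors the unsupervised branch: modularity weighted by k3 plus
    # mean within-community cohesion weighted by k4.
    avg_cohesion = np.mean(community_cohesions) if community_cohesions else 0.0
    return k3 * Q + k4 * avg_cohesion

print(combined_quality(0.42, [0.6, 0.7, 0.5]))  # 0.2*0.42 + 0.8*0.6 ≈ 0.564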
@@ -266,6 +273,12 @@ def run(self, as_dfs: bool = False) -> Union[dict, list]:
self.logger.info(f"Final quality: {quality:.4f}")
self.partition = partition

n_clusters = len(set(partition.values()))
self.logger.info(
f"CorrelatedLouvain found {n_clusters} communities "
f"(nodes={self.G.number_of_nodes()})"
)

if as_dfs:
self.logger.info("Raw partition output:", self.partition)
clusters_dfs = self.partition_to_adjacency(self.partition)
23 changes: 19 additions & 4 deletions bioneuralnet/clustering/hybrid_louvain.py
@@ -13,18 +13,18 @@ class HybridLouvain:

Attributes:

G (nx.Graph): NetworkX graph object.
G (Union[nx.Graph, pd.DataFrame]): Input graph as a NetworkX Graph or adjacency DataFrame.
B (pd.DataFrame): Omics data.
Y (pd.DataFrame): Phenotype data.
k3 (float): Weight for Correlated Louvain.
k4 (float): Weight for Correlated Louvain.
max_iter (int): Maximum number of iterations.
weight (str): Edge weight parameter name.
tune (bool): Flag to enable tuning of parameters
tune (bool): Flag to enable tuning of parameters
Copilot AI Nov 28, 2025

Trailing whitespace should be removed for consistency.

Suggested change
tune (bool): Flag to enable tuning of parameters
tune (bool): Flag to enable tuning of parameters

"""
def __init__(
self,
G: nx.Graph,
G: Union[nx.Graph, pd.DataFrame],
B: pd.DataFrame,
Y: pd.DataFrame,
k3: float = 0.2,
@@ -43,6 +43,13 @@ def __init__(
set_seed(seed)
self.logger.info("Initializing HybridLouvain...")

if isinstance(G, pd.DataFrame):
self.logger.info("Input G is a DataFrame; converting adjacency matrix to NetworkX graph.")
G = nx.from_pandas_adjacency(G)

if not isinstance(G, nx.Graph):
raise TypeError("G must be a networkx.Graph or a pandas DataFrame adjacency matrix.")

self.G = G
graph_nodes = set(map(str, G.nodes()))
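Since G may now arrive as a DataFrame, the conversion goes through nx.from_pandas_adjacency, which reads node names from the axes and stores matrix entries as 'weight' edge attributes. A toy sketch with invented labels:

import networkx as nx
import pandas as pd

labels = ["gene_a", "gene_b", "gene_c"]
adj = pd.DataFrame(
    [[0.0, 0.8, 0.0],
     [0.8, 0.0, 0.5],
     [0.0, 0.5, 0.0]],
    index=labels, columns=labels,
)
G = nx.from_pandas_adjacency(adj)  # symmetric matrix -> undirected weighted graph
print(G.number_of_nodes(), G.number_of_edges())  # 3 2
print(G["gene_a"]["gene_b"]["weight"])           # 0.8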

@@ -233,7 +240,15 @@ def run(self, as_dfs: bool = False) -> Union[dict, list]:
refined_nodes = pagerank_results.get("cluster_nodes", [])
new_size = len(refined_nodes)
all_clusters[iteration] = refined_nodes
self.logger.info(f"Refined subgraph size: {new_size}")

cond = pagerank_results.get("conductance", None)
corr = pagerank_results.get("correlation", None)
score = pagerank_results.get("composite_score", None)

self.logger.info(
f"Iteration {iteration+1}: cluster size={new_size}, "
f"Conductance={cond:.3f} Correlation={corr:.3f} score={score:.3f}"
Copilot AI Nov 28, 2025

If any of the values (conductance, correlation, or composite_score) are None, the string formatting with .3f will raise a TypeError. Add a check to handle None values gracefully, e.g., by using a default value or conditional formatting like f"{cond:.3f}" if cond is not None else "N/A".

Suggested change
f"Conductance={cond:.3f} Correlation={corr:.3f} score={score:.3f}"
f"Conductance={cond:.3f}" if cond is not None else "Conductance=N/A" + " "
f"Correlation={corr:.3f}" if corr is not None else "Correlation=N/A" + " "
f"score={score:.3f}" if score is not None else "score=N/A"

)

if new_size == prev_size or new_size <= 1:
self.logger.info(
2 changes: 1 addition & 1 deletion bioneuralnet/datasets/lgg/target.csv
@@ -1,4 +1,4 @@
patient,vital_status
patient,target
Copilot AI Nov 28, 2025

Changing the column name from vital_status to target is a breaking change that will affect any code referencing this specific column name. Ensure that all references to vital_status in the codebase have been updated to target, or provide backward compatibility by supporting both column names.

TCGA-CS-4938,0
TCGA-CS-4941,1
TCGA-CS-4942,1
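Downstream code that still reads vital_status can bridge the rename with a small shim; a sketch (the rename mapping is the only assumption):

import pandas as pd

df = pd.read_csv("bioneuralnet/datasets/lgg/target.csv")
# Accept both the pre-rename and post-rename header:
if "vital_status" in df.columns and "target" not in df.columns:
    df = df.rename(columns={"vital_status": "target"})
print(df.columns.tolist())  # ['patient', 'target']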
84 changes: 65 additions & 19 deletions bioneuralnet/downstream_task/dpmon.py
@@ -25,8 +25,10 @@
from ray import tune
from ray.tune import Checkpoint
from ray.tune import CLIReporter
from ray.tune.error import TuneError
from ray.tune.stopper import TrialPlateauStopper
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.basic_variant import BasicVariantGenerator
from sklearn.model_selection import train_test_split,StratifiedKFold,RepeatedStratifiedKFold
from sklearn.preprocessing import label_binarize
from scipy.stats import pointbiserialr
@@ -69,6 +71,7 @@ class DPMON:
cv (bool): If True, use K-fold cross-validation; otherwise use repeated train/test splits.
cuda (int): CUDA device index to use when gpu=True.
seed (int): Random seed for reproducibility.
seed_trials (bool): If True, use a fixed seed for hyperparameter sampling to ensure reproducibility across trials.
output_dir (Path): Directory where logs, checkpoints, and results are written.
"""
def __init__(
Expand Down Expand Up @@ -97,6 +100,7 @@ def __init__(
cv: bool = False,
cuda: int = 0,
seed: int = 1804,
seed_trials: bool = False,
output_dir: Optional[str] = None,
):
if adjacency_matrix.empty:
@@ -153,6 +157,7 @@ def __init__(
self.gpu = gpu
self.cuda = cuda
self.seed = seed
self.seed_trials = seed_trials
self.cv = cv

if output_dir is None:
@@ -199,6 +204,7 @@ def run(self) -> Tuple[pd.DataFrame, object, torch.Tensor | None]:
"cuda": self.cuda,
"tune": self.tune,
"seed": self.seed,
"seed_trials": self.seed_trials,
"cv": self.cv,
}

@@ -305,10 +311,8 @@ def prepare_node_features(adjacency_matrix: pd.DataFrame, omics_datasets: List[p
omics_data = omics_datasets[0]

if phenotype_col in omics_data.columns:
pheno = omics_data[phenotype_col]
omics_feature_df = omics_data.drop(columns=[phenotype_col])
else:
pheno = None
omics_feature_df = omics_data

nodes = sorted(network_features.intersection(omics_feature_df.columns))
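The intersection line above keeps only features present in both the network and the omics table; a toy illustration with invented names:

import pandas as pd

network_features = {"gene_a", "gene_b", "gene_x"}
omics_feature_df = pd.DataFrame({"gene_a": [1.0], "gene_b": [2.0], "gene_c": [3.0]})

nodes = sorted(network_features.intersection(omics_feature_df.columns))
print(nodes)  # ['gene_a', 'gene_b'] -- 'gene_x' has no omics column, 'gene_c' no node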
@@ -529,7 +533,6 @@ def run_standard_training(dpmon_params, adjacency_matrix, combined_omics, clinic
best_global_model_state = None
best_global_embeddings = None

cv_predictions_list = []
fold_accuracies = []
fold_f1_macros = []
fold_f1_weighteds = []
@@ -642,7 +645,6 @@ def run_standard_training(dpmon_params, adjacency_matrix, combined_omics, clinic

try:
n_classes = probs_np.shape[1]
unique_classes = np.unique(y_test_np)

# binary
if n_classes == 2:
@@ -778,7 +780,7 @@ def run_hyperparameter_tuning(X_train, y_train, adjacency_matrix, clinical_data,
"nn_hidden_dim2": tune.choice([32, 64, 128]),
"ae_encoding_dim": tune.choice([4, 8, 16]),
"num_epochs": tune.choice([512, 1024, 2048]),
Copilot AI Nov 28, 2025

[nitpick] The dropout range has been changed from [0.0, 0.1, 0.2, 0.3, 0.4, 0.5] to [0.2, 0.3, 0.4, 0.5, 0.6], removing lower dropout values (0.0, 0.1) and adding a higher one (0.6). This suggests a deliberate shift toward more aggressive regularization. Consider documenting this change in comments or the commit message to explain why the lower dropout values were removed, as this could impact model performance.

Suggested change
"num_epochs": tune.choice([512, 1024, 2048]),
"num_epochs": tune.choice([512, 1024, 2048]),
# Dropout range updated: removed lower values (0.0, 0.1) and added higher (0.6) to encourage more aggressive regularization.
# This change may impact model performance; see commit message for rationale.

"gnn_dropout": tune.choice([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]),
"gnn_dropout": tune.choice([0.2, 0.3, 0.4, 0.5, 0.6]),
"gnn_activation": tune.choice(["relu", "elu"]),
"dim_reduction": tune.choice(["ae","linear", "mlp"]),
}
@@ -798,7 +800,7 @@ def run_hyperparameter_tuning(X_train, y_train, adjacency_matrix, clinical_data,
grace_period=30,
reduction_factor=2
)
gpu_resources = 1 if dpmon_params["gpu"] else 0

best_configs = []

omics_data = omics_dataset[0]
@@ -881,19 +883,63 @@ def tune_train_n(config):
def short_dirname_creator(trial):
return f"T{trial.trial_id}"

result = tune.run(
tune_train_n,
resources_per_trial={"cpu": 1, "gpu": 0.06} , #1 and 0.05
config=pipeline_configs,
num_samples=40, #50
verbose=0,
scheduler=scheduler,
stop=stopper,
name="tune_dp",
progress_reporter=reporter,
trial_dirname_creator=short_dirname_creator,
checkpoint_score_attr="min-val_loss",
)
cpu_per_trial = 2
use_gpu = bool(dpmon_params.get("gpu", False)) and torch.cuda.is_available()
if dpmon_params.get("gpu", False) and not torch.cuda.is_available():
logger.warning("gpu=True but CUDA is not available; Ray Tune will run on CPU only (gpu_per_trial=0.0).")

gpu_per_trial = 0.05 if use_gpu else 0.0

num_samples = 50
max_retries = 5

seed_trials = dpmon_params.get("seed_trials", False)

if seed_trials:
logger.debug(f"seed_trials=True: Using FIXED seed {dpmon_params['seed']} for hyperparameter sampling.")
else:
logger.debug("seed_trials=False: Using RANDOM hyperparameter sampling.")

for attempt in range(max_retries):
try:
if seed_trials:
search_alg = BasicVariantGenerator(random_state=np.random.RandomState(dpmon_params["seed"]))
else:
search_alg = None

result = tune.run(
tune_train_n,
search_alg=search_alg,
resources_per_trial={"cpu": cpu_per_trial, "gpu": gpu_per_trial},
config=pipeline_configs,
num_samples=num_samples,
verbose=0,
scheduler=scheduler,
stop=stopper,
name="tune_dp",
progress_reporter=reporter,
trial_dirname_creator=short_dirname_creator,
checkpoint_score_attr="min-val_loss",
)
break
except TuneError as e:
msg = str(e)
if "Trials did not complete" not in msg and "OutOfMemoryError" not in msg:
raise

new_num_samples = max(1, num_samples // 2)
if new_num_samples == num_samples:
raise

Comment on lines +930 to +933
Copilot AI Nov 28, 2025

The condition if new_num_samples == num_samples will only be true when num_samples == 1 (since max(1, 1 // 2) == 1). However, this check happens after the assignment, so if the initial num_samples was already 1, the code will raise an exception. Consider checking if num_samples <= 1 before attempting to halve it, to provide a clearer error path.

Suggested change
new_num_samples = max(1, num_samples // 2)
if new_num_samples == num_samples:
raise
if num_samples <= 1:
raise
new_num_samples = max(1, num_samples // 2)

logger.warning(
f"Ray Tune failed with a likely resource / OOM error (attempt {attempt + 1}). "
f"Reducing num_samples from {num_samples} to {new_num_samples} "
f"(cpu_per_trial={cpu_per_trial}, gpu_per_trial={gpu_per_trial})."
)
num_samples = new_num_samples

Copilot AI Nov 28, 2025

The else clause after the for loop is used here, which executes only if the loop completes without breaking. This is correct Python syntax but can be confusing. Consider adding a comment to clarify that this else clause triggers only if all retry attempts are exhausted without a successful break.

Suggested change
# The else clause below executes only if the for loop completes without a break,
# i.e., if all retry attempts are exhausted without a successful run.

else:
raise RuntimeError("Hyperparameter tuning failed after reducing resources several times.")
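For readers unfamiliar with the idiom Copilot flags: a for loop's else clause runs only when the loop finishes without break. A self-contained sketch of the retry pattern, with flaky_tune_run standing in for tune.run:

import random

def flaky_tune_run():
    # Stand-in for tune.run: fails at random, like a transient OOM.
    if random.random() < 0.5:
        raise RuntimeError("Trials did not complete")
    return "result"

for attempt in range(5):
    try:
        result = flaky_tune_run()
        break  # success: the else clause below is skipped
    except RuntimeError:
        continue  # retry on the next attempt
else:
    # Reached only when every attempt failed (the loop never hit break).
    raise RuntimeError("tuning failed after all retries")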

best_trial = result.get_best_trial("val_loss", "min", "last")
logger.debug("Best trial config: {}".format(best_trial.config))
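To isolate what seed_trials changes: with a fixed random_state, BasicVariantGenerator draws identical configurations on every run. A minimal sketch against the classic Tune API (the trainable is a placeholder objective, not DPMON's):

import numpy as np
from ray import tune
from ray.tune.search.basic_variant import BasicVariantGenerator

def trainable(config):
    # Placeholder: report the sampled dropout as if it were a loss.
    tune.report(val_loss=config["gnn_dropout"])

search_alg = BasicVariantGenerator(random_state=np.random.RandomState(1804))
tune.run(
    trainable,
    config={"gnn_dropout": tune.choice([0.2, 0.3, 0.4, 0.5, 0.6])},
    search_alg=search_alg,
    num_samples=4,
    verbose=0,
)
# Re-running with the same random_state samples the same four configs.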