diff --git a/.gitignore b/.gitignore
index 3bcb9ef..455aa94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -122,3 +122,10 @@ lib/
 .DS_Store
 cancer_output_1/GlobalNetwork.csv
 Parkinson
+KIPAN
+dpmon_cv_results
+dpmon_cv_results_GCN_FINAL
+dpmon_cv_results_SAGE_FINAL
+dpmon_tuning
+GBMLGG
+PAAN
diff --git a/README.md b/README.md
index 6219e85..41530c1 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,7 @@
 [![GitHub Contributors](https://img.shields.io/github/contributors/UCD-BDLab/BioNeuralNet)](https://github.com/UCD-BDLab/BioNeuralNet/graphs/contributors)
 [![Downloads](https://static.pepy.tech/badge/bioneuralnet)](https://pepy.tech/project/bioneuralnet)
 [![Documentation](https://img.shields.io/badge/docs-read%20the%20docs-blue.svg)](https://bioneuralnet.readthedocs.io/en/latest/)
-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17503084.svg)](https://doi.org/10.5281/zenodo.17503084)
-
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17503083.svg)](https://doi.org/10.5281/zenodo.17503083)

 ## Welcome to BioNeuralNet 1.1.3
diff --git a/bioneuralnet/datasets/dataset_loader.py b/bioneuralnet/datasets/dataset_loader.py
index f5e99c7..72c4833 100644
--- a/bioneuralnet/datasets/dataset_loader.py
+++ b/bioneuralnet/datasets/dataset_loader.py
@@ -5,7 +5,7 @@ class DatasetLoader:
     def __init__(self, dataset_name: str):
         """
         Args:
-            dataset_name (str): "example1", "monet", or "tcga_brca".
+            dataset_name (str): "example1", "monet", "brca", "gbm", or "kipan".

         returns:
@@ -14,7 +14,7 @@ def __init__(self, dataset_name: str):
         Example:

-            tcga_brca = DatasetLoader("tcga_brca")
+            tcga_brca = DatasetLoader("brca")
             tcga_brca.shape
             # {'brca_mirna': (108, 1000), 'brca_pam50': (108, 50), ...}

             mirna = tcga_brca.data["brca_mirna"]
@@ -58,14 +58,20 @@ def _load_data(self):
             self.data["clinical"] = pd.read_csv(folder / "clinical.csv", index_col=0)
             self.data["rna"] = pd.read_csv(folder / "rna.csv", index_col=0)
             self.data["meth"] = pd.read_csv(folder / "meth.csv", index_col=0)
-            #meth_part1 = pd.read_csv(folder / "meth_1.csv", index_col=0)
-            #meth_part2= pd.read_csv(folder / "meth_2.csv", index_col=0)
-            #rna_part1 = pd.read_csv(folder / "rna_1.csv", index_col=0)
-            #rna_part2 = pd.read_csv(folder / "rna_2.csv", index_col=0)
+            # elif self.dataset_name == "gbm":
+            #     self.data["mirna"] = pd.read_csv(folder / "mirna.csv", index_col=0)
+            #     self.data["target"] = pd.read_csv(folder / "target.csv", index_col=0)
+            #     self.data["clinical"] = pd.read_csv(folder / "clinical.csv", index_col=0)
+            #     self.data["rna"] = pd.read_csv(folder / "rna.csv", index_col=0)
+            #     self.data["meth"] = pd.read_csv(folder / "meth.csv", index_col=0)

-            #self.data["meth"] = pd.concat([meth_part1, meth_part2], axis=0)
-            #self.data["rna"] = pd.concat([rna_part1, rna_part2], axis=0)
+            # elif self.dataset_name == "kipan":
+            #     self.data["mirna"] = pd.read_csv(folder / "mirna.csv", index_col=0)
+            #     self.data["target"] = pd.read_csv(folder / "target.csv", index_col=0)
+            #     self.data["clinical"] = pd.read_csv(folder / "clinical.csv", index_col=0)
+            #     self.data["rna"] = pd.read_csv(folder / "rna.csv", index_col=0)
+            #     self.data["meth"] = pd.read_csv(folder / "meth.csv", index_col=0)

         else:
             raise ValueError(f"Dataset '{self.dataset_name}' is not recognized.")
diff --git a/bioneuralnet/downstream_task/dpmon.py b/bioneuralnet/downstream_task/dpmon.py
index 035536a..fe03f9f 100644
--- a/bioneuralnet/downstream_task/dpmon.py
+++ b/bioneuralnet/downstream_task/dpmon.py
@@ -577,7 +577,7 @@ def __init__(
             self.gnn = SAGE(
                 input_dim=gnn_input_dim,
                 hidden_dim=gnn_hidden_dim,
-                output_dim=gnn_hidden_dim,
+                #output_dim=gnn_hidden_dim,
                 layer_num=gnn_layer_num,
                 final_layer="none",
             )
diff --git a/bioneuralnet/utils/__init__.py b/bioneuralnet/utils/__init__.py
index f558e82..19d44a8 100644
--- a/bioneuralnet/utils/__init__.py
+++ b/bioneuralnet/utils/__init__.py
@@ -1,7 +1,7 @@
 from .logger import get_logger
 from .rdata_convert import rdata_to_df
-from .data import variance_summary, zero_fraction_summary, expression_summary, correlation_summary, explore_data_stats
+from .data import variance_summary, zero_fraction_summary, expression_summary, correlation_summary, explore_data_stats, impute_omics, impute_omics_knn, set_seed, normalize_omics, beta_to_m
 from .preprocess import preprocess_clinical, clean_inf_nan, select_top_k_variance, select_top_k_correlation, select_top_randomforest, top_anova_f_features, prune_network, prune_network_by_quantile, network_remove_low_variance, network_remove_high_zero_fraction
 from .graph import gen_similarity_graph, gen_correlation_graph, gen_threshold_graph, gen_gaussian_knn_graph, gen_lasso_graph, gen_mst_graph, gen_snn_graph

-__all__ = ["get_logger", "rdata_to_df", "variance_summary", "zero_fraction_summary", "expression_summary", "correlation_summary", "explore_data_stats", "preprocess_clinical", "clean_inf_nan", "select_top_k_variance", "select_top_k_correlation", "select_top_randomforest", "top_anova_f_features", "prune_network", "prune_network_by_quantile", "network_remove_low_variance", "network_remove_high_zero_fraction", "gen_similarity_graph", "gen_correlation_graph", "gen_threshold_graph", "gen_gaussian_knn_graph", "gen_lasso_graph", "gen_mst_graph", "gen_snn_graph"]
+__all__ = ["get_logger", "rdata_to_df", "variance_summary", "zero_fraction_summary", "expression_summary", "correlation_summary", "explore_data_stats", "impute_omics", "impute_omics_knn", "set_seed", "normalize_omics", "beta_to_m", "preprocess_clinical", "clean_inf_nan", "select_top_k_variance", "select_top_k_correlation", "select_top_randomforest", "top_anova_f_features", "prune_network", "prune_network_by_quantile", "network_remove_low_variance", "network_remove_high_zero_fraction", "gen_similarity_graph", "gen_correlation_graph", "gen_threshold_graph", "gen_gaussian_knn_graph", "gen_lasso_graph", "gen_mst_graph", "gen_snn_graph"]
diff --git a/bioneuralnet/utils/data.py b/bioneuralnet/utils/data.py
index 406a8fd..2d44086 100644
--- a/bioneuralnet/utils/data.py
+++ b/bioneuralnet/utils/data.py
@@ -1,13 +1,29 @@
+import os
+import random
+import torch
 import pandas as pd
 import numpy as np
 from typing import Optional
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.impute import KNNImputer
 from .logger import get_logger

 logger = get_logger(__name__)

 def variance_summary(df: pd.DataFrame, low_var_threshold: Optional[float] = None) -> dict:
     """
-    Compute summary statistics for column variances in the DataFrame
+    Computes key summary statistics for the feature (column) variances within an omics DataFrame.
+
+    This is useful for assessing feature distribution and identifying low-variance features prior to modeling.
+
+    Args:
+
+        df (pd.DataFrame): The input omics DataFrame (samples as rows, features as columns).
+        low_var_threshold (Optional[float]): A threshold used to count features falling below this variance level.
+
+    Returns:
+
+        dict: A dictionary containing the mean, median, min, max, and standard deviation of the column variances. If a threshold is provided, it also includes 'num_low_variance_features'.
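+
+    Example (illustrative sketch; assumes `df` is a numeric omics DataFrame, and only the
+    'num_low_variance_features' key is guaranteed by the description above):
+
+        stats = variance_summary(df, low_var_threshold=1e-4)
+        print(stats["num_low_variance_features"])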
""" variances = df.var() @@ -25,7 +41,18 @@ def variance_summary(df: pd.DataFrame, low_var_threshold: Optional[float] = None def zero_fraction_summary(df: pd.DataFrame, high_zero_threshold: Optional[float] = None) -> dict: """ - Compute summary statistics for the fraction of zeros in each column + Computes statistics on the fraction of zero values present in each feature (column). + + This helps identify feature sparsity, which is common in omics data (e.g., RNA-seq FPKM). + + Args: + + df (pd.DataFrame): The input omics DataFrame. + high_zero_threshold (Optional[float]): A threshold used to count features whose zero-fraction exceeds this value. + + Returns: + + dict: A dictionary containing the mean, median, min, max, and standard deviation of the zero fractions across all columns. If a threshold is provided, it includes 'num_high_zero_features'. """ zero_fraction = (df == 0).sum(axis=0) / df.shape[0] @@ -43,7 +70,17 @@ def zero_fraction_summary(df: pd.DataFrame, high_zero_threshold: Optional[float] def expression_summary(df: pd.DataFrame) -> dict: """ - Compute summary statistics for the mean expression of features + Computes summary statistics for the mean expression (average value) of all features. + + Provides insight into the overall magnitude and central tendency of the data values. + + Args: + + df (pd.DataFrame): The input omics DataFrame. + + Returns: + + dict: A dictionary containing the mean, median, min, max, and standard deviation of the feature means. """ mean_expression = df.mean() @@ -60,7 +97,17 @@ def expression_summary(df: pd.DataFrame) -> dict: def correlation_summary(df: pd.DataFrame) -> dict: """ - Compute summary statistics of the maximum pairwise correlation + Computes summary statistics on the maximum pairwise (absolute) correlation observed for each feature in the DataFrame. + + This helps identify features that are highly redundant or collinear. + + Args: + + df (pd.DataFrame): The input omics DataFrame. + + Returns: + + dict: A dictionary containing the mean, median, min, max, and standard deviation of the maximum absolute correlation values. """ corr_matrix = df.corr().abs() np.fill_diagonal(corr_matrix.values, 0) @@ -77,21 +124,201 @@ def correlation_summary(df: pd.DataFrame) -> dict: def explore_data_stats(omics_df: pd.DataFrame, name: str = "Data") -> None: """ - Print key statistics for an omics DataFrame including variance, zero fraction, + Prints a comprehensive set of key statistics for an omics DataFrame. + + Combines variance, zero fraction, expression, and correlation summaries for rapid data quality assessment. + + Args: + + omics_df (pd.DataFrame): The input omics DataFrame. + name (str): A descriptive name for the dataset (e.g., 'X_rna_final') for clear output labeling. + + Returns: + + None: Prints the statistics directly to the console. 
""" - print(f"Statistics for {name}:") + + logger.info(f"Statistics for {name}:") var_stats = variance_summary(omics_df, low_var_threshold=1e-4) - print(f"Variance Summary: {var_stats}") + logger.info(f"Variance Summary: {var_stats}") zero_stats = zero_fraction_summary(omics_df, high_zero_threshold=0.50) - print(f"Zero Fraction Summary: {zero_stats}") + logger.info(f"Zero Fraction Summary: {zero_stats}") expr_stats = expression_summary(omics_df) - print(f"Expression Summary: {expr_stats}") + logger.info(f"Expression Summary: {expr_stats}") try: corr_stats = correlation_summary(omics_df) - print(f"Correlation Summary: {corr_stats}") + logger.info(f"Correlation Summary: {corr_stats}") + except Exception as e: - print(f"Correlation Summary: Could not compute due to: {e}") - print("\n") + logger.info(f"Correlation Summary: Could not compute due to: {e}") + + logger.info("\n") + +def impute_omics(omics_df: pd.DataFrame, method: str = "mean") -> pd.DataFrame: + """ + Imputes missing values (NaNs) in the omics DataFrame using a specified strategy. + + Args: + + omics_df (pd.DataFrame): The input DataFrame containing missing values. + method (str): The imputation strategy to use. Must be 'mean', 'median', or 'zero'. + + Returns: + + pd.DataFrame: The DataFrame with missing values filled. + + Raises: + + ValueError: If the specified imputation method is not recognized. + """ + if method == "mean": + return omics_df.fillna(omics_df.mean()) + elif method == "median": + return omics_df.fillna(omics_df.median()) + elif method == "zero": + return omics_df.fillna(0) + else: + raise ValueError(f"Imputation method '{method}' not recognized.") + +def impute_omics_knn(omics_df: pd.DataFrame, n_neighbors: int = 5) -> pd.DataFrame: + """ + Imputes missing values (NaNs) using the K-Nearest Neighbors (KNN) approach. + + KNN imputation replaces missing values with the average value from the 'n_neighbors' most similar samples/features. This is often more accurate than simple mean imputation. + + Args: + + omics_df (pd.DataFrame): The input DataFrame containing missing values (NaNs). + n_neighbors (int): The number of nearest neighbors to consider for imputation. + + Returns: + + pd.DataFrame: The DataFrame with missing values filled using KNN. + """ + + has_non_numeric = False + for col in omics_df.columns: + if not pd.api.types.is_numeric_dtype(omics_df[col]): + has_non_numeric = True + break + + if has_non_numeric: + logger.error("KNNImputer requires numeric data. Non-numeric columns found.") + + logger.info(f"Starting KNN imputation (k={n_neighbors}) on DataFrame (shape: {omics_df.shape}).") + imputer = KNNImputer(n_neighbors=n_neighbors) + imputed_data = imputer.fit_transform(omics_df.values) + imputed_df = pd.DataFrame(imputed_data, index=omics_df.index, columns=omics_df.columns) + logger.info("KNN imputation complete") + + return imputed_df + +def normalize_omics(omics_df: pd.DataFrame, method: str = "standard") -> pd.DataFrame: + """ + Scales or transforms feature data using common normalization techniques. + + Args: + + omics_df (pd.DataFrame): The input omics DataFrame. + method (str): The scaling strategy. Must be 'standard' (Z-score), 'minmax', or 'log2'. + + + Returns: + + pd.DataFrame: The DataFrame with features normalized according to the specified method. + + Raises: + + ValueError: If the specified normalization method is not recognized. 
+ """ + logger.info(f"Starting normalization on DataFrame (shape: {omics_df.shape}) using method: '{method}'.") + data = omics_df.values + + if method == "standard": + scaler = StandardScaler() + scaled_data = scaler.fit_transform(data) + elif method == "minmax": + scaler = MinMaxScaler() + scaled_data = scaler.fit_transform(data) + elif method == "log2": + if np.any(data < 0): + logger.warning("Log2 transformation applied to data containing negative values. This can lead to unpredictable results") + scaled_data = np.log2(data + 1) + else: + logger.error(f"Normalization method '{method}' not recognized.") + raise ValueError(f"Normalization method '{method}' not recognized.") + + final_df = pd.DataFrame(scaled_data, index=omics_df.index, columns=omics_df.columns) + logger.info("Normalization complete.") + return final_df + +def set_seed(seed_value: int) -> None: + """ + Sets seeds for maximum reproducibility across Python, NumPy, and PyTorch. + + This function sets global random seeds and configures PyTorch/CUDNN to use deterministic algorithms, ensuring that the experiment produces the exact same numerical result across different runs. + + Args: + + seed_value (int): The integer value to use as the random seed. + + Returns: + + None + + """ + logger.info(f"Setting global seed for reproducibility to: {seed_value}") + + os.environ['PYTHONHASHSEED'] = str(seed_value) + random.seed(seed_value) + np.random.seed(seed_value) + torch.manual_seed(seed_value) + + if torch.cuda.is_available(): + logger.info("CUDA available. Applying seed to all GPU operations") + torch.cuda.manual_seed(seed_value) + torch.cuda.manual_seed_all(seed_value) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + else: + logger.info("CUDA not available. Seeding only CPU operations") + + logger.info("Seed setting complete") + +def beta_to_m(df, eps=1e-6): + """ + Converts methylation Beta-values (ratio of methylated intensity to total intensity) to M-values using log2 transformation. + + M-values follow a normal distribution, improving statistical analysis, especially for differential methylation studies, by transforming the constrained [0, 1] Beta scale to an unbounded log-transformed scale (-inf to +inf). + + Args: + + df (pd.DataFrame): The input DataFrame containing Beta-values (0 to 1). + eps (float): A small epsilon value used to clip Beta-values (B) away from 0 and 1, preventing logarithm errors (log(0) or division by zero). + + Returns: + + pd.DataFrame: A new DataFrame containing the log2-transformed M-values, calculated as log2(B / (1 - B)). + """ + logger.info(f"Starting Beta-to-M value conversion (shape: {df.shape}). Epsilon: {eps}") + + has_non_numeric = False + for col in df.columns: + if not pd.api.types.is_numeric_dtype(df[col]): + has_non_numeric = True + break + + if has_non_numeric: + logger.warning("Coercing non-numeric Beta-values to numeric (NaNs will be introduced)") + + df_numeric = df.apply(pd.to_numeric, errors='coerce') + + B = np.clip(df_numeric.values, eps, 1.0 - eps) + M = np.log2(B / (1.0 - B)) + + logger.info("Beta-to-M conversion complete.") + + return pd.DataFrame(M, index=df_numeric.index, columns=df_numeric.columns) diff --git a/docs/source/index.rst b/docs/source/index.rst index 31d60ed..1479c0b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -13,8 +13,8 @@ BioNeuralNet: Graph Neural Networks for Multi-Omics Network Analysis .. 
+    """
+    logger.info(f"Setting global seed for reproducibility to: {seed_value}")
+
+    os.environ['PYTHONHASHSEED'] = str(seed_value)
+    random.seed(seed_value)
+    np.random.seed(seed_value)
+    torch.manual_seed(seed_value)
+
+    if torch.cuda.is_available():
+        logger.info("CUDA available. Applying seed to all GPU operations.")
+        torch.cuda.manual_seed(seed_value)
+        torch.cuda.manual_seed_all(seed_value)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+    else:
+        logger.info("CUDA not available. Seeding only CPU operations.")
+
+    logger.info("Seed setting complete.")
+
+def beta_to_m(df: pd.DataFrame, eps: float = 1e-6) -> pd.DataFrame:
+    """
+    Converts methylation Beta-values (ratio of methylated intensity to total intensity) to M-values using a log2 transformation.
+
+    M-values are approximately normally distributed, improving statistical analysis, especially for differential methylation studies, by transforming the constrained [0, 1] Beta scale to an unbounded log-transformed scale (-inf to +inf).
+
+    Args:
+
+        df (pd.DataFrame): The input DataFrame containing Beta-values (0 to 1).
+        eps (float): A small epsilon value used to clip Beta-values (B) away from 0 and 1, preventing logarithm errors (log(0) or division by zero).
+
+    Returns:
+
+        pd.DataFrame: A new DataFrame containing the log2-transformed M-values, calculated as log2(B / (1 - B)).
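+
+    Example (illustrative; a Beta-value of 0.5 maps to an M-value of 0.0):
+
+        m_df = beta_to_m(beta_df, eps=1e-6)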
+    """
+    logger.info(f"Starting Beta-to-M value conversion (shape: {df.shape}). Epsilon: {eps}")
+
+    has_non_numeric = False
+    for col in df.columns:
+        if not pd.api.types.is_numeric_dtype(df[col]):
+            has_non_numeric = True
+            break
+
+    if has_non_numeric:
+        logger.warning("Coercing non-numeric Beta-values to numeric (NaNs will be introduced).")
+
+    df_numeric = df.apply(pd.to_numeric, errors='coerce')
+
+    B = np.clip(df_numeric.values, eps, 1.0 - eps)
+    M = np.log2(B / (1.0 - B))
+
+    logger.info("Beta-to-M conversion complete.")
+
+    return pd.DataFrame(M, index=df_numeric.index, columns=df_numeric.columns)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 31d60ed..1479c0b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -13,8 +13,8 @@ BioNeuralNet: Graph Neural Networks for Multi-Omics Network Analysis
 .. image:: https://img.shields.io/badge/GitHub-View%20Code-blue
    :target: https://github.com/UCD-BDLab/BioNeuralNet
-.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.17503084.svg
-   :target: https://doi.org/10.5281/zenodo.17503084
+.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.17503083.svg
+   :target: https://doi.org/10.5281/zenodo.17503083

 .. figure:: _static/LOGO_TB.png
    :align: center
diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py
index 28c5536..b04281f 100644
--- a/tests/test_data_utils.py
+++ b/tests/test_data_utils.py
@@ -1,13 +1,21 @@
 import unittest
+from unittest.mock import patch
 import pandas as pd
 import numpy as np
 import io
 import sys
+import logging
+import torch
 from bioneuralnet.utils.data import variance_summary
 from bioneuralnet.utils.data import zero_fraction_summary
 from bioneuralnet.utils.data import expression_summary
 from bioneuralnet.utils.data import correlation_summary
 from bioneuralnet.utils.data import explore_data_stats
+from bioneuralnet.utils.data import impute_omics
+from bioneuralnet.utils.data import impute_omics_knn
+from bioneuralnet.utils.data import normalize_omics
+from bioneuralnet.utils.data import beta_to_m
+from bioneuralnet.utils.data import set_seed

 class TestDataUtils(unittest.TestCase):
     def setUp(self):
@@ -30,6 +38,21 @@ def setUp(self):
             "V": [3.0, 2.0, 1.0],
             "W": [1.0, 0.0, 1.0],
         })
+        self.df_nan = pd.DataFrame({
+            "C1": [1.0, 2.0, np.nan, 4.0],
+            "C2": [10.0, np.nan, 30.0, 40.0],
+            "C3": [5.0, 5.0, 5.0, 5.0],
+        })
+        self.df_beta = pd.DataFrame({
+            "B1": [0.1, 0.5, 0.9],
+            "B2": [0.0, 1.0, 0.5],
+        })
+
+        self.mock_logger = logging.getLogger('test_logger')
+        self.mock_logger.setLevel(logging.INFO)
+        self.mock_stream = io.StringIO()
+        self.mock_handler = logging.StreamHandler(self.mock_stream)
+        self.mock_logger.addHandler(self.mock_handler)

     def test_variance_summary_no_threshold(self):
         stats = variance_summary(self.df_var, low_var_threshold=None)
@@ -93,17 +116,65 @@ def test_correlation_summary(self):
         self.assertAlmostEqual(stats["max_corr_max"], max_corr.max())
         self.assertAlmostEqual(stats["max_corr_std"], max_corr.std())

-    def test_explore_data_stats_prints_all_sections(self):
-        buf = io.StringIO()
-        old_stdout = sys.stdout
-        sys.stdout = buf
-
-        try:
-            explore_data_stats(self.df_corr, name="TestDF")
-        finally:
-            sys.stdout = old_stdout
-
-        output = buf.getvalue()
+    def test_impute_omics_mean(self):
+        df_imputed = impute_omics(self.df_nan, method="mean")
+        self.assertAlmostEqual(df_imputed.loc[2, "C1"], 2.3333333333333335)
+        self.assertAlmostEqual(df_imputed.loc[1, "C2"], 26.666666666666668)
+        self.assertEqual(df_imputed.isna().sum().sum(), 0)
+
+    def test_impute_omics_median(self):
+        df_imputed = impute_omics(self.df_nan, method="median")
+        self.assertAlmostEqual(df_imputed.loc[1, "C2"], 30.0)
+        self.assertEqual(df_imputed.isna().sum().sum(), 0)
+
+    def test_impute_omics_knn(self):
+        df_imputed = impute_omics_knn(self.df_nan, n_neighbors=2)
+        self.assertEqual(df_imputed.shape, self.df_nan.shape)
+        self.assertEqual(df_imputed.isna().sum().sum(), 0)
+        self.assertNotAlmostEqual(df_imputed.loc[1, "C2"], 26.666666666666668)
+
+    def test_normalize_omics_standard(self):
+        df_normalized = normalize_omics(self.df_nan.dropna(), method="standard")
+        self.assertAlmostEqual(df_normalized.mean().sum(), 0.0, places=5)
+        expected_std_sum = np.sqrt(2) * 2
+        self.assertAlmostEqual(df_normalized.std().sum(), expected_std_sum, places=5)
+
+    def test_normalize_omics_log2(self):
+        df_log = normalize_omics(self.df_var, method="log2")
+        self.assertAlmostEqual(df_log.loc[0, "A"], 1.0)
+        self.assertAlmostEqual(df_log.loc[0, "B"], 1.5849625)
+
+    def test_beta_to_m_conversion(self):
+        df_m_values = beta_to_m(self.df_beta, eps=1e-6)
+        self.assertAlmostEqual(df_m_values.loc[0, "B1"], -3.169925)
+        self.assertAlmostEqual(df_m_values.loc[2, "B2"], 0.0)
+        self.assertAlmostEqual(df_m_values.loc[0, "B2"], -19.931567126628412)
+
+    def test_set_seed_reproducibility(self):
+        test_seed = 177
+
+        set_seed(test_seed)
+        result1 = np.random.rand(5)
+
+        set_seed(test_seed)
+        result2 = np.random.rand(5)
+
+        np.testing.assert_array_equal(result1, result2)
+
+        set_seed(test_seed + 1)
+        tensor1 = torch.rand(5)
+
+        set_seed(test_seed + 1)
+        tensor2 = torch.rand(5)
+
+        self.assertTrue(torch.equal(tensor1, tensor2))
+
+    @patch('bioneuralnet.utils.data.logger')
+    def test_explore_data_stats_logs_all_sections(self, mock_logger):
+        mock_logger.addHandler(self.mock_handler)
+        explore_data_stats(self.df_corr, name="TestDF")
+
+        all_call_args = [call[0][0] for call in mock_logger.info.call_args_list]
+        output = "\n".join(all_call_args)

         self.assertIn("Statistics for TestDF:", output)
         self.assertIn("Variance Summary:", output)