Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,10 @@ lib/
.DS_Store
cancer_output_1/GlobalNetwork.csv
Parkinson
KIPAN
dpmon_cv_results
dpmon_cv_results_GCN_FINAL
dpmon_cv_results_SAGE_FINAL
dpmon_tuning
GBMLGG
PAAN
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
[![GitHub Contributors](https://img.shields.io/github/contributors/UCD-BDLab/BioNeuralNet)](https://github.com/UCD-BDLab/BioNeuralNet/graphs/contributors)
[![Downloads](https://static.pepy.tech/badge/bioneuralnet)](https://pepy.tech/project/bioneuralnet)
[![Documentation](https://img.shields.io/badge/docs-read%20the%20docs-blue.svg)](https://bioneuralnet.readthedocs.io/en/latest/)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17503084.svg)](https://doi.org/10.5281/zenodo.17503084)

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17503083.svg)](https://doi.org/10.5281/zenodo.17503083)

## Welcome to BioNeuralNet 1.1.3

Expand Down
22 changes: 14 additions & 8 deletions bioneuralnet/datasets/dataset_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ class DatasetLoader:
def __init__(self, dataset_name: str):
"""
Args:
dataset_name (str): "example1", "monet", or "tcga_brca".
dataset_name (str): "example1", "monet", "brca", "gbm", or "kipan".

returns:

Expand All @@ -14,7 +14,7 @@ def __init__(self, dataset_name: str):

Example:

tcga_brca = DatasetLoader("tcga_brca")
tcga_brca = DatasetLoader("brca")
tcga_brca.shape
# {'brca_mirna': (108, 1000), 'brca_pam50': (108, 50), ...}
mirna = tcga_brca.data["brca_mirna"]
Expand Down Expand Up @@ -58,14 +58,20 @@ def _load_data(self):
self.data["clinical"] = pd.read_csv(folder / "clinical.csv", index_col=0)
self.data["rna"] = pd.read_csv(folder / "rna.csv", index_col=0)
self.data["meth"] = pd.read_csv(folder / "meth.csv", index_col=0)
#meth_part1 = pd.read_csv(folder / "meth_1.csv", index_col=0)
#meth_part2= pd.read_csv(folder / "meth_2.csv", index_col=0)

#rna_part1 = pd.read_csv(folder / "rna_1.csv", index_col=0)
#rna_part2 = pd.read_csv(folder / "rna_2.csv", index_col=0)
# elif self.dataset_name == "gbm":
# self.data["mirna"] = pd.read_csv(folder / "mirna.csv", index_col=0)
# self.data["target"] = pd.read_csv(folder / "target.csv", index_col=0)
# self.data["clinical"] = pd.read_csv(folder / "clinical.csv", index_col=0)
# self.data["rna"] = pd.read_csv(folder / "rna.csv", index_col=0)
# self.data["meth"] = pd.read_csv(folder / "meth.csv", index_col=0)

#self.data["meth"] = pd.concat([meth_part1, meth_part2], axis=0)
#self.data["rna"] = pd.concat([rna_part1, rna_part2], axis=0)
# elif self.dataset_name == "kipan":
# self.data["mirna"] = pd.read_csv(folder / "mirna.csv", index_col=0)
# self.data["target"] = pd.read_csv(folder / "target.csv", index_col=0)
# self.data["clinical"] = pd.read_csv(folder / "clinical.csv", index_col=0)
# self.data["rna"] = pd.read_csv(folder / "rna.csv", index_col=0)
# self.data["meth"] = pd.read_csv(folder / "meth.csv", index_col=0)

else:
raise ValueError(f"Dataset '{self.dataset_name}' is not recognized.")
Expand Down
2 changes: 1 addition & 1 deletion bioneuralnet/downstream_task/dpmon.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ def __init__(
self.gnn = SAGE(
input_dim=gnn_input_dim,
hidden_dim=gnn_hidden_dim,
output_dim=gnn_hidden_dim,
#output_dim=gnn_hidden_dim,
layer_num=gnn_layer_num,
final_layer="none",
)
Expand Down
4 changes: 2 additions & 2 deletions bioneuralnet/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .logger import get_logger
from .rdata_convert import rdata_to_df
from .data import variance_summary, zero_fraction_summary, expression_summary, correlation_summary, explore_data_stats
from .data import variance_summary, zero_fraction_summary, expression_summary, correlation_summary, explore_data_stats, impute_omics, impute_omics_knn, set_seed, normalize_omics, beta_to_m
from .preprocess import preprocess_clinical, clean_inf_nan, select_top_k_variance, select_top_k_correlation, select_top_randomforest, top_anova_f_features, prune_network, prune_network_by_quantile, network_remove_low_variance, network_remove_high_zero_fraction
from .graph import gen_similarity_graph, gen_correlation_graph, gen_threshold_graph, gen_gaussian_knn_graph, gen_lasso_graph, gen_mst_graph, gen_snn_graph

# Public API of the utils package. Names are listed in the same order as the
# submodule imports above: logger, rdata_convert, data, preprocess, graph.
__all__ = [
    # .logger
    "get_logger",
    # .rdata_convert
    "rdata_to_df",
    # .data
    "variance_summary",
    "zero_fraction_summary",
    "expression_summary",
    "correlation_summary",
    "explore_data_stats",
    "impute_omics",
    "impute_omics_knn",
    "set_seed",
    "normalize_omics",
    "beta_to_m",
    # .preprocess
    "preprocess_clinical",
    "clean_inf_nan",
    "select_top_k_variance",
    "select_top_k_correlation",
    "select_top_randomforest",
    "top_anova_f_features",
    "prune_network",
    "prune_network_by_quantile",
    "network_remove_low_variance",
    "network_remove_high_zero_fraction",
    # .graph
    "gen_similarity_graph",
    "gen_correlation_graph",
    "gen_threshold_graph",
    "gen_gaussian_knn_graph",
    "gen_lasso_graph",
    "gen_mst_graph",
    "gen_snn_graph",
]
251 changes: 239 additions & 12 deletions bioneuralnet/utils/data.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,29 @@
import os
import random
import torch
import pandas as pd
import numpy as np
from typing import Optional
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import KNNImputer
from .logger import get_logger

logger = get_logger(__name__)

def variance_summary(df: pd.DataFrame, low_var_threshold: Optional[float] = None) -> dict:
"""
Compute summary statistics for column variances in the DataFrame
Computes key summary statistics for the feature (column) variances within an omics DataFrame.

This is useful for assessing feature distribution and identifying low-variance features prior to modeling.

Args:

df (pd.DataFrame): The input omics DataFrame (samples as rows, features as columns).
low_var_threshold (Optional[float]): A threshold used to count features falling below this variance level.

Returns:

dict: A dictionary containing the mean, median, min, max, and standard deviation of the column variances. If a threshold is provided, it also includes 'num_low_variance_features'.
"""

variances = df.var()
Expand All @@ -25,7 +41,18 @@ def variance_summary(df: pd.DataFrame, low_var_threshold: Optional[float] = None

def zero_fraction_summary(df: pd.DataFrame, high_zero_threshold: Optional[float] = None) -> dict:
"""
Compute summary statistics for the fraction of zeros in each column
Computes statistics on the fraction of zero values present in each feature (column).

This helps identify feature sparsity, which is common in omics data (e.g., RNA-seq FPKM).

Args:

df (pd.DataFrame): The input omics DataFrame.
high_zero_threshold (Optional[float]): A threshold used to count features whose zero-fraction exceeds this value.

Returns:

dict: A dictionary containing the mean, median, min, max, and standard deviation of the zero fractions across all columns. If a threshold is provided, it includes 'num_high_zero_features'.
"""

zero_fraction = (df == 0).sum(axis=0) / df.shape[0]
Expand All @@ -43,7 +70,17 @@ def zero_fraction_summary(df: pd.DataFrame, high_zero_threshold: Optional[float]

def expression_summary(df: pd.DataFrame) -> dict:
"""
Compute summary statistics for the mean expression of features
Computes summary statistics for the mean expression (average value) of all features.

Provides insight into the overall magnitude and central tendency of the data values.

Args:

df (pd.DataFrame): The input omics DataFrame.

Returns:

dict: A dictionary containing the mean, median, min, max, and standard deviation of the feature means.
"""

mean_expression = df.mean()
Expand All @@ -60,7 +97,17 @@ def expression_summary(df: pd.DataFrame) -> dict:

def correlation_summary(df: pd.DataFrame) -> dict:
"""
Compute summary statistics of the maximum pairwise correlation
Computes summary statistics on the maximum pairwise (absolute) correlation observed for each feature in the DataFrame.

This helps identify features that are highly redundant or collinear.

Args:

df (pd.DataFrame): The input omics DataFrame.

Returns:

dict: A dictionary containing the mean, median, min, max, and standard deviation of the maximum absolute correlation values.
"""
corr_matrix = df.corr().abs()
np.fill_diagonal(corr_matrix.values, 0)
Expand All @@ -77,21 +124,201 @@ def correlation_summary(df: pd.DataFrame) -> dict:

def explore_data_stats(omics_df: pd.DataFrame, name: str = "Data") -> None:
    """
    Logs a set of key statistics for an omics DataFrame.

    Combines the variance, zero-fraction, expression, and correlation summaries
    so the quality of a dataset can be assessed from a single call. Every
    summary is emitted once, through the module logger.

    Args:

        omics_df (pd.DataFrame): The input omics DataFrame.
        name (str): A descriptive name for the dataset (e.g., 'X_rna_final') used to label the output.

    Returns:

        None: The statistics are written to the module logger at INFO level.
    """
    logger.info(f"Statistics for {name}:")

    var_stats = variance_summary(omics_df, low_var_threshold=1e-4)
    logger.info(f"Variance Summary: {var_stats}")

    zero_stats = zero_fraction_summary(omics_df, high_zero_threshold=0.50)
    logger.info(f"Zero Fraction Summary: {zero_stats}")

    expr_stats = expression_summary(omics_df)
    logger.info(f"Expression Summary: {expr_stats}")

    # The correlation summary is best-effort: a failure is reported in the log
    # instead of aborting the whole report.
    try:
        corr_stats = correlation_summary(omics_df)
        logger.info(f"Correlation Summary: {corr_stats}")
    except Exception as e:
        logger.info(f"Correlation Summary: Could not compute due to: {e}")

    # Trailing blank entry separates consecutive reports in the log output.
    logger.info("\n")

def impute_omics(omics_df: pd.DataFrame, method: str = "mean") -> pd.DataFrame:
    """
    Imputes missing values (NaNs) in the omics DataFrame using a specified strategy.

    The input frame is not modified; a filled copy is returned. For 'mean' and
    'median' the fill value is computed per column from the numeric columns
    only, so a frame that also carries a non-numeric column (e.g. a label) no
    longer makes the statistic computation fail; such columns are returned
    unchanged. A column with no observed values has no mean/median and is
    therefore left unfilled.

    Args:

        omics_df (pd.DataFrame): The input DataFrame containing missing values.
        method (str): The imputation strategy to use. Must be 'mean', 'median', or 'zero'.

    Returns:

        pd.DataFrame: The DataFrame with missing values filled.

    Raises:

        ValueError: If the specified imputation method is not recognized.
    """
    if method == "mean":
        fill_values = omics_df.mean(numeric_only=True)
    elif method == "median":
        fill_values = omics_df.median(numeric_only=True)
    elif method == "zero":
        fill_values = 0
    else:
        raise ValueError(f"Imputation method '{method}' not recognized.")

    # fillna with a Series aligns on column labels; with a scalar it fills everywhere.
    return omics_df.fillna(fill_values)

def impute_omics_knn(omics_df: pd.DataFrame, n_neighbors: int = 5) -> pd.DataFrame:
    """
    Imputes missing values (NaNs) using the K-Nearest Neighbors (KNN) approach.

    The values of the frame are passed to scikit-learn's KNNImputer and the
    result is wrapped in a new DataFrame that reuses the original index and
    column labels.

    Args:

        omics_df (pd.DataFrame): The input DataFrame containing missing values (NaNs).
        n_neighbors (int): The number of nearest neighbors to consider for imputation.

    Returns:

        pd.DataFrame: The DataFrame with missing values filled using KNN.
    """
    # Inspect the dtypes directly instead of `omics_df[col]`: with duplicated
    # column labels the latter yields a DataFrame and the check would misfire.
    has_non_numeric = any(
        not pd.api.types.is_numeric_dtype(dtype) for dtype in omics_df.dtypes
    )

    # Only reported, not enforced: the imputer itself decides whether the
    # values can be used.
    if has_non_numeric:
        logger.error("KNNImputer requires numeric data. Non-numeric columns found.")

    logger.info(f"Starting KNN imputation (k={n_neighbors}) on DataFrame (shape: {omics_df.shape}).")
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_data = imputer.fit_transform(omics_df.values)
    # NOTE(review): KNNImputer presumably drops columns that are entirely NaN by
    # default, which would make the column count differ from `omics_df.columns`
    # here - confirm against the installed scikit-learn version.
    imputed_df = pd.DataFrame(imputed_data, index=omics_df.index, columns=omics_df.columns)
    logger.info("KNN imputation complete")

    return imputed_df

def normalize_omics(omics_df: pd.DataFrame, method: str = "standard") -> pd.DataFrame:
    """
    Scales or transforms feature data using one of three normalization methods.

    'standard' and 'minmax' fit the matching scikit-learn scaler on the frame's
    values; 'log2' computes log2(value + 1) and emits a warning when any value
    is negative. The result keeps the index and column labels of the input.

    Args:

        omics_df (pd.DataFrame): The input omics DataFrame.
        method (str): The scaling strategy. Must be 'standard' (Z-score), 'minmax', or 'log2'.

    Returns:

        pd.DataFrame: A new DataFrame with the normalized values.

    Raises:

        ValueError: If the specified normalization method is not recognized.
    """
    logger.info(f"Starting normalization on DataFrame (shape: {omics_df.shape}) using method: '{method}'.")
    values = omics_df.values

    if method in ("standard", "minmax"):
        # Both scalers share the same fit/transform call; only the class differs.
        scaler = StandardScaler() if method == "standard" else MinMaxScaler()
        transformed = scaler.fit_transform(values)
    elif method == "log2":
        contains_negative = np.any(values < 0)
        if contains_negative:
            logger.warning("Log2 transformation applied to data containing negative values. This can lead to unpredictable results")
        # +1 offset keeps zero entries finite (log2(1) == 0).
        transformed = np.log2(values + 1)
    else:
        logger.error(f"Normalization method '{method}' not recognized.")
        raise ValueError(f"Normalization method '{method}' not recognized.")

    normalized_df = pd.DataFrame(
        transformed,
        index=omics_df.index,
        columns=omics_df.columns,
    )
    logger.info("Normalization complete.")
    return normalized_df

def set_seed(seed_value: int) -> None:
    """
    Seeds the Python, NumPy, and PyTorch random number generators.

    Also exports the seed as the PYTHONHASHSEED environment variable and, when
    CUDA is available, seeds the GPU generators and switches cuDNN to
    deterministic mode with benchmarking disabled.

    NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so the
    assignment made here presumably only affects processes started afterwards -
    confirm if hash ordering matters for the experiment.

    Args:

        seed_value (int): The integer value to use as the random seed.

    Returns:

        None
    """
    logger.info(f"Setting global seed for reproducibility to: {seed_value}")

    os.environ["PYTHONHASHSEED"] = str(seed_value)

    # CPU-side generators: stdlib random, NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    if not torch.cuda.is_available():
        logger.info("CUDA not available. Seeding only CPU operations")
    else:
        logger.info("CUDA available. Applying seed to all GPU operations")
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        # Trade cuDNN auto-tuning for run-to-run repeatability.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    logger.info("Seed setting complete")

def beta_to_m(df: pd.DataFrame, eps: float = 1e-6) -> pd.DataFrame:
    """
    Converts methylation Beta-values to M-values using a log2 (logit) transformation.

    Every column is first coerced to numeric (entries that cannot be parsed
    become NaN), the values are clipped to [eps, 1 - eps], and the M-value is
    computed as log2(B / (1 - B)). This maps the bounded Beta scale onto an
    unbounded one.

    Args:

        df (pd.DataFrame): The input DataFrame containing Beta-values (0 to 1).
        eps (float): A small epsilon used to clip Beta-values away from 0 and 1, preventing log(0) and division by zero.

    Returns:

        pd.DataFrame: A new DataFrame of M-values with the same index and columns as the input.
    """
    logger.info(f"Starting Beta-to-M value conversion (shape: {df.shape}). Epsilon: {eps}")

    # Inspect the dtypes directly instead of `df[col]`: with duplicated column
    # labels the latter yields a DataFrame and the check would misfire.
    has_non_numeric = any(
        not pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes
    )

    if has_non_numeric:
        logger.warning("Coercing non-numeric Beta-values to numeric (NaNs will be introduced)")

    # Applied unconditionally; a no-op for columns that are already numeric.
    df_numeric = df.apply(pd.to_numeric, errors='coerce')

    # Clip before the logit so exact 0 and 1 stay finite.
    B = np.clip(df_numeric.values, eps, 1.0 - eps)
    M = np.log2(B / (1.0 - B))

    logger.info("Beta-to-M conversion complete.")

    return pd.DataFrame(M, index=df_numeric.index, columns=df_numeric.columns)
4 changes: 2 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ BioNeuralNet: Graph Neural Networks for Multi-Omics Network Analysis
.. image:: https://img.shields.io/badge/GitHub-View%20Code-blue
:target: https://github.com/UCD-BDLab/BioNeuralNet

.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.17503084.svg
:target: https://doi.org/10.5281/zenodo.17503084
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.17503083.svg
:target: https://doi.org/10.5281/zenodo.17503083

.. figure:: _static/LOGO_TB.png
:align: center
Expand Down
Loading
Loading