From 4d72d654508b6d70c19a1962a23dc65b6de8d158 Mon Sep 17 00:00:00 2001 From: KarinSchork Date: Mon, 12 Jan 2026 16:25:30 +0100 Subject: [PATCH 01/11] start with plan for clustering workflow --- R/WIP_workflow_clustering_heatmap.R | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 R/WIP_workflow_clustering_heatmap.R diff --git a/R/WIP_workflow_clustering_heatmap.R b/R/WIP_workflow_clustering_heatmap.R new file mode 100644 index 0000000..bd8002e --- /dev/null +++ b/R/WIP_workflow_clustering_heatmap.R @@ -0,0 +1,32 @@ + + + +### Prerequisite: +## have data set ready normalized, log2 transformed, if needed missing value imputation + + +### parameters +# colours for the clusters + + + +### Steps: + +# filter out samples with only missing values +# then, if necessary build the topannotation for the heatmap + +# cluster the data with hclust (and columns) +# build dendrograms for clustering +# if needed, colour the dendrogram + +# then plot the heatmap +# plot lineplots (one for each cluster) + + + + + + + + + From bf9dfddd578adfff7e8edfeaa199d259af4f8e2a Mon Sep 17 00:00:00 2001 From: KarinSchork Date: Tue, 20 Jan 2026 18:02:40 +0100 Subject: [PATCH 02/11] build workflow for clustering --- NAMESPACE | 5 +- R/WIP_Clustering_Heatmap_Lineplots.R | 160 +++++++++++++++------------ R/WIP_workflow_clustering_heatmap.R | 146 ++++++++++++++++++++---- R/workflow_ttest.R | 2 +- man/Clustering_heatmap_lineplots.Rd | 44 -------- man/Lineplots.Rd | 22 ++++ man/PCA_Plot.Rd | 4 + man/clustering.Rd | 47 ++++++++ man/filter_PCA_data.Rd | 11 +- man/getClusterInfos.Rd | 27 +++++ man/workflow_clustering.Rd | 89 +++++++++++++++ man/workflow_ttest.Rd | 2 +- 12 files changed, 417 insertions(+), 142 deletions(-) delete mode 100644 man/Clustering_heatmap_lineplots.Rd create mode 100644 man/Lineplots.Rd create mode 100644 man/clustering.Rd create mode 100644 man/getClusterInfos.Rd create mode 100644 man/workflow_clustering.Rd diff --git a/NAMESPACE b/NAMESPACE index c0f6d97..96b9f4a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,8 +3,8 @@ export(ANOVA) export(Boxplots) export(Boxplots_candidates) -export(Clustering_heatmap_lineplots) export(Heatmap_with_groups) +export(Lineplots) export(MA_Plots) export(PCA_Plot) export(ValidValuePlot) @@ -15,11 +15,14 @@ export(add_labels) export(automatedNormalization) export(calculate_significance_categories_ANOVA) export(calculate_significance_categories_ttest) +export(clustering) +export(getClusterInfos) export(prepareData) export(prepareTtestData) export(pvalue_foldchange_histogram) export(ttest) export(workflow_ANOVA) export(workflow_QC) +export(workflow_clustering) export(workflow_ttest) importFrom(magrittr,"%>%") diff --git a/R/WIP_Clustering_Heatmap_Lineplots.R b/R/WIP_Clustering_Heatmap_Lineplots.R index 1c2ed65..1cb6e64 100644 --- a/R/WIP_Clustering_Heatmap_Lineplots.R +++ b/R/WIP_Clustering_Heatmap_Lineplots.R @@ -1,80 +1,82 @@ -#' Clustering, Heatmap and Lineplots + + +#' Cluster proteins for similar patterns across samples #' -#' @param D **data.frame** \cr Dataframe with log-transformed protein intensities, e.g. filtered for significant proteins form the ANOVA or t-test results. -#' @param id **data.frame** \cr dataframe with id information e.g. protein names, gene names, accessions etc. -#' @param output_path **char(1)** \cr Path where results will be saved. -#' @param suffix **char(1)** \cr Suffix for the file names, should start with a underscore. -#' @param nr_clusters **int(1)** \cr Number of clusters. Default is NULL, meaning that the optimal number of clusters will be determined by [dendextend::find_k()]. -#' @param row_split **logi(1)** \cr If TRUE, there will be space between row clusters in the heatmap. -#' @param dist_method **char(1)** \cr distance method for clustering, default is "correlation" (centered Pearson correlation) +#' @param D \strong{data.frame} \cr +#' Dataframe containing protein intensities. +#' @param dist_method \strong{character(1)} \cr +#' Distance measure to use for the hierarchical clustering. In principle, +#' all methods available in \code{\link[amap]{Dist}} are possible, however +#' correlation-based metrics like "correlation", "pearson" or "spearman" +#' are recommended. The default is "correlation", which uses the centere +#' Pearson correlation. +#' @param nr_clusters \strong{integer(1)} \cr +#' Number of clusters to cut the dendrogram into. If \code{NULL} (default), +#' the optimal number of clusters is determined based on silhouette values +#' using the \code{\link[dendextend]{find_k}} function. +#' @param cluster_colours \strong{character vector} \cr +#' Colours to use for the different clusters. If \code{NULL} (default), +#' the default ggplot color palette is used. +#' @param colour_dend \strong{logical(1)} \cr +#' If \code{TRUE} (default), the branches of the dendrogram are coloured +#' according to the clusters, using the defined cluster_colours. #' -#' @return save heatmap and data frame with cluster information, as well as line plots +#' @returns A list containing the following entries: +#' \item{row_dend}{The dendrogram object for the rows (proteins).} +#' \item{nr_clusters}{The number of clusters.} +#' \item{cluster_colours}{The colours used for the different clusters.} #' #' @export #' -#' @examples # TODO -Clustering_heatmap_lineplots <- function(D, - id, - output_path, - suffix = "", - nr_clusters = NULL, - row_split = TRUE, - dist_method = "correlation", - plot_height_heatmap = 15, - plot_width_heatmap = 15, - plot_height_lineplot= 10, - plot_width_lineplot = 15) { - - ### TODO: What to do with NAs - ### TODO: what to do with constant rows (may happen for extremely low of high abundant proteins) - - id_columns <- 1:ncol(id) - +#' @examples +clustering <- function(D, + dist_method = "correlation", + nr_clusters = NULL, + cluster_colours = NULL, + colour_dend = TRUE) { rownames(D) <- 1:nrow(D) # reset rownames (important to match cluster information later) # cluster the proteins with centered Pearson correlation as distance function row_dend <- stats::as.dendrogram(stats::hclust(amap::Dist(D, method = dist_method))) if (is.null(nr_clusters)) { - # find optimal number of clusters based on silhouette values - nr_clusters <- dendextend::find_k(row_dend)$k + # find optimal number of clusters based on silhouette values + nr_clusters <- dendextend::find_k(row_dend)$k } ## define colours for each cluster cluster_colours <- scales::hue_pal()(nr_clusters) - ## colour branches of the dendrogram to plot next to the heatmap - row_dend_color = dendextend::color_branches(row_dend, k = nr_clusters, col = cluster_colours) - if (row_split) { - row_split = nr_clusters - } else { - row_split = NULL + if (colour_dend) { + ## colour branches of the dendrogram to plot next to the heatmap + row_dend = dendextend::color_branches(row_dend, k = nr_clusters, col = cluster_colours) } + return(list(row_dend = row_dend, nr_clusters = nr_clusters, cluster_colours = cluster_colours)) +} - ht <- ProtStatsWF::Heatmap_with_groups(D = D, - id = id, - # TODO: no filtering at the moment but it may be necessary/useful depending on the data - filtermissings = ncol(D), - cluster_rows = row_dend_color, - cluster_columns = FALSE, - log_data = FALSE, - #output_path = output_path, - #suffix = paste(suffix, nr_clusters, sep = "_"), - ### TODO: allow omitting rows with missing values - na_method = "impute", - row_split = nr_clusters, - row_gap = grid::unit(5, "mm")) - grDevices::png(paste0(output_path, "/heatmap", suffix, "_", nr_clusters, ".png"), - height = plot_height_heatmap, - width = plot_width_heatmap, units = "cm", res = 300) - graphics::plot(ht)#[["heatmap"]]) - grDevices::dev.off() + +#' Get cluster information from heatmap +#' +#' @param heatmap \strong{Heatmap object} \cr +#' Heatmap object generated by \code{\link[ComplexHeatmap]{Heatmap}}. +#' @param nr_clusters \strong{integer(1)} \cr +#' Number of clusters to cut the dendrogram into. +#' @param D \strong{data.frame} \cr +#' Dataframe containing only protein intensities. +#' @param id \strong{data.frame} \cr +#' Dataframe containing the ID columns for the parameter D e.g. containing further columns like protein or gene names. +#' +#' @returns A data.frame containing the cluster assignment for each protein along with the original ID columns and the intensity values. +#' @export +#' +#' @examples +getClusterInfos <- function(heatmap, nr_clusters, D, id) { ### get cluster for each protein (cluster number from heatmap doesn't correspond to apply cutree() on the dendrogram. This is why we need to get the cluster number from the heatmap directly). - ht_draw <- ComplexHeatmap::draw(ht)#$heatmap) + ht_draw <- ComplexHeatmap::draw(heatmap)#$heatmap) x <- ComplexHeatmap::row_dend(ht_draw) cluster <- integer(nrow(D)) for (j in 1:nr_clusters) { @@ -82,28 +84,41 @@ Clustering_heatmap_lineplots <- function(D, cluster[cluster_members] <- j } - - ### write table with cluster results - ### TODO: add z-scores to the table + ### TODO: add z-scores to the table or generate separate table for that. RES_clustering <- cbind(id, cluster = cluster, D) - openxlsx::write.xlsx(RES_clustering, paste0(output_path, "/cluster_table", suffix, "_", nr_clusters, ".xlsx")) + + return(RES_clustering) + + +} - ############### - ### draw lineplot for each cluster, coloured by distance to the cluster center - D_zscore <- cbind(ht@matrix, cluster = cluster) - #id_columns <- 1:ncol(id) + +#' Generate Lineplots for each cluster. +#' +#' @param D_zscore \strong{data.frame} \cr +#' Dataframe containing the z-score normalized protein intensities along +#' with a column "cluster" indicating the cluster assignment for each protein. +#' @param cluster_colours \strong{character vector} \cr +#' Colours to use for the different clusters. +#' +#' @returns A list of ggplot2 objects, each containing the lineplot for one cluster. +#' @export +#' +#' @examples +Lineplots <- function(D_zscore, cluster_colours) { + ### TODO: currently only plots without imputation. This should be changed to allow for imputation as well. - grDevices::pdf(paste0(output_path, "/Lineplots", suffix, "_", nr_clusters, ".pdf"), - width = plot_width_lineplot, - height = plot_height_lineplot) + nr_clusters <- max(D_zscore$cluster) + + lineplots <- list() for (i in 1:nr_clusters) { ## choose only data points from the specific cluster - D_tmp <- D_zscore[cluster == i, -c(ncol(D_zscore)), drop = FALSE] # remove id columns and cluster column + D_tmp <- D_zscore[D_zscore$cluster == i, -c(ncol(D_zscore)), drop = FALSE] # remove cluster column ## calculate mean profile of the cluster mean_profile <- colMeans(D_tmp, na.rm = TRUE) @@ -120,7 +135,7 @@ Clustering_heatmap_lineplots <- function(D, X_long <- dplyr::mutate(X_long, ClusterCenter = dplyr::case_when(is.na(Dists_euclidean) ~ "Cluster Center", TRUE ~ "Cluster Members")) - variable <- value <- ClusterCenter <- NULL # to silence notes while checking the package + variable <- value <- ClusterCenter <- id <- NULL # to silence notes while checking the package pl <- ggplot2::ggplot(data = X_long, ggplot2::aes(x = variable, y = value, group = id, colour = Dists_euclidean, linetype = ClusterCenter)) + #, linewidth = ClusterCenter)) + @@ -137,14 +152,17 @@ Clustering_heatmap_lineplots <- function(D, ggplot2::guides(linetype = ggplot2::guide_legend(override.aes = list(linewidth = 1.3), order = 1))+ ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, colour = cluster_colours[i]), legend.position = "bottom") - print(pl) + lineplots[[i]] <- pl + } + return(lineplots) + +} + + - } - grDevices::dev.off() -} diff --git a/R/WIP_workflow_clustering_heatmap.R b/R/WIP_workflow_clustering_heatmap.R index bd8002e..d614de1 100644 --- a/R/WIP_workflow_clustering_heatmap.R +++ b/R/WIP_workflow_clustering_heatmap.R @@ -1,28 +1,128 @@ - -### Prerequisite: -## have data set ready normalized, log2 transformed, if needed missing value imputation - - -### parameters -# colours for the clusters - - - -### Steps: - -# filter out samples with only missing values -# then, if necessary build the topannotation for the heatmap - -# cluster the data with hclust (and columns) -# build dendrograms for clustering -# if needed, colour the dendrogram - -# then plot the heatmap -# plot lineplots (one for each cluster) - - +#' Workflow that clusters proteins for similar patterns over the samples and +#' produces a heatmap and lineplots +#' +#' @param data_path \strong{character} \cr +#' The path to an .xlsx file containing the input data. +#' @param output_path \strong{character} \cr +#' The path to the output folder. +#' @param intensity_columns \strong{integer vector} \cr +#' The numbers of the intensity columns in the table. +#' @param nr_clusters \strong{integer(1)} \cr +#' Number of clusters to cut the dendrogram into. If \code{NULL} (default), +#' the optimal number of clusters is determined based on silhouette values +#' using the \code{\link[dendextend]{find_k}} function. +#' @param cluster_colours \strong{character vector} \cr +#' Colours to use for the different clusters. If \code{NULL} (default), +#' the default ggplot color palette is used. +#' @param row_split \strong{logical(1)} \cr If TRUE, there will be space between row clusters in the heatmap. +#' @param dist_method \strong{character(1)} \cr +#' Distance measure to use for the hierarchical clustering. In principle, +#' all methods available in \code{\link[amap]{Dist}} are possible, however +#' correlation-based metrics like "correlation", "pearson" or "spearman" +#' are recommended. The default is "correlation", which uses the centere +#' Pearson correlation. +#' @param colour_dend \strong{logical(1)} \cr +#' If \code{TRUE} (default), the branches of the dendrogram are coloured +#' according to the clusters, using the defined cluster_colours. +#' +#' @param suffix \strong{character} \cr +#' The suffix for the output files. It needs to start with an underscore. +#' @param plot_height_heatmap \strong{numeric} \cr +#' The height for the heatmap in cm. +#' @param plot_width_heatmap \strong{numeric} \cr +#' The width for the heatmap in cm. +#' @param plot_height_lineplot \strong{numeric} \cr +#' The height for the lineplots in cm. +#' @param plot_width_lineplot \strong{numeric} \cr +#' The width for the lineplotsin cm. +#' @param plot_dpi \strong{numeric} \cr +#' The plot resolution for the heatmap. +#' @param column_name_protein \strong{character(1)} \cr +#' The name of the column containing the protein identifiers. +#' @param ... Additional parameters passed to \code{\link[ProtStatsWF]{Heatmap_with_groups}}. +#' +#' @returns Nothing, but saves a heatmap, a set of lineplots (one per cluster) +#' ans a cluster table to the output folder. +#' @export +#' +#' @examples +workflow_clustering <- function(data_path, + output_path, + intensity_columns, + + nr_clusters = NULL, + cluster_colours = NULL, + row_split = TRUE, + dist_method = "correlation", + colour_dend = TRUE, + + suffix = "", + plot_height_heatmap = 15, + plot_width_heatmap = 15, + plot_height_lineplot= 10, + plot_width_lineplot = 15, + plot_dpi = 300, + + column_name_protein = "Protein", + ...) { + + ### TODO: What to do with NAs + ### TODO: what to do with constant rows (may happen for extremely low of high abundant proteins) + + ### TODO: option to aggregate data by group before clustering + + #### Prepare Data #### + dataPrep <- prepareTtestData(data_path = data_path , intensity_columns = intensity_columns) + + clust <- clustering(D, + dist_method = dist_method, + nr_clusters = nr_clusters, + cluster_colours = cluster_colours, + colour_dend = colour_dend) + + + ht <- ProtStatsWF::Heatmap_with_groups(D = dataPrep$D, + id = dataPrep$id, + # TODO: no filtering at the moment but it may be necessary/useful depending on the data + filtermissings = ncol(D), + cluster_rows = clust$row_dend, + cluster_columns = FALSE, + log_data = FALSE, + ### TODO: allow omitting rows with missing values + na_method = "impute", + row_split = nr_clusters, + row_gap = grid::unit(5, "mm"), + ...) + + grDevices::png(paste0(output_path, "/heatmap", suffix, "_", nr_clusters, ".png"), + height = plot_height_heatmap, + width = plot_width_heatmap, units = "cm", res = plot_dpi) + graphics::plot(ht) + grDevices::dev.off() + + + clusterInfo <- getClusterInfos(heatmap = ht, nr_clusters = nr_clusters, D = dataPrep$D, id = dataPrep$id) + openxlsx::write.xlsx(clusterInfo, paste0(output_path, "/cluster_table", suffix, "_", nr_clusters, ".xlsx")) + + D_zscore <- cbind(ht@matrix, cluster = clusterInfo$cluster) + + lineplots <- Lineplots(D_zscore = D_zscore, cluster_colours = clust$cluster_colours) + + grDevices::pdf(paste0(output_path, "/Lineplots", suffix, "_", nr_clusters, ".pdf"), + width = plot_width_lineplot/2.54, + height = plot_height_lineplot/2.54) + + for(i in 1:clust$nr_clusters) { + print(lineplots[i]) + } + + grDevices::dev.off() + + return(invisible(NULL)) + +} diff --git a/R/workflow_ttest.R b/R/workflow_ttest.R index b63f35e..d26b7ea 100644 --- a/R/workflow_ttest.R +++ b/R/workflow_ttest.R @@ -43,7 +43,7 @@ #' The minimum number of valid values to be an on protein. #' #' @param suffix \strong{character} \cr -#' The suffix of the file names should have one. +#' The suffix for the output files. It needs to start with an underscore. #' @param plot_device \strong{character} \cr #' The type of the output file, e.g. "pdf" or "png". #' @param plot_height \strong{numeric} \cr diff --git a/man/Clustering_heatmap_lineplots.Rd b/man/Clustering_heatmap_lineplots.Rd deleted file mode 100644 index c148819..0000000 --- a/man/Clustering_heatmap_lineplots.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/WIP_Clustering_Heatmap_Lineplots.R -\name{Clustering_heatmap_lineplots} -\alias{Clustering_heatmap_lineplots} -\title{Clustering, Heatmap and Lineplots} -\usage{ -Clustering_heatmap_lineplots( - D, - id, - output_path, - suffix = "", - nr_clusters = NULL, - row_split = TRUE, - dist_method = "correlation", - plot_height_heatmap = 15, - plot_width_heatmap = 15, - plot_height_lineplot = 10, - plot_width_lineplot = 15 -) -} -\arguments{ -\item{D}{\strong{data.frame} \cr Dataframe with log-transformed protein intensities, e.g. filtered for significant proteins form the ANOVA or t-test results.} - -\item{id}{\strong{data.frame} \cr dataframe with id information e.g. protein names, gene names, accessions etc.} - -\item{output_path}{\strong{char(1)} \cr Path where results will be saved.} - -\item{suffix}{\strong{char(1)} \cr Suffix for the file names, should start with a underscore.} - -\item{nr_clusters}{\strong{int(1)} \cr Number of clusters. Default is NULL, meaning that the optimal number of clusters will be determined by \code{\link[dendextend:find_k]{dendextend::find_k()}}.} - -\item{row_split}{\strong{logi(1)} \cr If TRUE, there will be space between row clusters in the heatmap.} - -\item{dist_method}{\strong{char(1)} \cr distance method for clustering, default is "correlation" (centered Pearson correlation)} -} -\value{ -save heatmap and data frame with cluster information, as well as line plots -} -\description{ -Clustering, Heatmap and Lineplots -} -\examples{ -# TODO -} diff --git a/man/Lineplots.Rd b/man/Lineplots.Rd new file mode 100644 index 0000000..71d0150 --- /dev/null +++ b/man/Lineplots.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/WIP_Clustering_Heatmap_Lineplots.R +\name{Lineplots} +\alias{Lineplots} +\title{Generate Lineplots for each cluster.} +\usage{ +Lineplots(D_zscore, cluster_colours) +} +\arguments{ +\item{D_zscore}{\strong{data.frame} \cr +Dataframe containing the z-score normalized protein intensities along +with a column "cluster" indicating the cluster assignment for each protein.} + +\item{cluster_colours}{\strong{character vector} \cr +Colours to use for the different clusters.} +} +\value{ +A list of ggplot2 objects, each containing the lineplot for one cluster. +} +\description{ +Generate Lineplots for each cluster. +} diff --git a/man/PCA_Plot.Rd b/man/PCA_Plot.Rd index 78fa998..55c165f 100644 --- a/man/PCA_Plot.Rd +++ b/man/PCA_Plot.Rd @@ -6,6 +6,7 @@ \usage{ PCA_Plot( D, + id = NULL, groupvar1 = NULL, groupvar2 = NULL, impute = FALSE, @@ -32,6 +33,9 @@ PCA_Plot( \item{D}{\strong{data.frame} \cr The data set containing intensities of the sample.} +\item{id}{\strong{data.frame} \cr +The corresponding ID columns for the parameter D e.g. containing further columns like protein or gene names} + \item{groupvar1}{\strong{character vector} \cr The variable used for colors.} diff --git a/man/clustering.Rd b/man/clustering.Rd new file mode 100644 index 0000000..5e50ff6 --- /dev/null +++ b/man/clustering.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/WIP_Clustering_Heatmap_Lineplots.R +\name{clustering} +\alias{clustering} +\title{Cluster proteins for similar patterns across samples} +\usage{ +clustering( + D, + dist_method = "correlation", + nr_clusters = NULL, + cluster_colours = NULL, + colour_dend = TRUE +) +} +\arguments{ +\item{D}{\strong{data.frame} \cr +Dataframe containing protein intensities.} + +\item{dist_method}{\strong{character(1)} \cr +Distance measure to use for the hierarchical clustering. In principle, +all methods available in \code{\link[amap]{Dist}} are possible, however +correlation-based metrics like "correlation", "pearson" or "spearman" +are recommended. The default is "correlation", which uses the centere +Pearson correlation.} + +\item{nr_clusters}{\strong{integer(1)} \cr +Number of clusters to cut the dendrogram into. If \code{NULL} (default), +the optimal number of clusters is determined based on silhouette values +using the \code{\link[dendextend]{find_k}} function.} + +\item{cluster_colours}{\strong{character vector} \cr +Colours to use for the different clusters. If \code{NULL} (default), +the default ggplot color palette is used.} + +\item{colour_dend}{\strong{logical(1)} \cr +If \code{TRUE} (default), the branches of the dendrogram are coloured +according to the clusters, using the defined cluster_colours.} +} +\value{ +A list containing the following entries: +\item{row_dend}{The dendrogram object for the rows (proteins).} +\item{nr_clusters}{The number of clusters.} +\item{cluster_colours}{The colours used for the different clusters.} +} +\description{ +Cluster proteins for similar patterns across samples +} diff --git a/man/filter_PCA_data.Rd b/man/filter_PCA_data.Rd index 68b59cf..1d16d28 100644 --- a/man/filter_PCA_data.Rd +++ b/man/filter_PCA_data.Rd @@ -4,12 +4,21 @@ \alias{filter_PCA_data} \title{A method for filtering the data for PCA.} \usage{ -filter_PCA_data(D, impute = FALSE, impute_method = "mean", propNA = 0) +filter_PCA_data( + D, + id = NULL, + impute = FALSE, + impute_method = "mean", + propNA = 0 +) } \arguments{ \item{D}{\strong{data.frame} \cr The data set containing intensities of the sample.} +\item{id}{\strong{data.frame} \cr +The corresponding ID columns for the parameter D e.g. containing further columns like protein or gene names} + \item{impute}{\strong{logical} \cr If \code{TRUE}, missing values will be imputed.} diff --git a/man/getClusterInfos.Rd b/man/getClusterInfos.Rd new file mode 100644 index 0000000..2c9f228 --- /dev/null +++ b/man/getClusterInfos.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/WIP_Clustering_Heatmap_Lineplots.R +\name{getClusterInfos} +\alias{getClusterInfos} +\title{Get cluster information from heatmap} +\usage{ +getClusterInfos(heatmap, nr_clusters, D, id) +} +\arguments{ +\item{heatmap}{\strong{Heatmap object} \cr +Heatmap object generated by \code{\link[ComplexHeatmap]{Heatmap}}.} + +\item{nr_clusters}{\strong{integer(1)} \cr +Number of clusters to cut the dendrogram into.} + +\item{D}{\strong{data.frame} \cr +Dataframe containing only protein intensities.} + +\item{id}{\strong{data.frame} \cr +Dataframe containing the ID columns for the parameter D e.g. containing further columns like protein or gene names.} +} +\value{ +A data.frame containing the cluster assignment for each protein along with the original ID columns and the intensity values. +} +\description{ +Get cluster information from heatmap +} diff --git a/man/workflow_clustering.Rd b/man/workflow_clustering.Rd new file mode 100644 index 0000000..d416cde --- /dev/null +++ b/man/workflow_clustering.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/WIP_workflow_clustering_heatmap.R +\name{workflow_clustering} +\alias{workflow_clustering} +\title{Workflow that clusters proteins for similar patterns over the samples and +produces a heatmap and lineplots} +\usage{ +workflow_clustering( + data_path, + output_path, + intensity_columns, + nr_clusters = NULL, + cluster_colours = NULL, + row_split = TRUE, + dist_method = "correlation", + colour_dend = TRUE, + suffix = "", + plot_height_heatmap = 15, + plot_width_heatmap = 15, + plot_height_lineplot = 10, + plot_width_lineplot = 15, + plot_dpi = 300, + column_name_protein = "Protein", + ... +) +} +\arguments{ +\item{data_path}{\strong{character} \cr +The path to an .xlsx file containing the input data.} + +\item{output_path}{\strong{character} \cr +The path to the output folder.} + +\item{intensity_columns}{\strong{integer vector} \cr +The numbers of the intensity columns in the table.} + +\item{nr_clusters}{\strong{integer(1)} \cr +Number of clusters to cut the dendrogram into. If \code{NULL} (default), +the optimal number of clusters is determined based on silhouette values +using the \code{\link[dendextend]{find_k}} function.} + +\item{cluster_colours}{\strong{character vector} \cr +Colours to use for the different clusters. If \code{NULL} (default), +the default ggplot color palette is used.} + +\item{row_split}{\strong{logical(1)} \cr If TRUE, there will be space between row clusters in the heatmap.} + +\item{dist_method}{\strong{character(1)} \cr +Distance measure to use for the hierarchical clustering. In principle, +all methods available in \code{\link[amap]{Dist}} are possible, however +correlation-based metrics like "correlation", "pearson" or "spearman" +are recommended. The default is "correlation", which uses the centere +Pearson correlation.} + +\item{colour_dend}{\strong{logical(1)} \cr +If \code{TRUE} (default), the branches of the dendrogram are coloured +according to the clusters, using the defined cluster_colours.} + +\item{suffix}{\strong{character} \cr +The suffix for the output files. It needs to start with an underscore.} + +\item{plot_height_heatmap}{\strong{numeric} \cr +The height for the heatmap in cm.} + +\item{plot_width_heatmap}{\strong{numeric} \cr +The width for the heatmap in cm.} + +\item{plot_height_lineplot}{\strong{numeric} \cr +The height for the lineplots in cm.} + +\item{plot_width_lineplot}{\strong{numeric} \cr +The width for the lineplotsin cm.} + +\item{plot_dpi}{\strong{numeric} \cr +The plot resolution for the heatmap.} + +\item{column_name_protein}{\strong{character(1)} \cr +The name of the column containing the protein identifiers.} + +\item{...}{Additional parameters passed to \code{\link[ProtStatsWF]{Heatmap_with_groups}}.} +} +\value{ +Nothing, but saves a heatmap, a set of lineplots (one per cluster) +ans a cluster table to the output folder. +} +\description{ +Workflow that clusters proteins for similar patterns over the samples and +produces a heatmap and lineplots +} diff --git a/man/workflow_ttest.Rd b/man/workflow_ttest.Rd index 1bca49a..d4b2627 100644 --- a/man/workflow_ttest.Rd +++ b/man/workflow_ttest.Rd @@ -63,7 +63,7 @@ The maximum number of valid values to be an off protein.} The minimum number of valid values to be an on protein.} \item{suffix}{\strong{character} \cr -The suffix of the file names should have one.} +The suffix for the output files. It needs to start with an underscore.} \item{plot_device}{\strong{character} \cr The type of the output file, e.g. "pdf" or "png".} From 6b50251c29ed165197d1813c7b14667eadca3e84 Mon Sep 17 00:00:00 2001 From: KarinSchork Date: Tue, 20 Jan 2026 18:02:50 +0100 Subject: [PATCH 03/11] Increment version number to 0.2.0 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 55d7524..ff68517 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ProtStatsWF Title: Statistics Workflows for Proteomics Data -Version: 0.1.1 +Version: 0.2.0 Authors@R: c( person("Karin", "Schork", , "karin.schork@rub.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-3756-4347")), From 7bfd411c18048419ce6bf70ccb3d8edc5ec252e0 Mon Sep 17 00:00:00 2001 From: Karin Schork Date: Wed, 21 Jan 2026 14:35:28 +0100 Subject: [PATCH 04/11] Update R/WIP_workflow_clustering_heatmap.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/WIP_workflow_clustering_heatmap.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/WIP_workflow_clustering_heatmap.R b/R/WIP_workflow_clustering_heatmap.R index d614de1..6767bc1 100644 --- a/R/WIP_workflow_clustering_heatmap.R +++ b/R/WIP_workflow_clustering_heatmap.R @@ -36,7 +36,7 @@ #' @param plot_height_lineplot \strong{numeric} \cr #' The height for the lineplots in cm. #' @param plot_width_lineplot \strong{numeric} \cr -#' The width for the lineplotsin cm. +#' The width for the lineplots in cm. #' @param plot_dpi \strong{numeric} \cr #' The plot resolution for the heatmap. #' @param column_name_protein \strong{character(1)} \cr From 5057d08933c36c9b98f0a73e62156443fbb74703 Mon Sep 17 00:00:00 2001 From: Karin Schork Date: Wed, 21 Jan 2026 14:35:46 +0100 Subject: [PATCH 05/11] Update R/WIP_Clustering_Heatmap_Lineplots.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/WIP_Clustering_Heatmap_Lineplots.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/WIP_Clustering_Heatmap_Lineplots.R b/R/WIP_Clustering_Heatmap_Lineplots.R index 1cb6e64..7965c5f 100644 --- a/R/WIP_Clustering_Heatmap_Lineplots.R +++ b/R/WIP_Clustering_Heatmap_Lineplots.R @@ -44,8 +44,10 @@ clustering <- function(D, nr_clusters <- dendextend::find_k(row_dend)$k } - ## define colours for each cluster - cluster_colours <- scales::hue_pal()(nr_clusters) + ## define colours for each cluster (only if not provided by user) + if (is.null(cluster_colours)) { + cluster_colours <- scales::hue_pal()(nr_clusters) + } if (colour_dend) { ## colour branches of the dendrogram to plot next to the heatmap From 51665c67bb2e1efbd78fbd6038b90d16d31be6b2 Mon Sep 17 00:00:00 2001 From: Karin Schork Date: Wed, 21 Jan 2026 14:35:56 +0100 Subject: [PATCH 06/11] Update R/WIP_workflow_clustering_heatmap.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/WIP_workflow_clustering_heatmap.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/WIP_workflow_clustering_heatmap.R b/R/WIP_workflow_clustering_heatmap.R index 6767bc1..043fe93 100644 --- a/R/WIP_workflow_clustering_heatmap.R +++ b/R/WIP_workflow_clustering_heatmap.R @@ -44,7 +44,7 @@ #' @param ... Additional parameters passed to \code{\link[ProtStatsWF]{Heatmap_with_groups}}. #' #' @returns Nothing, but saves a heatmap, a set of lineplots (one per cluster) -#' ans a cluster table to the output folder. +#' and a cluster table to the output folder. #' @export #' #' @examples From 02a21d7617961c512843f6f4800234098cf76e87 Mon Sep 17 00:00:00 2001 From: Karin Schork Date: Wed, 21 Jan 2026 14:36:05 +0100 Subject: [PATCH 07/11] Update man/clustering.Rd Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- man/clustering.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/clustering.Rd b/man/clustering.Rd index 5e50ff6..f40b9be 100644 --- a/man/clustering.Rd +++ b/man/clustering.Rd @@ -20,7 +20,7 @@ Dataframe containing protein intensities.} Distance measure to use for the hierarchical clustering. In principle, all methods available in \code{\link[amap]{Dist}} are possible, however correlation-based metrics like "correlation", "pearson" or "spearman" -are recommended. The default is "correlation", which uses the centere +are recommended. The default is "correlation", which uses the centered Pearson correlation.} \item{nr_clusters}{\strong{integer(1)} \cr From 9658c5a752d9d466af1e85053592e49e4e8978f6 Mon Sep 17 00:00:00 2001 From: Karin Schork Date: Wed, 21 Jan 2026 14:36:29 +0100 Subject: [PATCH 08/11] Update R/WIP_workflow_clustering_heatmap.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/WIP_workflow_clustering_heatmap.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/WIP_workflow_clustering_heatmap.R b/R/WIP_workflow_clustering_heatmap.R index 043fe93..139818f 100644 --- a/R/WIP_workflow_clustering_heatmap.R +++ b/R/WIP_workflow_clustering_heatmap.R @@ -92,25 +92,25 @@ workflow_clustering <- function(data_path, log_data = FALSE, ### TODO: allow omitting rows with missing values na_method = "impute", - row_split = nr_clusters, + row_split = clust$nr_clusters, row_gap = grid::unit(5, "mm"), ...) - grDevices::png(paste0(output_path, "/heatmap", suffix, "_", nr_clusters, ".png"), + grDevices::png(paste0(output_path, "/heatmap", suffix, "_", clust$nr_clusters, ".png"), height = plot_height_heatmap, width = plot_width_heatmap, units = "cm", res = plot_dpi) graphics::plot(ht) grDevices::dev.off() - clusterInfo <- getClusterInfos(heatmap = ht, nr_clusters = nr_clusters, D = dataPrep$D, id = dataPrep$id) - openxlsx::write.xlsx(clusterInfo, paste0(output_path, "/cluster_table", suffix, "_", nr_clusters, ".xlsx")) + clusterInfo <- getClusterInfos(heatmap = ht, nr_clusters = clust$nr_clusters, D = dataPrep$D, id = dataPrep$id) + openxlsx::write.xlsx(clusterInfo, paste0(output_path, "/cluster_table", suffix, "_", clust$nr_clusters, ".xlsx")) D_zscore <- cbind(ht@matrix, cluster = clusterInfo$cluster) lineplots <- Lineplots(D_zscore = D_zscore, cluster_colours = clust$cluster_colours) - grDevices::pdf(paste0(output_path, "/Lineplots", suffix, "_", nr_clusters, ".pdf"), + grDevices::pdf(paste0(output_path, "/Lineplots", suffix, "_", clust$nr_clusters, ".pdf"), width = plot_width_lineplot/2.54, height = plot_height_lineplot/2.54) From 4e364d2117f15c7e2c485fc17071318cbe0e6a91 Mon Sep 17 00:00:00 2001 From: Karin Schork Date: Wed, 21 Jan 2026 14:36:46 +0100 Subject: [PATCH 09/11] Update R/WIP_Clustering_Heatmap_Lineplots.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/WIP_Clustering_Heatmap_Lineplots.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/WIP_Clustering_Heatmap_Lineplots.R b/R/WIP_Clustering_Heatmap_Lineplots.R index 7965c5f..dbfdf5d 100644 --- a/R/WIP_Clustering_Heatmap_Lineplots.R +++ b/R/WIP_Clustering_Heatmap_Lineplots.R @@ -9,7 +9,7 @@ #' Distance measure to use for the hierarchical clustering. In principle, #' all methods available in \code{\link[amap]{Dist}} are possible, however #' correlation-based metrics like "correlation", "pearson" or "spearman" -#' are recommended. The default is "correlation", which uses the centere +#' are recommended. The default is "correlation", which uses the centered #' Pearson correlation. #' @param nr_clusters \strong{integer(1)} \cr #' Number of clusters to cut the dendrogram into. If \code{NULL} (default), From d0b2ea8edcde358ce9eb706287b90303a5500b91 Mon Sep 17 00:00:00 2001 From: Karin Schork Date: Wed, 21 Jan 2026 14:37:00 +0100 Subject: [PATCH 10/11] Update man/workflow_clustering.Rd Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- man/workflow_clustering.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/workflow_clustering.Rd b/man/workflow_clustering.Rd index d416cde..0699b2f 100644 --- a/man/workflow_clustering.Rd +++ b/man/workflow_clustering.Rd @@ -81,7 +81,7 @@ The name of the column containing the protein identifiers.} } \value{ Nothing, but saves a heatmap, a set of lineplots (one per cluster) -ans a cluster table to the output folder. +and a cluster table to the output folder. } \description{ Workflow that clusters proteins for similar patterns over the samples and From 7e762a3152b93a10d8b7fc788c0dc1403e9ae95f Mon Sep 17 00:00:00 2001 From: KarinSchork Date: Wed, 21 Jan 2026 15:21:27 +0100 Subject: [PATCH 11/11] fix clustering workflow --- ...atmap_Lineplots.R => Clustering_Lineplots.R} | 7 +++++++ R/Heatmap.R | 1 + R/helpers_ttest_ANOVA.R | 9 ++++++++- ...ustering_heatmap.R => workflow_clustering.R} | 17 +++++++++++------ 4 files changed, 27 insertions(+), 7 deletions(-) rename R/{WIP_Clustering_Heatmap_Lineplots.R => Clustering_Lineplots.R} (98%) rename R/{WIP_workflow_clustering_heatmap.R => workflow_clustering.R} (93%) diff --git a/R/WIP_Clustering_Heatmap_Lineplots.R b/R/Clustering_Lineplots.R similarity index 98% rename from R/WIP_Clustering_Heatmap_Lineplots.R rename to R/Clustering_Lineplots.R index dbfdf5d..214c362 100644 --- a/R/WIP_Clustering_Heatmap_Lineplots.R +++ b/R/Clustering_Lineplots.R @@ -35,6 +35,9 @@ clustering <- function(D, nr_clusters = NULL, cluster_colours = NULL, colour_dend = TRUE) { + + D2 <<- D + rownames(D) <- 1:nrow(D) # reset rownames (important to match cluster information later) # cluster the proteins with centered Pearson correlation as distance function row_dend <- stats::as.dendrogram(stats::hclust(amap::Dist(D, method = dist_method))) @@ -86,6 +89,10 @@ getClusterInfos <- function(heatmap, nr_clusters, D, id) { cluster[cluster_members] <- j } + # print(dim(id)) + # print(dim(D)) + # print(length(cluster)) + ### TODO: add z-scores to the table or generate separate table for that. RES_clustering <- cbind(id, cluster = cluster, D) diff --git a/R/Heatmap.R b/R/Heatmap.R index 5e839e5..a1be503 100644 --- a/R/Heatmap.R +++ b/R/Heatmap.R @@ -196,6 +196,7 @@ Heatmap_with_groups <- function(D, #row.names(data.asmatrix) <- row_labels + data.asmatrix2 <<- data.asmatrix ht <- ComplexHeatmap::Heatmap(data.asmatrix, column_title = title, diff --git a/R/helpers_ttest_ANOVA.R b/R/helpers_ttest_ANOVA.R index c446c2e..075c851 100644 --- a/R/helpers_ttest_ANOVA.R +++ b/R/helpers_ttest_ANOVA.R @@ -18,16 +18,23 @@ #'} prepareTtestData <- function(data_path, - intensity_columns + intensity_columns, remove_missings = FALSE ){ D <- openxlsx::read.xlsx(data_path, na.strings = c("NA", "NaN", "Filtered","#NV")) id <- D[, -intensity_columns, drop = FALSE] + D <- D[, intensity_columns] D[D == 0] <- NA + if (remove_missings) { + keep <- rowSums(is.na(D)) == 0 + D <- D[keep, , drop = FALSE] + id <- id[keep, , drop = FALSE] + } + group <- factor(limma::strsplit2(colnames(D), "_")[,1]) number_of_groups <- length(levels(group)) diff --git a/R/WIP_workflow_clustering_heatmap.R b/R/workflow_clustering.R similarity index 93% rename from R/WIP_workflow_clustering_heatmap.R rename to R/workflow_clustering.R index 139818f..d88ccc4 100644 --- a/R/WIP_workflow_clustering_heatmap.R +++ b/R/workflow_clustering.R @@ -61,8 +61,8 @@ workflow_clustering <- function(data_path, suffix = "", plot_height_heatmap = 15, plot_width_heatmap = 15, - plot_height_lineplot= 10, - plot_width_lineplot = 15, + plot_height_lineplot= 20, + plot_width_lineplot = 25, plot_dpi = 300, column_name_protein = "Protein", @@ -74,9 +74,12 @@ workflow_clustering <- function(data_path, ### TODO: option to aggregate data by group before clustering #### Prepare Data #### - dataPrep <- prepareTtestData(data_path = data_path , intensity_columns = intensity_columns) + dataPrep <- prepareTtestData(data_path = data_path , intensity_columns = intensity_columns, + remove_missings = TRUE) - clust <- clustering(D, + Dprep2id <<- dataPrep$id + + clust <- clustering(dataPrep$D, dist_method = dist_method, nr_clusters = nr_clusters, cluster_colours = cluster_colours, @@ -103,10 +106,12 @@ workflow_clustering <- function(data_path, grDevices::dev.off() - clusterInfo <- getClusterInfos(heatmap = ht, nr_clusters = clust$nr_clusters, D = dataPrep$D, id = dataPrep$id) + clusterInfo <- getClusterInfos(heatmap = ht, nr_clusters = clust$nr_clusters, D = dataPrep$D, id = dataPrep$ID) openxlsx::write.xlsx(clusterInfo, paste0(output_path, "/cluster_table", suffix, "_", clust$nr_clusters, ".xlsx")) - D_zscore <- cbind(ht@matrix, cluster = clusterInfo$cluster) + D_zscore <- data.frame(ht@matrix, cluster = clusterInfo$cluster) + + D_zscore2 <<- D_zscore lineplots <- Lineplots(D_zscore = D_zscore, cluster_colours = clust$cluster_colours)