mpc-bioinformatics · KarinSchork · Jan 21, 2026 · Jan 12, 2026 · Jan 20, 2026 · Jan 20, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: ProtStatsWF
 Title: Statistics Workflows for Proteomics Data
-Version: 0.1.1
+Version: 0.2.0
 Authors@R: c(
     person("Karin", "Schork", , "karin.schork@rub.de", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-3756-4347")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,8 +3,8 @@
 export(ANOVA)
 export(Boxplots)
 export(Boxplots_candidates)
-export(Clustering_heatmap_lineplots)
 export(Heatmap_with_groups)
+export(Lineplots)
 export(MA_Plots)
 export(PCA_Plot)
 export(ValidValuePlot)
@@ -15,11 +15,14 @@ export(add_labels)
 export(automatedNormalization)
 export(calculate_significance_categories_ANOVA)
 export(calculate_significance_categories_ttest)
+export(clustering)
+export(getClusterInfos)
 export(prepareData)
 export(prepareTtestData)
 export(pvalue_foldchange_histogram)
 export(ttest)
 export(workflow_ANOVA)
 export(workflow_QC)
+export(workflow_clustering)
 export(workflow_ttest)
 importFrom(magrittr,"%>%")
diff --git a/R/Clustering_Lineplots.R b/R/Clustering_Lineplots.R
@@ -0,0 +1,177 @@
+
+
+
+#' Cluster proteins for similar patterns across samples
+#'
+#' @param D \strong{data.frame} \cr
+#'        Dataframe containing protein intensities.
+#' @param dist_method \strong{character(1)} \cr
+#'        Distance measure to use for the hierarchical clustering. In principle,
+#'        all methods available in \code{\link[amap]{Dist}} are possible, however
+#'        correlation-based metrics like "correlation",  "pearson" or "spearman"
+#'        are recommended. The default is "correlation", which uses the centered
+#'        Pearson correlation.
+#' @param nr_clusters \strong{integer(1)} \cr
+#'        Number of clusters to cut the dendrogram into. If \code{NULL} (default),
+#'        the optimal number of clusters is determined based on silhouette values
+#'        using the \code{\link[dendextend]{find_k}} function.
+#' @param cluster_colours \strong{character vector} \cr
+#'       Colours to use for the different clusters. If \code{NULL} (default),
+#'       the default ggplot color palette is used.
+#' @param colour_dend \strong{logical(1)} \cr
+#'       If \code{TRUE} (default), the branches of the dendrogram are coloured
+#'       according to the clusters, using the defined cluster_colours.
+#'
+#' @returns A list containing the following entries:
+#' \item{row_dend}{The dendrogram object for the rows (proteins).}
+#' \item{nr_clusters}{The number of clusters.}
+#' \item{cluster_colours}{The colours used for the different clusters.}
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' res <- clustering(D, nr_clusters = 4)
+#' }
+clustering <- function(D,
+                       dist_method = "correlation",
+                       nr_clusters = NULL,
+                       cluster_colours = NULL,
+                       colour_dend = TRUE) {
+
+  rownames(D) <- seq_len(nrow(D))  # reset rownames (important to match cluster information later)
+  # hierarchical clustering of the proteins (rows) using the chosen distance measure
+  row_dend <- stats::as.dendrogram(stats::hclust(amap::Dist(D, method = dist_method)))
+
+  if (is.null(nr_clusters)) {
+    # find optimal number of clusters based on silhouette values
+    nr_clusters <- dendextend::find_k(row_dend)$k
+  }
+
+  ## define colours for each cluster (only if not provided by user)
+  if (is.null(cluster_colours)) {
+    cluster_colours <- scales::hue_pal()(nr_clusters)
+  }
+
+  if (colour_dend) {
+    ## colour branches of the dendrogram to plot next to the heatmap
+    row_dend <- dendextend::color_branches(row_dend, k = nr_clusters, col = cluster_colours)
+  }
+  return(list(row_dend = row_dend, nr_clusters = nr_clusters, cluster_colours = cluster_colours))
+}
+
+
+
+
+
+#' Get cluster information from heatmap
+#'
+#' @param heatmap \strong{Heatmap object} \cr
+#'        Heatmap object generated by \code{\link[ComplexHeatmap]{Heatmap}}.
+#' @param nr_clusters \strong{integer(1)} \cr
+#'        Number of clusters to cut the dendrogram into.
+#' @param D \strong{data.frame} \cr
+#'        Dataframe containing only protein intensities.
+#' @param id \strong{data.frame} \cr
+#'       Dataframe containing the ID columns for the parameter D e.g. containing further columns like protein or gene names.
+#'
+#' @returns A data.frame containing the cluster assignment for each protein along with the original ID columns and the intensity values.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' RES_clustering <- getClusterInfos(heatmap, nr_clusters = 4, D = D, id = id)
+#' }
+getClusterInfos <- function(heatmap, nr_clusters, D, id) {
+  ### get cluster for each protein (cluster number from heatmap doesn't
+  ### correspond to apply cutree() on the dendrogram. This is why we need to
+  ### get the cluster number from the heatmap directly).
+  ht_draw <- ComplexHeatmap::draw(heatmap)
+  x <- ComplexHeatmap::row_dend(ht_draw)
+  cluster <- integer(nrow(D))  # rows that are not found in any sub-dendrogram keep the value 0
+  for (j in seq_len(nr_clusters)) {
+    ### the labels of the j-th sub-dendrogram are used as row indices of the cluster members
+    cluster_members <- as.integer(names(dendextend::cutree(x[[j]], 1)))
+    cluster[cluster_members] <- j
+  }
+
+  ### TODO: add z-scores to the table or generate separate table for that.
+  RES_clustering <- cbind(id, cluster = cluster, D)
+
+  return(RES_clustering)
+}
+
+
+
+
+#' Generate Lineplots for each cluster.
+#'
+#' @param D_zscore \strong{data.frame} \cr
+#'       Dataframe containing the z-score normalized protein intensities along
+#'       with a column "cluster" indicating the cluster assignment for each protein.
+#' @param cluster_colours \strong{character vector} \cr
+#'      Colours to use for the different clusters.
+#'
+#' @returns A list of ggplot2 objects, each containing the lineplot for one cluster.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' lineplots <- Lineplots(D_zscore, cluster_colours = c("red", "blue"))
+#' }
+Lineplots <- function(D_zscore, cluster_colours) {
+
+  ### TODO: currently only plots without imputation. This should be changed to allow for imputation as well.
+
+  variable <- value <- ClusterCenter <- id <- NULL # to silence notes while checking the package
+
+  nr_clusters <- max(D_zscore$cluster)
+  lineplots <- vector("list", nr_clusters)
+
+  for (i in seq_len(nr_clusters)) {
+    ## choose only data points from the specific cluster
+    D_tmp <- D_zscore[D_zscore$cluster == i, -c(ncol(D_zscore)), drop = FALSE] # remove cluster column (assumed to be the last column)
+
+    ## calculate mean profile of the cluster
+    mean_profile <- colMeans(D_tmp, na.rm = TRUE)
+
+    ## calculate euclidean distance of each protein to the cluster center
+    Dists_euclidean <- apply(D_tmp, 1, function(x) stats::dist(rbind(x, mean_profile)))
+
+    X <- data.frame(D_tmp, Dists_euclidean, id = seq_len(nrow(D_tmp)))
+    X_long <- reshape2::melt(X, id.vars = c("id", "Dists_euclidean"))
+
+    ## append the mean profile as an additional line; its NA distance marks it as the cluster center
+    X_long <- rbind(X_long, data.frame(id = max(X_long$id) + 1, Dists_euclidean = NA,
+                                       variable = colnames(D_tmp),
+                                       value = mean_profile))
+    X_long <- dplyr::mutate(X_long, ClusterCenter = dplyr::case_when(is.na(Dists_euclidean) ~ "Cluster Center", TRUE ~ "Cluster Members"))
+
+    pl <- ggplot2::ggplot(data = X_long, ggplot2::aes(x = variable, y = value, group = id,
+                                                      colour = Dists_euclidean, linetype = ClusterCenter)) + #, linewidth = ClusterCenter)) +
+      ggplot2::geom_line() + # linewidth = 0.5
+      ggplot2::scale_colour_gradient2(low = "red", mid = "yellow", high = "green", na.value = "black",
+                                      midpoint = 2, limits = c(0, max(X_long$Dists_euclidean, na.rm = TRUE)), name = "Distance \nto center") +
+      ggplot2::scale_linetype_manual(values=c("dotted", "solid"), na.value = "solid", name = "") +
+      ggplot2::scale_linewidth_manual(values=c("Cluster Members" = 0.7, "Cluster Center" = 3), na.value = 1, guide = "none") +
+      ggplot2::xlab("") + ggplot2::ylab("Z-Score") +
+      ggplot2::scale_x_discrete(expand = c(0.03, 0.03)) +
+      ggplot2::ggtitle(paste0("Cluster ", i, " (", nrow(X), " proteins)")) +
+      ggplot2::theme_bw(base_size = 20) +
+      ggplot2::theme(legend.key.width = ggplot2::unit(1.5,"cm"), axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) +
+      ggplot2::guides(linetype = ggplot2::guide_legend(override.aes = list(linewidth = 1.3), order = 1))+
+      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, colour = cluster_colours[i]), legend.position = "bottom")
+
+    lineplots[[i]] <- pl
+  }
+  return(lineplots)
+}
+
+
+
+
+
+
+
+
+
diff --git a/R/Heatmap.R b/R/Heatmap.R
@@ -196,6 +196,7 @@ Heatmap_with_groups <- function(D,
 
   #row.names(data.asmatrix) <- row_labels
 
+  data.asmatrix2 <<- data.asmatrix
 
   ht <- ComplexHeatmap::Heatmap(data.asmatrix,
                 column_title = title,

diff --git a/R/WIP_Clustering_Heatmap_Lineplots.R b/R/WIP_Clustering_Heatmap_Lineplots.R
diff --git a/R/helpers_ttest_ANOVA.R b/R/helpers_ttest_ANOVA.R
@@ -18,16 +18,23 @@
 #'}
 
 prepareTtestData <- function(data_path,
-                             intensity_columns
+                             intensity_columns, remove_missings = FALSE
 ){
 
   D <- openxlsx::read.xlsx(data_path, na.strings = c("NA", "NaN", "Filtered","#NV"))
 
   id <- D[, -intensity_columns, drop = FALSE]
+
   D <- D[, intensity_columns]
 
   D[D == 0] <- NA
 
+  if (remove_missings) {
+    keep <- rowSums(is.na(D)) == 0
+    D <- D[keep, , drop = FALSE]
+    id <- id[keep, , drop = FALSE]
+  }
+
   group <- factor(limma::strsplit2(colnames(D), "_")[,1])
   number_of_groups <- length(levels(group))