Skip to content
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: ProtStatsWF
Title: Statistics Workflows for Proteomics Data
Version: 0.1.1
Version: 0.2.0
Authors@R: c(
person("Karin", "Schork", , "karin.schork@rub.de", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-3756-4347")),
Expand Down
5 changes: 4 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
export(ANOVA)
export(Boxplots)
export(Boxplots_candidates)
export(Clustering_heatmap_lineplots)
export(Heatmap_with_groups)
export(Lineplots)
export(MA_Plots)
export(PCA_Plot)
export(ValidValuePlot)
Expand All @@ -15,11 +15,14 @@ export(add_labels)
export(automatedNormalization)
export(calculate_significance_categories_ANOVA)
export(calculate_significance_categories_ttest)
export(clustering)
export(getClusterInfos)
export(prepareData)
export(prepareTtestData)
export(pvalue_foldchange_histogram)
export(ttest)
export(workflow_ANOVA)
export(workflow_QC)
export(workflow_clustering)
export(workflow_ttest)
importFrom(magrittr,"%>%")
177 changes: 177 additions & 0 deletions R/Clustering_Lineplots.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@



#' Cluster proteins for similar patterns across samples
#'
#' Performs a hierarchical clustering of the rows (proteins) of \code{D},
#' determines the number of clusters (if not given) and optionally colours
#' the branches of the resulting dendrogram by cluster.
#'
#' @param D \strong{data.frame} \cr
#'        Dataframe containing protein intensities.
#' @param dist_method \strong{character(1)} \cr
#'        Distance measure to use for the hierarchical clustering. In principle,
#'        all methods available in \code{\link[amap]{Dist}} are possible, however
#'        correlation-based metrics like "correlation", "pearson" or "spearman"
#'        are recommended. The default is "correlation", which uses the centered
#'        Pearson correlation.
#' @param nr_clusters \strong{integer(1)} \cr
#'        Number of clusters to cut the dendrogram into. If \code{NULL} (default),
#'        the optimal number of clusters is determined based on silhouette values
#'        using the \code{\link[dendextend]{find_k}} function.
#' @param cluster_colours \strong{character vector} \cr
#'        Colours to use for the different clusters. If \code{NULL} (default),
#'        the default ggplot color palette is used.
#' @param colour_dend \strong{logical(1)} \cr
#'        If \code{TRUE} (default), the branches of the dendrogram are coloured
#'        according to the clusters, using the defined cluster_colours.
#'
#' @returns A list containing the following entries:
#' \item{row_dend}{The dendrogram object for the rows (proteins).}
#' \item{nr_clusters}{The number of clusters.}
#' \item{cluster_colours}{The colours used for the different clusters.}
#'
#' @export
#'
#' @examples
#' \dontrun{
#' D <- as.data.frame(matrix(stats::rnorm(60), nrow = 10))
#' res <- clustering(D, nr_clusters = 2)
#' res$nr_clusters
#' }
clustering <- function(D,
                       dist_method = "correlation",
                       nr_clusters = NULL,
                       cluster_colours = NULL,
                       colour_dend = TRUE) {

  # reset rownames (important to match cluster information later):
  # the dendrogram leaves are labelled with these row indices
  rownames(D) <- seq_len(nrow(D))

  # cluster the proteins hierarchically using the requested distance measure
  row_dend <- stats::as.dendrogram(
    stats::hclust(amap::Dist(D, method = dist_method))
  )

  if (is.null(nr_clusters)) {
    # find optimal number of clusters based on silhouette values
    nr_clusters <- dendextend::find_k(row_dend)$k
  }

  # define colours for each cluster (only if not provided by user)
  if (is.null(cluster_colours)) {
    cluster_colours <- scales::hue_pal()(nr_clusters)
  }

  if (colour_dend) {
    # colour branches of the dendrogram to plot next to the heatmap
    row_dend <- dendextend::color_branches(row_dend,
                                           k = nr_clusters,
                                           col = cluster_colours)
  }

  return(list(row_dend = row_dend,
              nr_clusters = nr_clusters,
              cluster_colours = cluster_colours))
}





#' Get cluster information from heatmap
#'
#' Draws the heatmap, reads the row dendrogram(s) from the drawn object and
#' assigns each protein the number of the heatmap cluster it belongs to.
#'
#' @param heatmap \strong{Heatmap object} \cr
#'        Heatmap object generated by \code{\link[ComplexHeatmap]{Heatmap}}.
#' @param nr_clusters \strong{integer(1)} \cr
#'        Number of clusters to cut the dendrogram into.
#' @param D \strong{data.frame} \cr
#'        Dataframe containing only protein intensities.
#' @param id \strong{data.frame} \cr
#'        Dataframe containing the ID columns for the parameter D e.g.
#'        containing further columns like protein or gene names.
#'
#' @returns A data.frame containing the cluster assignment for each protein
#'          along with the original ID columns and the intensity values.
#' @export
getClusterInfos <- function(heatmap, nr_clusters, D, id) {

  # The cluster numbering shown in the heatmap does not correspond to applying
  # cutree() to the full dendrogram. This is why the cluster number is taken
  # from the drawn heatmap directly.
  ht_draw <- ComplexHeatmap::draw(heatmap)
  row_dends <- ComplexHeatmap::row_dend(ht_draw)

  # proteins that are not found in any of the sub-dendrograms keep cluster 0
  cluster <- integer(nrow(D))
  for (j in seq_len(nr_clusters)) {
    # cutting sub-dendrogram j into one group yields all its leaves; the leaf
    # names are converted to integers and used as row indices of D
    # (assumes the leaves are labelled with row numbers -- TODO confirm caller)
    cluster_members <- as.integer(names(dendextend::cutree(row_dends[[j]], 1)))
    cluster[cluster_members] <- j
  }

  ### TODO: add z-scores to the table or generate separate table for that.
  RES_clustering <- cbind(id, cluster = cluster, D)

  return(RES_clustering)
}




#' Generate Lineplots for each cluster.
#'
#' For every cluster one plot is created that shows the z-score profile of
#' each member protein (coloured by its euclidean distance to the cluster
#' mean profile) together with the mean profile of the cluster itself.
#'
#' @param D_zscore \strong{data.frame} \cr
#'        Dataframe containing the z-score normalized protein intensities along
#'        with a column "cluster" indicating the cluster assignment for each
#'        protein. The "cluster" column is identified by name, all other
#'        columns are treated as intensities.
#' @param cluster_colours \strong{character vector} \cr
#'        Colours to use for the different clusters.
#'
#' @returns A list of ggplot2 objects, each containing the lineplot for one
#'          cluster. The list has one entry per cluster number from 1 to the
#'          largest cluster number; entries of cluster numbers without any
#'          protein are \code{NULL}.
#' @export
#'
#' @examples
#' \dontrun{
#' D_zscore <- data.frame(A_1 = c(-1, 0, 1, 0.5),
#'                        A_2 = c(0, 1, -1, 0.5),
#'                        B_1 = c(1, -1, 0, -1),
#'                        cluster = c(1, 1, 2, 2))
#' pl <- Lineplots(D_zscore, cluster_colours = c("red", "blue"))
#' pl[[1]]
#' }
Lineplots <- function(D_zscore, cluster_colours) {

  ### TODO: currently only plots without imputation. This should be changed to allow for imputation as well.

  if (!"cluster" %in% colnames(D_zscore)) {
    stop("D_zscore must contain a column named \"cluster\".", call. = FALSE)
  }

  # to silence notes while checking the package
  variable <- value <- ClusterCenter <- id <- NULL

  cluster_assignment <- D_zscore[["cluster"]]
  nr_clusters <- max(cluster_assignment)

  # select the intensity columns by name instead of assuming that the
  # cluster column is the last one
  is_intensity <- colnames(D_zscore) != "cluster"

  lineplots <- vector("list", nr_clusters)

  for (i in seq_len(nr_clusters)) {

    ## choose only data points from the specific cluster (without cluster column)
    D_tmp <- D_zscore[cluster_assignment == i, is_intensity, drop = FALSE]

    ## nothing to plot for cluster numbers that do not occur in the data
    if (nrow(D_tmp) == 0) {
      next
    }

    ## calculate mean profile of the cluster
    mean_profile <- colMeans(D_tmp, na.rm = TRUE)

    ## calculate euclidean distance of each protein to the cluster center
    Dists_euclidean <- apply(D_tmp, 1, function(x) stats::dist(rbind(x, mean_profile)))

    X <- data.frame(D_tmp, Dists_euclidean, id = seq_len(nrow(D_tmp)))
    X_long <- reshape2::melt(X, id.vars = c("id", "Dists_euclidean"))

    ## append the mean profile as an additional line; its distance is NA,
    ## which is used below to tell the center apart from the members
    X_long <- rbind(X_long,
                    data.frame(id = max(X_long$id) + 1,
                               Dists_euclidean = NA,
                               variable = colnames(D_tmp),
                               value = mean_profile))
    X_long <- dplyr::mutate(
      X_long,
      ClusterCenter = dplyr::case_when(is.na(Dists_euclidean) ~ "Cluster Center",
                                       TRUE ~ "Cluster Members")
    )

    pl <- ggplot2::ggplot(data = X_long,
                          ggplot2::aes(x = variable, y = value, group = id,
                                       colour = Dists_euclidean,
                                       linetype = ClusterCenter)) +
      ggplot2::geom_line() +
      ## the NA distance of the cluster center is drawn in black
      ggplot2::scale_colour_gradient2(low = "red", mid = "yellow", high = "green",
                                      na.value = "black", midpoint = 2,
                                      limits = c(0, max(X_long$Dists_euclidean, na.rm = TRUE)),
                                      name = "Distance \nto center") +
      ggplot2::scale_linetype_manual(values = c("dotted", "solid"),
                                     na.value = "solid", name = "") +
      ## linewidth is currently not mapped in aes(); the scale is kept so that
      ## the mapping can be re-enabled without further changes
      ggplot2::scale_linewidth_manual(values = c("Cluster Members" = 0.7, "Cluster Center" = 3),
                                      na.value = 1, guide = "none") +
      ggplot2::xlab("") +
      ggplot2::ylab("Z-Score") +
      ggplot2::scale_x_discrete(expand = c(0.03, 0.03)) +
      ggplot2::ggtitle(paste0("Cluster ", i, " (", nrow(X), " proteins)")) +
      ggplot2::theme_bw(base_size = 20) +
      ggplot2::theme(legend.key.width = ggplot2::unit(1.5, "cm"),
                     axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust = 1)) +
      ggplot2::guides(linetype = ggplot2::guide_legend(override.aes = list(linewidth = 1.3),
                                                       order = 1)) +
      ## title in the colour of the cluster
      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, colour = cluster_colours[i]),
                     legend.position = "bottom")

    lineplots[[i]] <- pl
  }

  return(lineplots)
}









1 change: 1 addition & 0 deletions R/Heatmap.R
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ Heatmap_with_groups <- function(D,

#row.names(data.asmatrix) <- row_labels

data.asmatrix2 <<- data.asmatrix

ht <- ComplexHeatmap::Heatmap(data.asmatrix,
column_title = title,
Expand Down
150 changes: 0 additions & 150 deletions R/WIP_Clustering_Heatmap_Lineplots.R

This file was deleted.

9 changes: 8 additions & 1 deletion R/helpers_ttest_ANOVA.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,23 @@
#'}

prepareTtestData <- function(data_path,
intensity_columns
intensity_columns, remove_missings = FALSE
){

D <- openxlsx::read.xlsx(data_path, na.strings = c("NA", "NaN", "Filtered","#NV"))

id <- D[, -intensity_columns, drop = FALSE]

D <- D[, intensity_columns]

D[D == 0] <- NA

if (remove_missings) {
keep <- rowSums(is.na(D)) == 0
D <- D[keep, , drop = FALSE]
id <- id[keep, , drop = FALSE]
}

group <- factor(limma::strsplit2(colnames(D), "_")[,1])
number_of_groups <- length(levels(group))

Expand Down
Loading
Loading