diff --git a/DESCRIPTION b/DESCRIPTION index 55d7524..ff68517 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ProtStatsWF Title: Statistics Workflows for Proteomics Data -Version: 0.1.1 +Version: 0.2.0 Authors@R: c( person("Karin", "Schork", , "karin.schork@rub.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-3756-4347")), diff --git a/NAMESPACE b/NAMESPACE index c0f6d97..96b9f4a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,8 +3,8 @@ export(ANOVA) export(Boxplots) export(Boxplots_candidates) -export(Clustering_heatmap_lineplots) export(Heatmap_with_groups) +export(Lineplots) export(MA_Plots) export(PCA_Plot) export(ValidValuePlot) @@ -15,11 +15,14 @@ export(add_labels) export(automatedNormalization) export(calculate_significance_categories_ANOVA) export(calculate_significance_categories_ttest) +export(clustering) +export(getClusterInfos) export(prepareData) export(prepareTtestData) export(pvalue_foldchange_histogram) export(ttest) export(workflow_ANOVA) export(workflow_QC) +export(workflow_clustering) export(workflow_ttest) importFrom(magrittr,"%>%") diff --git a/R/Clustering_Lineplots.R b/R/Clustering_Lineplots.R new file mode 100644 index 0000000..214c362 --- /dev/null +++ b/R/Clustering_Lineplots.R @@ -0,0 +1,177 @@ + + + +#' Cluster proteins for similar patterns across samples +#' +#' @param D \strong{data.frame} \cr +#' Dataframe containing protein intensities. +#' @param dist_method \strong{character(1)} \cr +#' Distance measure to use for the hierarchical clustering. In principle, +#' all methods available in \code{\link[amap]{Dist}} are possible, however +#' correlation-based metrics like "correlation", "pearson" or "spearman" +#' are recommended. The default is "correlation", which uses the centered +#' Pearson correlation. +#' @param nr_clusters \strong{integer(1)} \cr +#' Number of clusters to cut the dendrogram into. 
If \code{NULL} (default),
+#' the optimal number of clusters is determined based on silhouette values
+#' using the \code{\link[dendextend]{find_k}} function.
+#' @param cluster_colours \strong{character vector} \cr
+#' Colours to use for the different clusters. If \code{NULL} (default),
+#' the default ggplot color palette is used.
+#' @param colour_dend \strong{logical(1)} \cr
+#' If \code{TRUE} (default), the branches of the dendrogram are coloured
+#' according to the clusters, using the defined cluster_colours.
+#'
+#' @returns A list containing the following entries:
+#' \item{row_dend}{The dendrogram object for the rows (proteins).}
+#' \item{nr_clusters}{The number of clusters.}
+#' \item{cluster_colours}{The colours used for the different clusters.}
+#'
+#' @export
+#'
+#' @examples
+clustering <- function(D,
+                       dist_method = "correlation",
+                       nr_clusters = NULL,
+                       cluster_colours = NULL,
+                       colour_dend = TRUE) {
+
+  if (nrow(D) < 2) stop("At least two proteins (rows) are needed for clustering.", call. = FALSE)
+
+  rownames(D) <- seq_len(nrow(D)) # reset rownames so that dendrogram labels are row indices (important to match cluster information later)
+  # hierarchical clustering of the proteins; dist_method defines the distance (default: centered Pearson correlation)
+  row_dend <- stats::as.dendrogram(stats::hclust(amap::Dist(D, method = dist_method)))
+
+  if (is.null(nr_clusters)) {
+    # find optimal number of clusters based on silhouette values
+    nr_clusters <- dendextend::find_k(row_dend)$k
+  }
+
+  ## define colours for each cluster (only if not provided by user)
+  if (is.null(cluster_colours)) {
+    cluster_colours <- scales::hue_pal()(nr_clusters)
+  }
+
+  if (colour_dend) {
+    ## colour branches of the dendrogram to plot next to the heatmap
+    row_dend <- dendextend::color_branches(row_dend, k = nr_clusters, col = cluster_colours)
+  }
+
+  return(list(row_dend = row_dend, nr_clusters = nr_clusters, cluster_colours = cluster_colours))
+}
+
+
+
+
+
+#' Get cluster information from heatmap
+#'
+#' @param heatmap \strong{Heatmap object} \cr
+#' Heatmap object generated by \code{\link[ComplexHeatmap]{Heatmap}}.
+#' @param nr_clusters \strong{integer(1)} \cr
+#' Number of clusters to cut the dendrogram into.
+#' @param D \strong{data.frame} \cr
+#' Dataframe containing only protein intensities.
+#' @param id \strong{data.frame} \cr
+#' Dataframe containing the ID columns for the parameter D e.g. containing further columns like protein or gene names.
+#'
+#' @returns A data.frame containing the cluster assignment for each protein along with the original ID columns and the intensity values.
+#' @export
+#'
+#' @examples
+getClusterInfos <- function(heatmap, nr_clusters, D, id) {
+  ### get cluster for each protein (cluster number from heatmap doesn't correspond to apply cutree() on the dendrogram. This is why we need to get the cluster number from the heatmap directly).
+  ht_draw <- ComplexHeatmap::draw(heatmap) # NOTE: this draws the heatmap on the current graphics device
+  x <- ComplexHeatmap::row_dend(ht_draw)
+  cluster <- integer(nrow(D)) # 0 = not (yet) assigned to any cluster
+  for (j in seq_len(nr_clusters)) {
+    cluster_members <- as.integer(names(dendextend::cutree(x[[j]],1))) ### get cluster members (dendrogram labels are row indices of D)
+    cluster[cluster_members] <- j
+  }
+
+  if (any(cluster == 0L)) {
+    warning("Some proteins could not be assigned to a cluster.", call. = FALSE)
+  }
+
+  ### TODO: add z-scores to the table or generate separate table for that.
+  RES_clustering <- cbind(id, cluster = cluster, D)
+
+  return(RES_clustering)
+
+
+}
+
+
+
+
+#' Generate Lineplots for each cluster.
+#'
+#' @param D_zscore \strong{data.frame} \cr
+#' Dataframe containing the z-score normalized protein intensities along
+#' with a column "cluster" indicating the cluster assignment for each protein.
+#' @param cluster_colours \strong{character vector} \cr
+#' Colours to use for the different clusters.
+#'
+#' @returns A list of ggplot2 objects, each containing the lineplot for one cluster.
+#' @export
+#'
+#' @examples
+Lineplots <- function(D_zscore, cluster_colours) {
+
+  ### TODO: currently only plots without imputation. This should be changed to allow for imputation as well.
+
+  nr_clusters <- max(D_zscore$cluster)
+
+  lineplots <- vector("list", nr_clusters)
+
+  for (i in seq_len(nr_clusters)) {
+
+    ## choose only data points from the specific cluster
+    D_tmp <- D_zscore[D_zscore$cluster == i, colnames(D_zscore) != "cluster", drop = FALSE] # remove cluster column (by name, independent of its position)
+
+    ## calculate mean profile of the cluster
+    mean_profile <- colMeans(D_tmp, na.rm = TRUE)
+
+    ## calculate euclidean distance of each protein to the cluster center
+    Dists_euclidean <- apply(D_tmp, 1, function(x) stats::dist(rbind(x, mean_profile)))
+
+    X <- data.frame(D_tmp, Dists_euclidean, id = seq_len(nrow(D_tmp)))
+    X_long <- reshape2::melt(X, id.vars = c("id", "Dists_euclidean"))
+
+    ## append the cluster center as an additional line (marked by a missing distance)
+    X_long <- rbind(X_long, data.frame(id = max(X_long$id) + 1, Dists_euclidean = NA,
+                                       variable = colnames(D_tmp), value = mean_profile))
+    X_long <- dplyr::mutate(X_long, ClusterCenter = dplyr::case_when(is.na(Dists_euclidean) ~ "Cluster Center", TRUE ~ "Cluster Members"))
+
+
+    variable <- value <- ClusterCenter <- id <- NULL # to silence notes while checking the package
+
+    pl <- ggplot2::ggplot(data = X_long, ggplot2::aes(x = variable, y = value, group = id,
+                                                      colour = Dists_euclidean, linetype = ClusterCenter)) + #, linewidth = ClusterCenter)) +
+      ggplot2::geom_line() + # linewidth = 0.5
+      ggplot2::scale_colour_gradient2(low = "red", mid = "yellow", high = "green", na.value = "black",
+                                      midpoint = 2, limits = c(0, max(X_long$Dists_euclidean, na.rm = TRUE)), name = "Distance \nto center") +
+      ggplot2::scale_linetype_manual(values=c("dotted", "solid"), na.value = "solid", name = "") +
+      ggplot2::scale_linewidth_manual(values=c("Cluster Members" = 0.7, "Cluster Center" = 3), na.value = 1, guide = "none") +
+      ggplot2::xlab("") + ggplot2::ylab("Z-Score") +
+      ggplot2::scale_x_discrete(expand = c(0.03, 0.03)) +
+      ggplot2::ggtitle(paste0("Cluster ", i, " (", nrow(X), " proteins)")) +
+      ggplot2::theme_bw(base_size = 20) +
+      ggplot2::theme(legend.key.width = ggplot2::unit(1.5,"cm"), axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) +
+      ggplot2::guides(linetype = ggplot2::guide_legend(override.aes = list(linewidth = 1.3), order = 1))+
+      ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, colour = cluster_colours[i]), legend.position = "bottom")
+
+    lineplots[[i]] <- pl
+  }
+  return(lineplots)
+
+}
+
+
+
+
+
+
+
+
+
diff --git a/R/WIP_Clustering_Heatmap_Lineplots.R b/R/WIP_Clustering_Heatmap_Lineplots.R
deleted file mode 100644
index 1c2ed65..0000000
--- a/R/WIP_Clustering_Heatmap_Lineplots.R
+++ /dev/null
@@ -1,150 +0,0 @@
-
-#' Clustering, Heatmap and Lineplots
-#'
-#' @param D **data.frame** \cr Dataframe with log-transformed protein intensities, e.g. filtered for significant proteins form the ANOVA or t-test results.
-#' @param id **data.frame** \cr dataframe with id information e.g. protein names, gene names, accessions etc.
-#' @param output_path **char(1)** \cr Path where results will be saved.
-#' @param suffix **char(1)** \cr Suffix for the file names, should start with a underscore.
-#' @param nr_clusters **int(1)** \cr Number of clusters. Default is NULL, meaning that the optimal number of clusters will be determined by [dendextend::find_k()].
-#' @param row_split **logi(1)** \cr If TRUE, there will be space between row clusters in the heatmap.
-#' @param dist_method **char(1)** \cr distance method for clustering, default is "correlation" (centered Pearson correlation) -#' -#' @return save heatmap and data frame with cluster information, as well as line plots -#' -#' @export -#' -#' @examples # TODO -Clustering_heatmap_lineplots <- function(D, - id, - output_path, - suffix = "", - nr_clusters = NULL, - row_split = TRUE, - dist_method = "correlation", - plot_height_heatmap = 15, - plot_width_heatmap = 15, - plot_height_lineplot= 10, - plot_width_lineplot = 15) { - - ### TODO: What to do with NAs - ### TODO: what to do with constant rows (may happen for extremely low of high abundant proteins) - - id_columns <- 1:ncol(id) - - rownames(D) <- 1:nrow(D) # reset rownames (important to match cluster information later) - # cluster the proteins with centered Pearson correlation as distance function - row_dend <- stats::as.dendrogram(stats::hclust(amap::Dist(D, method = dist_method))) - - if (is.null(nr_clusters)) { - # find optimal number of clusters based on silhouette values - nr_clusters <- dendextend::find_k(row_dend)$k - } - - ## define colours for each cluster - cluster_colours <- scales::hue_pal()(nr_clusters) - ## colour branches of the dendrogram to plot next to the heatmap - row_dend_color = dendextend::color_branches(row_dend, k = nr_clusters, col = cluster_colours) - - if (row_split) { - row_split = nr_clusters - } else { - row_split = NULL - } - - - ht <- ProtStatsWF::Heatmap_with_groups(D = D, - id = id, - # TODO: no filtering at the moment but it may be necessary/useful depending on the data - filtermissings = ncol(D), - cluster_rows = row_dend_color, - cluster_columns = FALSE, - log_data = FALSE, - #output_path = output_path, - #suffix = paste(suffix, nr_clusters, sep = "_"), - ### TODO: allow omitting rows with missing values - na_method = "impute", - row_split = nr_clusters, - row_gap = grid::unit(5, "mm")) - - grDevices::png(paste0(output_path, "/heatmap", suffix, "_", nr_clusters, ".png"), - 
height = plot_height_heatmap, - width = plot_width_heatmap, units = "cm", res = 300) - graphics::plot(ht)#[["heatmap"]]) - grDevices::dev.off() - - - ### get cluster for each protein (cluster number from heatmap doesn't correspond to apply cutree() on the dendrogram. This is why we need to get the cluster number from the heatmap directly). - ht_draw <- ComplexHeatmap::draw(ht)#$heatmap) - x <- ComplexHeatmap::row_dend(ht_draw) - cluster <- integer(nrow(D)) - for (j in 1:nr_clusters) { - cluster_members <- as.integer(names(dendextend::cutree(x[[j]],1))) ### get cluster members - cluster[cluster_members] <- j - } - - - ### write table with cluster results - ### TODO: add z-scores to the table - RES_clustering <- cbind(id, cluster = cluster, D) - openxlsx::write.xlsx(RES_clustering, paste0(output_path, "/cluster_table", suffix, "_", nr_clusters, ".xlsx")) - - - ############### - ### draw lineplot for each cluster, coloured by distance to the cluster center - - D_zscore <- cbind(ht@matrix, cluster = cluster) - #id_columns <- 1:ncol(id) - ### TODO: currently only plots without imputation. This should be changed to allow for imputation as well. 
- - grDevices::pdf(paste0(output_path, "/Lineplots", suffix, "_", nr_clusters, ".pdf"), - width = plot_width_lineplot, - height = plot_height_lineplot) - - for (i in 1:nr_clusters) { - - ## choose only data points from the specific cluster - D_tmp <- D_zscore[cluster == i, -c(ncol(D_zscore)), drop = FALSE] # remove id columns and cluster column - - ## calculate mean profile of the cluster - mean_profile <- colMeans(D_tmp, na.rm = TRUE) - - ## calculate euclidean distance of each protein to the cluster center - Dists_euclidean <- apply(D_tmp, 1, function(x) stats::dist(rbind(x, mean_profile))) - - X <- data.frame(D_tmp, Dists_euclidean, id = 1:nrow(D_tmp)) - X_long <- reshape2::melt(X, id.vars = c("id", "Dists_euclidean")) - - X_long <- rbind(X_long, data.frame(id = max(X_long$id) + 1, Dists_euclidean = NA, - variable = colnames(D_tmp), - value = mean_profile)) - X_long <- dplyr::mutate(X_long, ClusterCenter = dplyr::case_when(is.na(Dists_euclidean) ~ "Cluster Center", TRUE ~ "Cluster Members")) - - - variable <- value <- ClusterCenter <- NULL # to silence notes while checking the package - - pl <- ggplot2::ggplot(data = X_long, ggplot2::aes(x = variable, y = value, group = id, - colour = Dists_euclidean, linetype = ClusterCenter)) + #, linewidth = ClusterCenter)) + - ggplot2::geom_line() + # linewidth = 0.5 - ggplot2::scale_colour_gradient2(low = "red", mid = "yellow", high = "green", na.value = "black", - midpoint = 2, limits = c(0, max(X_long$Dists_euclidean, na.rm = TRUE)), name = "Distance \nto center") + - ggplot2::scale_linetype_manual(values=c("dotted", "solid"), na.value = "solid", name = "") + - ggplot2::scale_linewidth_manual(values=c("Cluster Members" = 0.7, "Cluster Center" = 3), na.value = 1, guide = "none") + - ggplot2::xlab("") + ggplot2::ylab("Z-Score") + - ggplot2::scale_x_discrete(expand = c(0.03, 0.03)) + - ggplot2::ggtitle(paste0("Cluster ", i, " (", nrow(X), " proteins)")) + - ggplot2::theme_bw(base_size = 20) + - 
ggplot2::theme(legend.key.width = ggplot2::unit(1.5,"cm"), axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) + - ggplot2::guides(linetype = ggplot2::guide_legend(override.aes = list(linewidth = 1.3), order = 1))+ - ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, colour = cluster_colours[i]), legend.position = "bottom") - - print(pl) - - - } - - grDevices::dev.off() - -} - - - diff --git a/R/helpers_ttest_ANOVA.R b/R/helpers_ttest_ANOVA.R index c446c2e..075c851 100644 --- a/R/helpers_ttest_ANOVA.R +++ b/R/helpers_ttest_ANOVA.R @@ -18,16 +18,23 @@ #'} prepareTtestData <- function(data_path, - intensity_columns + intensity_columns, remove_missings = FALSE ){ D <- openxlsx::read.xlsx(data_path, na.strings = c("NA", "NaN", "Filtered","#NV")) id <- D[, -intensity_columns, drop = FALSE] + D <- D[, intensity_columns] D[D == 0] <- NA + if (remove_missings) { + keep <- rowSums(is.na(D)) == 0 + D <- D[keep, , drop = FALSE] + id <- id[keep, , drop = FALSE] + } + group <- factor(limma::strsplit2(colnames(D), "_")[,1]) number_of_groups <- length(levels(group)) diff --git a/R/workflow_clustering.R b/R/workflow_clustering.R new file mode 100644 index 0000000..d88ccc4 --- /dev/null +++ b/R/workflow_clustering.R @@ -0,0 +1,137 @@ + + +#' Workflow that clusters proteins for similar patterns over the samples and +#' produces a heatmap and lineplots +#' +#' @param data_path \strong{character} \cr +#' The path to an .xlsx file containing the input data. +#' @param output_path \strong{character} \cr +#' The path to the output folder. +#' @param intensity_columns \strong{integer vector} \cr +#' The numbers of the intensity columns in the table. +#' @param nr_clusters \strong{integer(1)} \cr +#' Number of clusters to cut the dendrogram into. If \code{NULL} (default), +#' the optimal number of clusters is determined based on silhouette values +#' using the \code{\link[dendextend]{find_k}} function. 
+#' @param cluster_colours \strong{character vector} \cr +#' Colours to use for the different clusters. If \code{NULL} (default), +#' the default ggplot color palette is used. +#' @param row_split \strong{logical(1)} \cr If TRUE, there will be space between row clusters in the heatmap. +#' @param dist_method \strong{character(1)} \cr +#' Distance measure to use for the hierarchical clustering. In principle, +#' all methods available in \code{\link[amap]{Dist}} are possible, however +#' correlation-based metrics like "correlation", "pearson" or "spearman" +#' are recommended. The default is "correlation", which uses the centered +#' Pearson correlation. +#' @param colour_dend \strong{logical(1)} \cr +#' If \code{TRUE} (default), the branches of the dendrogram are coloured +#' according to the clusters, using the defined cluster_colours. +#' +#' @param suffix \strong{character} \cr +#' The suffix for the output files. It needs to start with an underscore. +#' @param plot_height_heatmap \strong{numeric} \cr +#' The height for the heatmap in cm. +#' @param plot_width_heatmap \strong{numeric} \cr +#' The width for the heatmap in cm. +#' @param plot_height_lineplot \strong{numeric} \cr +#' The height for the lineplots in cm. +#' @param plot_width_lineplot \strong{numeric} \cr +#' The width for the lineplots in cm. +#' @param plot_dpi \strong{numeric} \cr +#' The plot resolution for the heatmap. +#' @param column_name_protein \strong{character(1)} \cr +#' The name of the column containing the protein identifiers. +#' @param ... Additional parameters passed to \code{\link[ProtStatsWF]{Heatmap_with_groups}}. +#' +#' @returns Nothing, but saves a heatmap, a set of lineplots (one per cluster) +#' and a cluster table to the output folder. 
+#' @export
+#'
+#' @examples
+workflow_clustering <- function(data_path,
+                                output_path,
+                                intensity_columns,
+
+                                nr_clusters = NULL,
+                                cluster_colours = NULL,
+                                row_split = TRUE,
+                                dist_method = "correlation",
+                                colour_dend = TRUE,
+
+                                suffix = "",
+                                plot_height_heatmap = 15,
+                                plot_width_heatmap = 15,
+                                plot_height_lineplot= 20,
+                                plot_width_lineplot = 25,
+                                plot_dpi = 300,
+
+                                column_name_protein = "Protein",
+                                ...) {
+
+  ### TODO: What to do with NAs
+  ### TODO: what to do with constant rows (may happen for extremely low of high abundant proteins)
+
+  ### TODO: option to aggregate data by group before clustering
+
+  #### Prepare Data ####
+  dataPrep <- prepareTtestData(data_path = data_path , intensity_columns = intensity_columns,
+                               remove_missings = TRUE)
+
+  #### Clustering and Heatmap ####
+
+  clust <- clustering(dataPrep$D,
+                      dist_method = dist_method,
+                      nr_clusters = nr_clusters,
+                      cluster_colours = cluster_colours,
+                      colour_dend = colour_dend)
+
+
+  ht <- ProtStatsWF::Heatmap_with_groups(D = dataPrep$D,
+                                         id = dataPrep$id,
+                                         # TODO: no filtering at the moment but it may be necessary/useful depending on the data
+                                         filtermissings = ncol(dataPrep$D),
+                                         cluster_rows = clust$row_dend,
+                                         cluster_columns = FALSE,
+                                         log_data = FALSE,
+                                         ### TODO: allow omitting rows with missing values
+                                         na_method = "impute",
+                                         row_split = clust$nr_clusters, # always split: getClusterInfos() reads the clusters from the heatmap slices
+                                         row_gap = grid::unit(if (isTRUE(row_split)) 5 else 0, "mm"), # row_split only controls the visible space between clusters
+                                         ...)
+
+  grDevices::png(paste0(output_path, "/heatmap", suffix, "_", clust$nr_clusters, ".png"),
+                 height = plot_height_heatmap,
+                 width = plot_width_heatmap, units = "cm", res = plot_dpi)
+  graphics::plot(ht)
+  grDevices::dev.off()
+
+
+  clusterInfo <- getClusterInfos(heatmap = ht, nr_clusters = clust$nr_clusters, D = dataPrep$D, id = dataPrep$id)
+  openxlsx::write.xlsx(clusterInfo, paste0(output_path, "/cluster_table", suffix, "_", clust$nr_clusters, ".xlsx"))
+
+  D_zscore <- data.frame(ht@matrix, cluster = clusterInfo$cluster)
+
+  #### Lineplots ####
+
+  lineplots <- Lineplots(D_zscore = D_zscore, cluster_colours = clust$cluster_colours)
+
+  grDevices::pdf(paste0(output_path, "/Lineplots", suffix, "_", clust$nr_clusters, ".pdf"),
+                 width = plot_width_lineplot/2.54, # pdf() expects inches, parameters are given in cm
+                 height = plot_height_lineplot/2.54)
+
+  for (pl in lineplots) {
+    print(pl) # one page per cluster
+  }
+
+  grDevices::dev.off()
+
+  return(invisible(NULL))
+
+}
+
+
+
+
+
+
+
diff --git a/R/workflow_ttest.R b/R/workflow_ttest.R
index b63f35e..d26b7ea 100644
--- a/R/workflow_ttest.R
+++ b/R/workflow_ttest.R
@@ -43,7 +43,7 @@
 #' The minimum number of valid values to be an on protein.
 #'
 #' @param suffix \strong{character} \cr
-#' The suffix of the file names should have one.
+#' The suffix for the output files. It needs to start with an underscore.
 #' @param plot_device \strong{character} \cr
 #' The type of the output file, e.g. "pdf" or "png".
#' @param plot_height \strong{numeric} \cr diff --git a/man/Clustering_heatmap_lineplots.Rd b/man/Clustering_heatmap_lineplots.Rd deleted file mode 100644 index c148819..0000000 --- a/man/Clustering_heatmap_lineplots.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/WIP_Clustering_Heatmap_Lineplots.R -\name{Clustering_heatmap_lineplots} -\alias{Clustering_heatmap_lineplots} -\title{Clustering, Heatmap and Lineplots} -\usage{ -Clustering_heatmap_lineplots( - D, - id, - output_path, - suffix = "", - nr_clusters = NULL, - row_split = TRUE, - dist_method = "correlation", - plot_height_heatmap = 15, - plot_width_heatmap = 15, - plot_height_lineplot = 10, - plot_width_lineplot = 15 -) -} -\arguments{ -\item{D}{\strong{data.frame} \cr Dataframe with log-transformed protein intensities, e.g. filtered for significant proteins form the ANOVA or t-test results.} - -\item{id}{\strong{data.frame} \cr dataframe with id information e.g. protein names, gene names, accessions etc.} - -\item{output_path}{\strong{char(1)} \cr Path where results will be saved.} - -\item{suffix}{\strong{char(1)} \cr Suffix for the file names, should start with a underscore.} - -\item{nr_clusters}{\strong{int(1)} \cr Number of clusters. 
Default is NULL, meaning that the optimal number of clusters will be determined by \code{\link[dendextend:find_k]{dendextend::find_k()}}.} - -\item{row_split}{\strong{logi(1)} \cr If TRUE, there will be space between row clusters in the heatmap.} - -\item{dist_method}{\strong{char(1)} \cr distance method for clustering, default is "correlation" (centered Pearson correlation)} -} -\value{ -save heatmap and data frame with cluster information, as well as line plots -} -\description{ -Clustering, Heatmap and Lineplots -} -\examples{ -# TODO -} diff --git a/man/Lineplots.Rd b/man/Lineplots.Rd new file mode 100644 index 0000000..71d0150 --- /dev/null +++ b/man/Lineplots.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Clustering_Lineplots.R +\name{Lineplots} +\alias{Lineplots} +\title{Generate Lineplots for each cluster.} +\usage{ +Lineplots(D_zscore, cluster_colours) +} +\arguments{ +\item{D_zscore}{\strong{data.frame} \cr +Dataframe containing the z-score normalized protein intensities along +with a column "cluster" indicating the cluster assignment for each protein.} + +\item{cluster_colours}{\strong{character vector} \cr +Colours to use for the different clusters.} +} +\value{ +A list of ggplot2 objects, each containing the lineplot for one cluster. +} +\description{ +Generate Lineplots for each cluster. +} diff --git a/man/PCA_Plot.Rd b/man/PCA_Plot.Rd index 78fa998..55c165f 100644 --- a/man/PCA_Plot.Rd +++ b/man/PCA_Plot.Rd @@ -6,6 +6,7 @@ \usage{ PCA_Plot( D, + id = NULL, groupvar1 = NULL, groupvar2 = NULL, impute = FALSE, @@ -32,6 +33,9 @@ PCA_Plot( \item{D}{\strong{data.frame} \cr The data set containing intensities of the sample.} +\item{id}{\strong{data.frame} \cr +The corresponding ID columns for the parameter D e.g. 
containing further columns like protein or gene names} + \item{groupvar1}{\strong{character vector} \cr The variable used for colors.} diff --git a/man/clustering.Rd b/man/clustering.Rd new file mode 100644 index 0000000..f40b9be --- /dev/null +++ b/man/clustering.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Clustering_Lineplots.R +\name{clustering} +\alias{clustering} +\title{Cluster proteins for similar patterns across samples} +\usage{ +clustering( + D, + dist_method = "correlation", + nr_clusters = NULL, + cluster_colours = NULL, + colour_dend = TRUE +) +} +\arguments{ +\item{D}{\strong{data.frame} \cr +Dataframe containing protein intensities.} + +\item{dist_method}{\strong{character(1)} \cr +Distance measure to use for the hierarchical clustering. In principle, +all methods available in \code{\link[amap]{Dist}} are possible, however +correlation-based metrics like "correlation", "pearson" or "spearman" +are recommended. The default is "correlation", which uses the centered +Pearson correlation.} + +\item{nr_clusters}{\strong{integer(1)} \cr +Number of clusters to cut the dendrogram into. If \code{NULL} (default), +the optimal number of clusters is determined based on silhouette values +using the \code{\link[dendextend]{find_k}} function.} + +\item{cluster_colours}{\strong{character vector} \cr +Colours to use for the different clusters. 
If \code{NULL} (default), +the default ggplot color palette is used.} + +\item{colour_dend}{\strong{logical(1)} \cr +If \code{TRUE} (default), the branches of the dendrogram are coloured +according to the clusters, using the defined cluster_colours.} +} +\value{ +A list containing the following entries: +\item{row_dend}{The dendrogram object for the rows (proteins).} +\item{nr_clusters}{The number of clusters.} +\item{cluster_colours}{The colours used for the different clusters.} +} +\description{ +Cluster proteins for similar patterns across samples +} diff --git a/man/filter_PCA_data.Rd b/man/filter_PCA_data.Rd index 68b59cf..1d16d28 100644 --- a/man/filter_PCA_data.Rd +++ b/man/filter_PCA_data.Rd @@ -4,12 +4,21 @@ \alias{filter_PCA_data} \title{A method for filtering the data for PCA.} \usage{ -filter_PCA_data(D, impute = FALSE, impute_method = "mean", propNA = 0) +filter_PCA_data( + D, + id = NULL, + impute = FALSE, + impute_method = "mean", + propNA = 0 +) } \arguments{ \item{D}{\strong{data.frame} \cr The data set containing intensities of the sample.} +\item{id}{\strong{data.frame} \cr +The corresponding ID columns for the parameter D e.g. 
containing further columns like protein or gene names} + \item{impute}{\strong{logical} \cr If \code{TRUE}, missing values will be imputed.} diff --git a/man/getClusterInfos.Rd b/man/getClusterInfos.Rd new file mode 100644 index 0000000..2c9f228 --- /dev/null +++ b/man/getClusterInfos.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Clustering_Lineplots.R +\name{getClusterInfos} +\alias{getClusterInfos} +\title{Get cluster information from heatmap} +\usage{ +getClusterInfos(heatmap, nr_clusters, D, id) +} +\arguments{ +\item{heatmap}{\strong{Heatmap object} \cr +Heatmap object generated by \code{\link[ComplexHeatmap]{Heatmap}}.} + +\item{nr_clusters}{\strong{integer(1)} \cr +Number of clusters to cut the dendrogram into.} + +\item{D}{\strong{data.frame} \cr +Dataframe containing only protein intensities.} + +\item{id}{\strong{data.frame} \cr +Dataframe containing the ID columns for the parameter D e.g. containing further columns like protein or gene names.} +} +\value{ +A data.frame containing the cluster assignment for each protein along with the original ID columns and the intensity values. 
+} +\description{ +Get cluster information from heatmap +} diff --git a/man/workflow_clustering.Rd b/man/workflow_clustering.Rd new file mode 100644 index 0000000..0699b2f --- /dev/null +++ b/man/workflow_clustering.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/workflow_clustering.R +\name{workflow_clustering} +\alias{workflow_clustering} +\title{Workflow that clusters proteins for similar patterns over the samples and +produces a heatmap and lineplots} +\usage{ +workflow_clustering( + data_path, + output_path, + intensity_columns, + nr_clusters = NULL, + cluster_colours = NULL, + row_split = TRUE, + dist_method = "correlation", + colour_dend = TRUE, + suffix = "", + plot_height_heatmap = 15, + plot_width_heatmap = 15, + plot_height_lineplot = 20, + plot_width_lineplot = 25, + plot_dpi = 300, + column_name_protein = "Protein", + ... +) +} +\arguments{ +\item{data_path}{\strong{character} \cr +The path to an .xlsx file containing the input data.} + +\item{output_path}{\strong{character} \cr +The path to the output folder.} + +\item{intensity_columns}{\strong{integer vector} \cr +The numbers of the intensity columns in the table.} + +\item{nr_clusters}{\strong{integer(1)} \cr +Number of clusters to cut the dendrogram into. If \code{NULL} (default), +the optimal number of clusters is determined based on silhouette values +using the \code{\link[dendextend]{find_k}} function.} + +\item{cluster_colours}{\strong{character vector} \cr +Colours to use for the different clusters. If \code{NULL} (default), +the default ggplot color palette is used.} + +\item{row_split}{\strong{logical(1)} \cr If TRUE, there will be space between row clusters in the heatmap.} + +\item{dist_method}{\strong{character(1)} \cr +Distance measure to use for the hierarchical clustering. 
In principle, +all methods available in \code{\link[amap]{Dist}} are possible, however +correlation-based metrics like "correlation", "pearson" or "spearman" +are recommended. The default is "correlation", which uses the centered +Pearson correlation.} + +\item{colour_dend}{\strong{logical(1)} \cr +If \code{TRUE} (default), the branches of the dendrogram are coloured +according to the clusters, using the defined cluster_colours.} + +\item{suffix}{\strong{character} \cr +The suffix for the output files. It needs to start with an underscore.} + +\item{plot_height_heatmap}{\strong{numeric} \cr +The height for the heatmap in cm.} + +\item{plot_width_heatmap}{\strong{numeric} \cr +The width for the heatmap in cm.} + +\item{plot_height_lineplot}{\strong{numeric} \cr +The height for the lineplots in cm.} + +\item{plot_width_lineplot}{\strong{numeric} \cr +The width for the lineplots in cm.} + +\item{plot_dpi}{\strong{numeric} \cr +The plot resolution for the heatmap.} + +\item{column_name_protein}{\strong{character(1)} \cr +The name of the column containing the protein identifiers.} + +\item{...}{Additional parameters passed to \code{\link[ProtStatsWF]{Heatmap_with_groups}}.} +} +\value{ +Nothing, but saves a heatmap, a set of lineplots (one per cluster) +and a cluster table to the output folder. +} +\description{ +Workflow that clusters proteins for similar patterns over the samples and +produces a heatmap and lineplots +} diff --git a/man/workflow_ttest.Rd b/man/workflow_ttest.Rd index 1bca49a..d4b2627 100644 --- a/man/workflow_ttest.Rd +++ b/man/workflow_ttest.Rd @@ -63,7 +63,7 @@ The maximum number of valid values to be an off protein.} The minimum number of valid values to be an on protein.} \item{suffix}{\strong{character} \cr -The suffix of the file names should have one.} +The suffix for the output files. It needs to start with an underscore.} \item{plot_device}{\strong{character} \cr The type of the output file, e.g. "pdf" or "png".}