diff --git a/docs/source/Combination_Kidney_IgAN.md b/docs/source/Combination_Kidney_IgAN.md index 8f32d94..5752119 100644 --- a/docs/source/Combination_Kidney_IgAN.md +++ b/docs/source/Combination_Kidney_IgAN.md @@ -1,9 +1,10 @@ -### Trajectory Analysis of Kidney IgAN Data with PILOT +### Trajectory Analysis and Integration of Modalities using Kidney IgAN (Pathomics) Data with PILOT
PILOT Welcome to the PILOT Package Tutorial for pathomics Data! +With this tutorial, we learn not only how to analyze the Pathomics Data but also how to integrate multimodal data with PILOT. You can find the pathomics data [here](https://github.com/CostaLab/PILOT/tree/main/Tutorial/Datasets). @@ -15,7 +16,7 @@ import pilotpy as pl import scanpy as sc ``` -#### Kidney_IgAN Tubuli +#### Kidney_IgAN Tubuli (first modality) ##### Reading Anndata
@@ -93,7 +94,7 @@ pl.pl.trajectory(adata_T, colors = ['red','blue','orange']) -#### Kidney_IgAN Glomeruli +#### Kidney_IgAN Glomeruli (second modality) ##### Reading Anndata
@@ -144,9 +145,9 @@ pl.pl.trajectory(adata_G, colors = ['red','blue','orange']) -#### Combination: +#### Integration of modalities:
-Here, we combine the distances of samples. We get the sum of distances of samples based on Tubuli and Glomeruli distances. +Here, we integrate the distances from the first (Tubuli) and second (Glomeruli) modalities. We get the sum of the distances of samples.
diff --git a/docs/source/Myocardial_infarction.md b/docs/source/Myocardial_infarction.md index 2082c88..4197429 100644 --- a/docs/source/Myocardial_infarction.md +++ b/docs/source/Myocardial_infarction.md @@ -152,505 +152,6 @@ for cell in adata.uns['cellnames']: plot_genes = False) ``` -##### Cluster Specific Marker Changes: -
-The previous test only finds genes with significant changes over time for a given cell type. However, it does not consider if a similar pattern and expression values are found in other clusters. To further select genes, we use a Wald test that compares the fit of the gene in the cluster vs. the fit of the gene in other clusters. -In the code below, we consider top genes (regarding the regression fit) for two interesting cell types discussed in the manuscript (‘healthy CM’ and ‘Myofib’). -
- - -```python -pl.tl.gene_cluster_differentiation(adata,cellnames = ['healthy_CM','Myofib'], number_genes = 70) -``` - - - - - -
-Test results are saved in ‘gene_clusters_stats_extend.csv’. To find a final list of genes, we only consider genes with a fold change higher than 0.5, i.e. genes which expression is increased in the cluster at hand; and we sort the genes based on the Wald test p-value. These can be seen bellow. -
- - -```python -df = pl.tl.results_gene_cluster_differentiation(cluster_name = 'Myofib',).head(50) -df.head(15) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
geneclusterwaldStatpvalueFCExpression patternfit-pvaluefit-mod-rsquared
2642GAS7Myofib212.4772928.487275e-461.086644linear up quadratic down1.873033e-1070.570704
2151EXT1Myofib125.3831285.344198e-270.786136linear up quadratic down3.159831e-350.555757
4979PKNOX2Myofib89.7387122.492742e-190.855504quadratic down1.039404e-1170.544122
2529FN1Myofib70.6416963.110595e-151.573680linear down quadratic up2.947389e-1880.633774
1437COL6A3Myofib54.7511697.758841e-121.069156linear down quadratic up3.514298e-1720.608543
5775RORAMyofib52.4862952.359167e-110.899459quadratic down7.232834e-1740.587234
2832GXYLT2Myofib24.2471132.218154e-052.000205linear up quadratic down2.402171e-850.537920
3783MGPMyofib23.2444183.591226e-050.871041quadratic down1.327779e-2250.571374
4726PCDH9Myofib20.4396461.376052e-040.604830linear down0.000000e+000.596035
1231CHD9Myofib20.3895641.409364e-040.527488linear up quadratic down7.658862e-770.559604
1710DCNMyofib19.6563071.999818e-041.033697linear up quadratic down1.866152e-2840.588602
2824GSNMyofib18.0156124.366007e-040.638136linear up quadratic down2.942472e-2790.601684
1392COL3A1Myofib17.2764796.199787e-041.240454linear down quadratic up0.000000e+000.665616
1372COL1A2Myofib14.0688162.812963e-031.327753linear down quadratic up0.000000e+000.655032
7245VCANMyofib12.6101585.560192e-030.838764linear down quadratic up1.761922e-1640.571981
-
- - - -
-Here is the GO enrichment for the 50 first top genes of Myofib (FC >= 0.5 and p-value < 0.01). Plot is saved at Go folder. -
- - -```python -pl.pl.go_enrichment(df, cell_type = 'Myofib') -``` - - - -![png](Myocardial_infarction_files/Myocardial_infarction_23_0.png) - - - -
-We can visualize specific genes, for example the ones discussed in PILOT manuscript (COL1A2, DCN and EXT1). In the plot, the orange line indicates the fit in the target cell type (shown as orange lines) compared to other cell types (represented by grey lines). Plots of genes are saved at 'plot_genes_for_Myofib' folder. -
- - -```python -pl.pl.exploring_specific_genes(cluster_name = 'Myofib', gene_list = ['COL1A2','DCN','EXT1']) -``` - - - - - - -![png](Myocardial_infarction_files/Myocardial_infarction_25_1.png) - - - -
-We can repeat the same analysis for healthy_CM cell type by using the following commands. -
- - -```python -df=pl.tl.results_gene_cluster_differentiation(cluster_name = 'healthy_CM').head(50) -df.head(15) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
geneclusterwaldStatpvalueFCExpression patternfit-pvaluefit-mod-rsquared
6165SORBS1healthy_CM1574.6656040.000000e+001.296470linear down quadratic up8.946560e-050.522953
1772DLG2healthy_CM1055.3130301.801893e-2281.155496linear down quadratic up1.323610e-2560.556306
6733THSD4healthy_CM834.2882391.583902e-1801.671315linear down quadratic up6.088694e-2500.582085
1276CMYA5healthy_CM752.3014079.561746e-1631.559703linear down quadratic up3.774063e-660.527869
3281LDB3healthy_CM542.2394583.342198e-1171.426196linear down quadratic up1.511694e-2380.546327
36ABLIM1healthy_CM379.4238676.335728e-820.979378linear up3.296026e-130.513734
6903TNNT2healthy_CM373.0379571.530428e-801.392561linear down quadratic up2.329698e-1180.535236
2398FHOD3healthy_CM343.3413264.125161e-741.731741linear down quadratic up0.000000e+000.612758
6663TECRLhealthy_CM338.8480753.875199e-731.261289linear up quadratic down0.000000e+000.570571
4056MYBPC3healthy_CM296.1572976.751814e-640.686940linear up quadratic down0.000000e+000.557570
5652RCAN2healthy_CM287.9960903.940736e-621.214055linear down quadratic up0.000000e+000.566313
1830DOCK3healthy_CM269.6536433.667754e-580.534836linear down quadratic up2.678914e-2020.527979
4177MYOM1healthy_CM236.4828755.483541e-511.637375linear down1.381677e-2680.548281
1915EFNA5healthy_CM236.2639576.115035e-511.089847linear down1.127515e-1640.532078
5436PXDNLhealthy_CM227.0664185.957588e-491.284827linear down quadratic up1.815885e-030.518712
-
- - - -
-Plot is saved at Go folder. -
- - -```python -pl.pl.go_enrichment(df, cell_type = 'healthy_CM') -``` - - - -![png](Myocardial_infarction_files/Myocardial_infarction_29_0.png) - - - -
-Plots of genes are saved at 'plot_genes_for_healthy_CM' folder. -
- - -```python -pl.pl.exploring_specific_genes(cluster_name = 'healthy_CM', gene_list = ['MYBPC3','MYOM1','FHOD3']) -``` - -![png](Myocardial_infarction_files/Myocardial_infarction_31_1.png) ##### Group genes by pattern: @@ -720,12 +221,15 @@ body {font-family: Arial;} border: 1px solid #ccc; border-top: none; } +.tabcontent.active { + display: block; +} In each tab below, you can check the information for each cluster.
- + @@ -733,40 +237,34 @@ In each tab below, you can check the information for each cluster.
-
+
-
-
- No GO information for cluster 3!
-
-
-
@@ -804,72 +302,62 @@ In the table, you can check the curves activities of some genes of the healthy_C The complete table can be found here: [healthy_CM_curves_activities.csv](https://costalab.ukaachen.de/open_data/PILOT/healthy_CM_curves_activities.csv) +##### Cluster Specific Marker Changes: +
+The previous test only finds genes with significant changes over time for a given cell type. However, it does not consider if a similar pattern and expression values are found in other clusters. To further select genes, we use a Wald test that compares the fit of the gene in the cluster vs. the fit of the gene in other clusters. +In the code below, we consider top genes (regarding the regression fit) for two interesting cell types discussed in the manuscript (‘healthy CM’ and ‘Myofib’). +
+ ```python -pl.pl.genes_selection_analysis(adata, 'Myofib', scaler_value = 0.5) +pl.tl.gene_cluster_differentiation(adata,cellnames = ['healthy_CM','Myofib'], number_genes = 70) ``` -![png](Myocardial_infarction_files/Myofib_heatmap.png) + + + + +
+Test results are saved in ‘gene_clusters_stats_extend.csv’. To find a final list of genes, we only consider genes with a fold change higher than 0.5, i.e. genes which expression is increased in the cluster at hand; and we sort the genes based on the Wald test p-value. These can be seen bellow. +
-Here, we utilize the [Enrichr](https://maayanlab.cloud/Enrichr/) tools to get the hallmarks of the clustered genes. The default dataset is MSigDB_Hallmark_2020, which you can change using the `gene_set_library` parameter. ```python -pl.pl.plot_hallmark_genes_clusters(adata, 'Myofib', 'MSigDB_Hallmark_2020') +df = pl.tl.results_gene_cluster_differentiation(cluster_name = 'healthy_CM',).head(50) +df.head(15) ``` +| gene | cluster | waldStat | pvalue | FC | Expression pattern | fit-pvalue | fit-mod-rsquared | +|----------|-----------|------------------|---------------|------------------|--------------------------|-----------------|------------------| +| SORBS1 | healthy_CM| 1574.665604 | 0.000000e+00 | 1.296470 | linear down quadratic up | 8.946560e-05 | 0.522953 | +| DLG2 | healthy_CM| 1055.313030 | 1.801893e-228 | 1.155496 | linear down quadratic up | 1.323610e-256 | 0.556306 | +| MYOM1 | healthy_CM| 236.482875 | 5.483541e-51 | 1.637375 | linear down | 1.381677e-268 | 0.548281 | +| FHOD3 | healthy_CM| 343.341326 | 4.125161e-74 | 1.731741 | linear down quadratic up | 0.000000e+00 | 0.612758 | +| MYBPC3 | healthy_CM| 296.157297 | 6.751814e-64 | 0.686940 | linear up quadratic down | 0.000000e+00 | 0.557570 | +| ... | ... | ... | ... | ... | ... | ... | ... | -![png](Myocardial_infarction_files/Myofib_hallmark.png) - -In each tab below, you can check the information for each cluster. -
- - - - +
+Here is the GO enrichment for the 50 first top genes of healthy_CM (FC >= 0.5 and p-value < 0.01). Plot is saved at Go folder.
-
- - - -
-
- - - -
+```python +pl.pl.go_enrichment(df, cell_type = 'healthy_CM') +``` +![png](Myocardial_infarction_files/Myocardial_infarction_29_0.png) + -
- - - No GO information for cluster 3! -
-
- - - +
+We can visualize specific genes, for example the ones discussed in PILOT manuscript (MYBPC3,MYOM1, and FHOD3). In the plot, the orange line indicates the fit in the target cell type (shown as orange lines) compared to other cell types (represented by grey lines). Plots of genes are saved at 'plot_genes_for_healthy_CM' folder.
-
-In the table, you can check the curves activities of some genes of the Myofib: -| Gene ID | Expression pattern | adjusted P-value | R-squared | mod_rsquared_adj | Terminal_logFC | Transient_logFC | Switching_time | area | cluster | -|---------|--------------------------|------------------|-----------|------------------|----------------|-----------------|----------------|-------|---------| -| LAMA2 | linear up quadratic down | 0.00 | 0.38 | 0.74 | -0.15 | 0.05 | 0.76 | 28.73 | 2 | -| COL1A1 | linear down quadratic up | 0.00 | 0.31 | 0.68 | 0.14 | -0.39 | 0.86 | 48.08 | 1 | -| NEGR1 | quadratic down | 0.00 | 0.28 | 0.67 | -0.17 | 0 | 0.64 | 16.17 | 2 | -| COL3A1 | linear down quadratic up | 0.00 | 0.28 | 0.67 | 0.12 | -0.63 | 0.89 | 54.59 | 1 | -| ZBTB20 | linear up quadratic down | 0.00 | 0.27 | 0.68 | -0.15 | 0.08 | 0.77 | 30.9 | 2 | -| ... | ... | ... | ... | ... | ... | ... | ... | | | -| MAN2A1 | quadratic down | 0.00 | -0.35 | 0.40 | -0.17 | 0 | 0.65 | 17.56 | 2 | -| ADGRD1 | linear down | 0.00 | -0.36 | 0.42 | -0.17 | 0 | 0.49 | 1.45 | 4 | -| ATR | quadratic down | 0.00 | -0.36 | 0.39 | -0.17 | 0 | 0.65 | 17.71 | 2 | -| URI1 | quadratic down | 0.00 | -0.36 | 0.39 | -0.17 | 0 | 0.65 | 17.49 | 2 | -| TRA2A | quadratic down | 0.01 | -0.39 | 0.38 | -0.17 | 0 | 0.65 | 17.01 | 2 | - -The complete table can be found here: [Myofib_curves_activities.csv](https://costalab.ukaachen.de/open_data/PILOT/Myofib_curves_activities.csv) +```python +pl.pl.exploring_specific_genes(cluster_name = 'healthy_CM', gene_list = ['MYBPC3','MYOM1','FHOD3']) +``` +![png](Myocardial_infarction_files/Myocardial_infarction_31_1.png) + ###### Plot specific genes:
diff --git a/docs/source/Myocardial_infarction_files/Myocardial_infarction_31_1.png b/docs/source/Myocardial_infarction_files/Myocardial_infarction_31_1.png index 6bf45a7..ea8e6b9 100644 Binary files a/docs/source/Myocardial_infarction_files/Myocardial_infarction_31_1.png and b/docs/source/Myocardial_infarction_files/Myocardial_infarction_31_1.png differ diff --git a/docs/source/Myocardial_infarction_files/healthy_CM_hallmark.png b/docs/source/Myocardial_infarction_files/healthy_CM_hallmark.png index 195b0af..b073aad 100644 Binary files a/docs/source/Myocardial_infarction_files/healthy_CM_hallmark.png and b/docs/source/Myocardial_infarction_files/healthy_CM_hallmark.png differ diff --git a/docs/source/index.md b/docs/source/index.md index 843c6ef..4c11a8c 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -63,7 +63,7 @@ Myocardial_infarction ```{toctree} --- maxdepth: 2 -caption: Pathomics data Analysis +caption: Multimodal Integration --- Combination_Kidney_IgAN ``` diff --git a/pilotpy/plot/__init__.py b/pilotpy/plot/__init__.py index 30f49bc..8c3f41a 100644 --- a/pilotpy/plot/__init__.py +++ b/pilotpy/plot/__init__.py @@ -1,5 +1,5 @@ from .ploting import * from .curve_activity import * from .gene_selection_analysis import * -from .pseudobulk_DE_analysis import * + diff --git a/pilotpy/plot/pseudobulk_DE_analysis.py b/pilotpy/plot/pseudobulk_DE_analysis.py deleted file mode 100644 index 1846af5..0000000 --- a/pilotpy/plot/pseudobulk_DE_analysis.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Mon Apr 22 14:43:35 2024 - -@author: Mina Shaigan -""" - -import os -import pandas as pd -import numpy as np -import anndata as ad -import itertools -from sklearn.feature_selection import VarianceThreshold -from sklearn.decomposition import PCA -from matplotlib.lines import Line2D - -from adjustText import adjust_text -from gprofiler import GProfiler -import textwrap as tw -import seaborn as sns -import matplotlib.pyplot as plt - -import rpy2.robjects as robjects -import rpy2.robjects.numpy2ri -from rpy2.robjects import pandas2ri - -from rpy2.rinterface_lib.callbacks import logger as rpy2_logger -import logging -rpy2_logger.setLevel(logging.ERROR) -pandas2ri.activate() - - -def plot_cell_numbers(adata, proportion_df, - cell_type: str = None, - cluster_col: str = "Predicted_Labels", - celltype_col: str = "cell_types", - sample_col: str = "sampleID", - my_pal = None): - """ - - - Parameters - ---------- - adata : TYPE - DESCRIPTION. - proportion_df : TYPE - DESCRIPTION. - cell_type : str, optional - DESCRIPTION. The default is None. - cluster_col : str, optional - DESCRIPTION. The default is "Predicted_Labels". - celltype_col : str, optional - DESCRIPTION. The default is "cell_types". - sample_col : str, optional - DESCRIPTION. The default is "sampleID". - my_pal : TYPE, optional - DESCRIPTION. The default is None. - - Returns - ------- - None. - - """ - - copy_cells = adata.obs.copy() - copy_cells= copy_cells[copy_cells[celltype_col] == cell_type] - copy_cells['group'] = proportion_df.loc[copy_cells[sample_col]][cluster_col].values - data = copy_cells.groupby(['group', sample_col])[sample_col].count() - data = pd.DataFrame(data) - data = data.loc[~(data==0).all(axis=1)] - - n_groups = np.unique(data.index.get_level_values("group").values) - if my_pal is None: - if len(n_groups) == 3: - my_pal = dict(zip(n_groups, ["tab:red", "skyblue", "tab:blue"])) - else: - my_pal = dict(zip(n_groups, sns.color_palette("tab10", len(n_groups)))) - - plt.figure(figsize=(20,5)) - x_values = data.index.get_level_values(sample_col).values - plt.bar(range(data.shape[0]),data[sample_col].values, - color = [my_pal[key] for key in data.index.get_level_values("group").values], - tick_label = x_values) - plt.xticks(fontsize=24, rotation = 45, ha= 'center') - plt.yticks(fontsize=24) - - plt.title(cell_type, fontsize = 24) - plt.ylabel('Number of cells', fontsize = 24) - colors = my_pal - labels = list(colors.keys()) - handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels] - plt.legend(handles, labels, fontsize = 24) - plt.show() - -def compute_pseudobulk_DE( - cluster_counts: pd.DataFrame = None, - cluster_metadata: pd.DataFrame = None, - group1: str = None, - group2: str = None, - cluster_col: str = None): - - """ - Parameters - ---------- - aggr_counts : pd.DataFrame, optional - DESCRIPTION. The default is None. - metadata : pd.DataFrame, optional - DESCRIPTION. The default is None. - cell_type : str, optional - DESCRIPTION. The default is None. - group1 : str, optional - DESCRIPTION. The default is None. - group2 : str, optional - DESCRIPTION. The default is None. - n_cpus : int, optional - DESCRIPTION. The default is 8. - - Returns - ------- - my_stat_res : TYPE - DESCRIPTION. - - """ - - # consider DE between two group of interest - my_cluster_metadata = cluster_metadata[ (cluster_metadata[cluster_col] == group1 ) | (cluster_metadata[cluster_col] == group2)] - my_cluster_counts = cluster_counts.loc[my_cluster_metadata.index] - - R = robjects.r - R('library(SingleCellExperiment)') - R('library(DESeq2)') - R('library(apeglm)') - R('library(tidyverse, verbose = FALSE)') - R.assign('cluster_counts', my_cluster_counts) - R.assign('cluster_metadata', my_cluster_metadata) - - try: - - R('dds <- DESeqDataSetFromMatrix(round(t(cluster_counts)), colData = cluster_metadata, design = ~ stage)') - R('rld <- rlog(dds, blind = TRUE)') - R('dds <- DESeq(dds)') - R(' \ - mylist <- list(resultsNames(dds)); \ - for(coef in resultsNames(dds)){ \ - if(coef != "Intercept"){ \ - print(coef); \ - res <- results(dds, name = coef, alpha = 0.05); \ - res <- lfcShrink(dds, coef = coef, res = res, type = "apeglm"); \ - res_tbl <- res %>% data.frame() %>% rownames_to_column(var = "gene") %>% as_tibble() %>% arrange(padj); \ - mylist[[coef]] <- res_tbl; \ - } \ - } \ - ') - - res = R('''mylist''') - return res - except rpy2.rinterface_lib.embedded.RRuntimeError: - return None - -def compute_pseudobulk_PCA( - cluster_counts: pd.DataFrame = None, - cluster_metadata: pd.DataFrame = None): - - """ - Parameters - ---------- - aggr_counts : pd.DataFrame, optional - DESCRIPTION. The default is None. - metadata : pd.DataFrame, optional - DESCRIPTION. The default is None. - cell_type : str, optional - DESCRIPTION. The default is None. - group1 : str, optional - DESCRIPTION. The default is None. - group2 : str, optional - DESCRIPTION. The default is None. - n_cpus : int, optional - DESCRIPTION. The default is 8. - - Returns - ------- - my_stat_res : TYPE - DESCRIPTION. - - """ - - - # consider DE between two group of interest - - R = robjects.r - R('library(SingleCellExperiment)') - R('library(DESeq2)') - R('library(apeglm)') - R('library(tidyverse, verbose = FALSE)') - R.assign('cluster_counts', cluster_counts) - R.assign('cluster_metadata', cluster_metadata) - - try: - - R('dds <- DESeqDataSetFromMatrix(round(t(cluster_counts)), colData = cluster_metadata, design = ~ stage)') - R('rld <- rlog(dds, blind = TRUE)') - R('dds <- DESeq(dds)') - rld = R('''as.data.frame(assay(rld))''') - return rld - except rpy2.rinterface_lib.embedded.RRuntimeError: - return None - -def plotPCA_subgroups(proportions, deseq2_counts, cell_type, my_pal, cluster_col): - # consider top variances features - selector = VarianceThreshold(0.2) - new_deseq2_counts = selector.fit_transform(deseq2_counts) - new_deseq2_counts = pd.DataFrame(new_deseq2_counts, index = deseq2_counts.index) - - # reduce dimension by PCA - pca = PCA(n_components=2) - X_pca = pca.fit_transform(new_deseq2_counts) - - # plot PCA - color_map = [my_pal[val] for val in proportions.loc[new_deseq2_counts.index, cluster_col]] - fig, ax = plt.subplots() - ax.scatter(X_pca[:, 0], X_pca[:, 1], - c = color_map, - cmap='viridis', edgecolor='k', s = 200) - plt.xlabel('PC1: ' + str(round(pca.explained_variance_ratio_[0]*100)) + "% variance", fontsize = 24) - plt.ylabel('PC2: ' + str(round(pca.explained_variance_ratio_[1]*100)) + "% variance", fontsize = 24) - plt.title('PCA ' + str(cell_type)) - legend_elements = [] - for k in my_pal.keys(): - legend_elements.append(Line2D([0], [0], marker='o', color='w', label=k, - markerfacecolor=my_pal[k], markersize=15)) - ax.legend(handles=legend_elements, loc=1) - plt.show() - -def map_color_ps(a, low_fc_thrr, high_fc_thrr, pv_thrr): - log2FoldChange, symbol, nlog10 = a - if log2FoldChange >= high_fc_thrr and nlog10 >= pv_thrr: - return 'very higher' - elif log2FoldChange <= -low_fc_thrr and nlog10 >= pv_thrr: - return 'very lower' - else: - return 'no' - -def volcano_plot_ps(data, symbol, foldchange, p_value, - cell_type, - feature1, - feature2, - low_fc_thr = 1, - high_fc_thr = 1, - pv_thr = 1, - figsize = (20,10), - output_path = None, - my_pal = None, - fontsize: int = 14 - ): - """ - - - Parameters - ---------- - data : TYPE - DESCRIPTION. - symbol : TYPE - DESCRIPTION. - foldchange : TYPE - DESCRIPTION. - p_value : TYPE - DESCRIPTION. - cell_type : TYPE - DESCRIPTION. - feature1 : TYPE - DESCRIPTION. - feature2 : TYPE - DESCRIPTION. - low_fc_thr : TYPE, optional - DESCRIPTION. The default is 1. - high_fc_thr : TYPE, optional - DESCRIPTION. The default is 1. - pv_thr : TYPE, optional - DESCRIPTION. The default is 1. - figsize : TYPE, optional - DESCRIPTION. The default is (20,10). - output_path : TYPE, optional - DESCRIPTION. The default is None. - my_pal : TYPE, optional - DESCRIPTION. The default is None. - fontsize : int, optional - DESCRIPTION. The default is 14. - - Returns - ------- - str - DESCRIPTION. - - """ - - df = pd.DataFrame(columns=['log2FoldChange', 'nlog10', 'symbol']) - df['log2FoldChange'] = data[foldchange] - df['nlog10'] = -np.log10(data[p_value].values) - df['symbol'] = data[symbol].values - - color1 = my_pal[feature1] - color2 = my_pal[feature2] - - df.replace([np.inf, -np.inf], np.nan, inplace=True) - df.dropna(subset=["nlog10"], how="all", inplace=True) - - - selected_labels = df.loc[ (df.log2FoldChange <= low_fc_thr) & (df.log2FoldChange >= high_fc_thr) & \ - (df['nlog10'] >= pv_thr)]['symbol'].values - - def map_shape(symbol): - if symbol in selected_labels: - return 'important' - return 'not' - - df['color'] = df[['log2FoldChange', 'symbol', 'nlog10']].apply(map_color_ps, low_fc_thrr = low_fc_thr, - high_fc_thrr = high_fc_thr, - pv_thrr = pv_thr, axis = 1) - df['shape'] = df.symbol.map(map_shape) - df['baseMean'] = df.nlog10*10 - - - plt.figure(figsize = figsize, frameon=False, dpi=100) - plt.style.use('default') - - ax = sns.scatterplot(data = df, x = 'log2FoldChange', y = 'nlog10', - hue = 'color', hue_order = ['no', 'very higher', 'very lower'], - palette = ['lightgrey', color2, color1], - style = 'shape', style_order = ['not', 'important'], - markers = ['o', 'o'], - size = 'baseMean', sizes = (40, 400) - ) - - ax.axhline(pv_thr, zorder = 0, c = 'k', lw = 2, ls = '--') - ax.axvline(high_fc_thr, zorder = 0, c = 'k', lw = 2, ls = '--') - ax.axvline(-low_fc_thr, zorder = 0, c = 'k', lw = 2, ls = '--') - - texts = [] - for i in range(len(df)): - if df.iloc[i].nlog10 >= pv_thr and (df.iloc[i].log2FoldChange >= high_fc_thr): - texts.append(plt.text(x = df.iloc[i].log2FoldChange, y = df.iloc[i].nlog10, s = df.iloc[i].symbol, - fontsize = fontsize, weight = 'bold', family = 'sans-serif')) - if df.iloc[i].nlog10 >= pv_thr and ( df.iloc[i].log2FoldChange <= -low_fc_thr): - texts.append(plt.text(x = df.iloc[i].log2FoldChange, y = df.iloc[i].nlog10, s = df.iloc[i].symbol, - fontsize = fontsize + 2, weight = 'bold', family = 'sans-serif')) - adjust_text(texts) - - custom_lines = [Line2D([0], [0], marker='o', color='w', markerfacecolor=color2, markersize=fontsize), - Line2D([0], [0], marker='o', color='w', markerfacecolor=color1, markersize=fontsize)] - - plt.legend(custom_lines, ['Higher expressions in ' + feature2, 'Higher expressions in ' + feature1], loc = 1, - bbox_to_anchor = (1,1.1), frameon = False, prop = {'weight': 'normal', 'size': fontsize}) - - for axis in ['bottom', 'left']: - ax.spines[axis].set_linewidth(2) - - ax.spines['top'].set_visible(False) - ax.spines['right'].set_visible(False) - - ax.tick_params(width = 2) - ax.set_ylim(bottom=0) - plt.title("Expression Score \n " + feature1 + " - " + feature2, fontsize = fontsize + 4) - plt.xticks(size = fontsize, weight = 'bold') - plt.yticks(size = fontsize, weight = 'bold') - - plt.xlabel("$log_{2}$ (Fold Change)", size = fontsize + 2) - plt.ylabel("-$log_{10}$ (P-value)", size = fontsize + 2) - - if output_path is not None: - plt.savefig(output_path + "/volcano_" + str(feature1) + "-" + str(feature2) + "_FC.pdf", - dpi = 100, bbox_inches = 'tight', facecolor = 'white') - plt.show() - -def gene_annotation_cell_type_subgroup(data: pd.DataFrame = None, - symbol: str = 'gene', - sig_col: str = 'significant_gene', - cell_type: str = None, - group: str = None, - sources: str = None, - num_gos: int = 10, - fig_h: int = 6, - fig_w: int = 4, - font_size: int = 14, - max_length:int = 50, - path_to_results: str = None, - my_pal = None - ): - """ - Plot to show the most relative GO terms for specifc cell-type of determind patient sub-group - - Parameters - ---------- - data : pd.DataFrame - DESCRIPTION. The default is None. - symbol : str, optional - DESCRIPTION. The default is 'gene'. - sig_col : str, optional - DESCRIPTION. The default is 'significant_gene'. - cell_type : str - DESCRIPTION. The default is None. - group : str - DESCRIPTION. The default is None. - sources : str, optional - DESCRIPTION. The default is None. - num_gos : int, optional - DESCRIPTION. The default is 10. - fig_h : int, optional - DESCRIPTION. The default is 6. - fig_w : int, optional - DESCRIPTION. The default is 4. - font_size : int, optional - DESCRIPTION. The default is 14. - max_length : int, optional - DESCRIPTION. The default is 50. - path_to_results : str, optional - DESCRIPTION. The default is None. - my_pal : TYPE, optional - DESCRIPTION. The default is None. - - Returns - ------- - None. - - """ - -# path_to_results = 'Results_PILOT' - - color = my_pal[group] - -# group_genes = pd.read_csv(path_to_results + \ -# "/significant_genes_" + cell_type + "_" + group + ".csv") - - group_genes = data.loc[data[sig_col] == group, symbol].values - gp = GProfiler(return_dataframe = True) - if list(group_genes): - gprofiler_results = gp.profile(organism = 'hsapiens', - query = list(group_genes), - no_evidences = False, - sources = sources) - else: - return "Genes list is empty!" - - if(gprofiler_results.shape[0] == 0): - return "Not enough information!" - elif(gprofiler_results.shape[0] < num_gos): - num_gos = gprofiler_results.shape[0] - - all_gprofiler_results = gprofiler_results.copy() - # display(all_gprofiler_results.head()) - - # print(len(list(group_genes['symbol'].values))) - # selected_gps = gprofiler_results.loc[0:num_gos,['name', 'p_value']] - selected_gps = gprofiler_results.head(num_gos)[['name', 'p_value']] - - selected_gps['nlog10'] = -np.log10(selected_gps['p_value'].values) - - for i in selected_gps.index: - split_name = "\n".join(tw.wrap(selected_gps.loc[i, 'name'], max_length)) - selected_gps.loc[i, 'name'] = split_name - - figsize = (fig_h, fig_w) - - plt.figure(figsize = figsize, dpi = 100) - plt.style.use('default') - sns.scatterplot(data = selected_gps, x = "nlog10", y = "name", s = 300, color = color) - - plt.title('GO enrichment in ' + cell_type + ' associated with ' + group + \ - '\n (number of genes: ' + str(len(list(group_genes))) + ")", fontsize = font_size + 2) - - plt.xticks(size = font_size) - plt.yticks(size = font_size) - - plt.ylabel("GO Terms", size = font_size) - plt.xlabel("-$log_{10}$ (P-value)", size = font_size) - - save_path = path_to_results + '/' - if not os.path.exists(save_path): - os.makedirs(save_path) -# plt.savefig(save_path + group + ".pdf", bbox_inches = 'tight', -# facecolor = 'white', transparent = False) - plt.show() - - all_gprofiler_results.to_csv(save_path + group + ".csv") - -def get_sig_genes(data, symbol, foldchange, p_value, cell_type, - feature1, feature2, - low_fc_thr = 1, high_fc_thr = 1, pv_thr = 1): - df = pd.DataFrame(columns=['log2FoldChange', 'nlog10', 'symbol']) - df['log2FoldChange'] = data[foldchange] - df['nlog10'] = -np.log10(data[p_value].values) - df['symbol'] = data[symbol].values - - df.replace([np.inf, -np.inf], np.nan, inplace = True) - df.dropna(subset = ["nlog10"], how = "all", inplace = True) - - data['significant_gene'] = "" - group1_selected_labels = df.loc[ (df.log2FoldChange <= -low_fc_thr) & (df['nlog10'] >= pv_thr), 'symbol'].values - data.loc[data[symbol].isin(group1_selected_labels), 'significant_gene'] = feature1 - - group2_selected_labels = df.loc[ (df.log2FoldChange >= high_fc_thr) & (df['nlog10'] >= pv_thr), 'symbol'].values - data.loc[data[symbol].isin(group2_selected_labels), 'significant_gene'] = feature2 - - return data - -def get_pseudobulk_DE(adata: ad.AnnData, - proportion_df: pd.DataFrame, - cell_type: str, - fc_thr: list, - pv_thr: float = 0.05, - celltype_col: str = "cell_types", - sample_col: str = "sampleID", - cluster_col: str = "Predicted_Labels", - remove_samples: list = [], - my_pal: dict = None, - path_to_results: str = 'Results_PILOT/', - figsize: tuple = (30, 15), - num_gos: int = 10, - fig_h: int = 6, - fig_w: int = 4, - sources: list = ['GO:CC', 'GO:PB', 'GO:MF'], - fontsize: int = 14, - load: bool = False - ): - """ - - - Parameters - ---------- - adata : ad.AnnData - DESCRIPTION. - proportion_df : pd.DataFrame - DESCRIPTION. - cell_type : str - DESCRIPTION. - fc_thr : list - DESCRIPTION. - pv_thr : float, optional - DESCRIPTION. The default is 0.05. - celltype_col : str, optional - DESCRIPTION. The default is "cell_types". - sample_col : str, optional - DESCRIPTION. The default is "sampleID". - cluster_col : str, optional - DESCRIPTION. The default is "Predicted_Labels". - remove_samples : list, optional - DESCRIPTION. The default is []. - my_pal : dict, optional - DESCRIPTION. The default is None. - path_to_results : str, optional - DESCRIPTION. The default is 'Results_PILOT/'. - figsize : tuple, optional - DESCRIPTION. The default is (30, 15). - num_gos : int, optional - DESCRIPTION. The default is 10. - fig_h : int, optional - DESCRIPTION. The default is 6. - fig_w : int, optional - DESCRIPTION. The default is 4. - sources : list, optional - DESCRIPTION. The default is ['GO:CC', 'GO:PB', 'GO:MF']. - fontsize : int, optional - DESCRIPTION. The default is 14. - load : bool, optional - DESCRIPTION. The default is False. - - Returns - ------- - None. - - """ - - - n_clusters = np.unique(proportion_df[cluster_col]) - if my_pal is None: - if len(n_clusters) == 3: - my_pal = dict(zip(n_clusters, ["tab:red", "skyblue", "tab:blue"])) - else: - my_pal = dict(zip(n_clusters, sns.color_palette("tab10", len(n_clusters)))) - - save_path = path_to_results + "/Diff_Expressions_Results/" + str(cell_type) + "/pseudobulk/" - log_pv_thr = -np.log10(pv_thr) - - print("Plot cells frequency for each sample... ") - plot_cell_numbers(adata, proportion_df, cell_type = cell_type, - cluster_col = cluster_col, celltype_col = celltype_col, - sample_col = sample_col, my_pal= my_pal) - - if load == False: - print("Aggregating the counts and metadata to the sample level...") - counts_df = adata.to_df() - counts_df[[celltype_col, sample_col]] = adata.obs[[celltype_col, sample_col]].values - - aggr_counts = counts_df.groupby([celltype_col, sample_col]).sum() - - - cluster_counts = aggr_counts.loc[cell_type] - cluster_metadata = proportion_df.loc[cluster_counts.index.values] - cluster_metadata['stage'] = cluster_metadata[cluster_col].values - - # remove unwanted samples - if not (remove_samples is None): - for sample in remove_samples: - if sample in cluster_metadata.index: - cluster_metadata = cluster_metadata.drop(index = sample) - if sample in cluster_counts.index: - cluster_counts = cluster_counts.drop(index = sample) - - cluster_metadata = cluster_metadata.loc[cluster_counts.index] - cluster_counts = cluster_counts.loc[:, (cluster_counts != 0).any(axis=0)] - - print("Use the median of ratios method for count normalization from DESeq2") - print("Use regularized log transform (rlog) of the normalized counts from DESeq2") - rld = compute_pseudobulk_PCA(cluster_counts, cluster_metadata) - - if rld is not None: - - if not os.path.exists(save_path): - os.makedirs(save_path) - rld.to_csv(save_path + "rld_PCA.csv") - else: - rld = pd.read_csv(save_path + "rld_PCA.csv", index_col = 0) - - deseq2_counts = rld.transpose() - print("Plot the first two principal components... ") - plotPCA_subgroups(proportion_df, deseq2_counts, cell_type, my_pal, cluster_col) - - print("Performing the DE analysis... ") - j = 0 - for groups in itertools.combinations(n_clusters, 2): - data = None - if load == False: - res = compute_pseudobulk_DE(cluster_counts, cluster_metadata, - group1 = groups[0], - group2 = groups[1], - cluster_col = cluster_col) - if res is not None: - with (robjects.default_converter + pandas2ri.converter).context(): - data = robjects.conversion.get_conversion().rpy2py(res[1]) - - data = get_sig_genes(data, 'gene', 'log2FoldChange', 'padj', cell_type, - groups[0], groups[1], fc_thr[j], fc_thr[j], log_pv_thr) - - data.to_csv(save_path + "/" + str(groups[1]) + "vs" + str(groups[0]) + "_DE.csv") - else: - data = pd.read_csv(save_path + "/" + str(groups[1]) + "vs" + str(groups[0]) + "_DE.csv", index_col = 0) - - if data is not None: - print("Plot volcano plot for " + str(groups[1]) + " vs " + str(groups[0])) - volcano_plot_ps(data, 'gene', 'log2FoldChange', 'padj', cell_type, - groups[0], groups[1], fc_thr[j], fc_thr[j], log_pv_thr, figsize = figsize, - output_path = save_path + "/", - my_pal = my_pal, fontsize = fontsize) - - print("Plot GO analysis for " + str(groups[1]) + " vs " + str(groups[0])) - gene_annotation_cell_type_subgroup(data, cell_type = cell_type, group = groups[0], - sources = sources, num_gos = num_gos, - fig_h = fig_h, fig_w = fig_w, font_size = fontsize, - path_to_results = save_path + "/" + str(groups[1]) + "vs" + str(groups[0]) + "/GOs/", - my_pal = my_pal) - gene_annotation_cell_type_subgroup(data, cell_type = cell_type, group = groups[1], - sources = sources, num_gos = num_gos, - fig_h = fig_h, fig_w = fig_w, font_size = fontsize, - path_to_results = save_path + "/" + str(groups[1]) + "vs" + str(groups[0]) + "/GOs/", - my_pal = my_pal) - j += 1 \ No newline at end of file diff --git a/pilotpy/tools/patients_sub_clustering.py b/pilotpy/tools/patients_sub_clustering.py index 648d3bc..a3a8710 100644 --- a/pilotpy/tools/patients_sub_clustering.py +++ b/pilotpy/tools/patients_sub_clustering.py @@ -181,7 +181,7 @@ def compute_diff_expressions(adata,cell_type: str = None, cells=adata.uns[cell_type] - + """ import rpy2.robjects as robjects import rpy2.robjects.numpy2ri from rpy2.robjects import pandas2ri @@ -239,7 +239,7 @@ def compute_diff_expressions(adata,cell_type: str = None, - + """ def install_r_packages(): """ Install R packages using rpy2. @@ -253,16 +253,6 @@ def install_r_packages(): None """ # Install R packages using rpy2 - import rpy2.robjects as robjects - - robjects.r(''' - if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") - ''') - - robjects.r(''' - BiocManager::install("limma") - ''') - + print('Install rpy2 with conda') diff --git a/setup.py b/setup.py index 02f26d3..a127130 100644 --- a/setup.py +++ b/setup.py @@ -28,9 +28,7 @@ "elpigraph-python>=0.3.1,<0.4.0", "adjusttext>=0.8,<0.9", "gprofiler-official>=1.0.0,<1.1.0", - "rpy2>=3.5.11", - - + ], packages=find_packages()