From 892ee823197f08e558e9e7904a36295067cfa599 Mon Sep 17 00:00:00 2001 From: Tomer Yaron-Barir Date: Wed, 25 Jun 2025 18:55:46 -0400 Subject: [PATCH 1/8] Creating parsing function for a sequence --- src/kinase_library/utils/utils.py | 160 +++++++++++++++++++++--------- 1 file changed, 115 insertions(+), 45 deletions(-) diff --git a/src/kinase_library/utils/utils.py b/src/kinase_library/utils/utils.py index 98610f8..9a2f499 100644 --- a/src/kinase_library/utils/utils.py +++ b/src/kinase_library/utils/utils.py @@ -3,7 +3,6 @@ # The Kinase Library - Utilities # ################################## """ -import os import re import string import numpy as np @@ -11,7 +10,6 @@ import matplotlib.pyplot as plt from .. import logomaker from tqdm import tqdm -from Bio import SeqIO from sklearn.preprocessing import MultiLabelBinarizer import warnings @@ -58,7 +56,7 @@ def matrix_to_df(mat, kin_type=None, pp=True, k_mod=False, mat_type='log2', cols df_mat : pd.DataFrame Matrix as Pandas dataframe. """ - + if cols is None: if mat_type == 'densitometry': cols = [str(i) for i in range(mat.shape[1])] @@ -69,13 +67,13 @@ def matrix_to_df(mat, kin_type=None, pp=True, k_mod=False, mat_type='log2', cols cols = _global_vars.aa_phos else: cols = _global_vars.aa_all_ptm - + if rows is None: if mat_type == 'densitometry': rows = list(string.ascii_uppercase[:mat.shape[1]]) elif (mat_type in _global_vars.valid_mat_types) and (kin_type in _global_vars.valid_kin_types): rows = data.get_positions(kin_type) - + if len(rows) != mat.shape[0]: if rows is None: raise Exception('As default, {} matrices must be of shape of {}'.format(kin_type,(len(rows),len(cols)))) @@ -86,9 +84,9 @@ def matrix_to_df(mat, kin_type=None, pp=True, k_mod=False, mat_type='log2', cols raise Exception('As default, {} matrices must be of shape of {}'.format(kin_type,(len(rows),len(cols)))) else: raise Exception('Length of provided columns ({}) does not match the number of columns in the provided matrix ({})'.format(len(cols),mat.shape[1])) - + df_mat = pd.DataFrame(mat, index=rows, columns=cols) - + return(df_mat) @@ -106,7 +104,7 @@ def flatten_matrix(matrix): flat_matrix : np.ndarray Flatten matrix. """ - + flat_matrix = matrix.transpose().values.reshape(matrix.shape[0] * matrix.shape[1],1).transpose() return (flat_matrix) @@ -180,10 +178,10 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05, fig : plt.figure() If specified, returning the figure object with the sequence logo. """ - + if font_name == 'Arial Rounded MT Bold' and font_name not in logomaker.list_font_names(): font_name = 'sans' - + if ax is None: zero_lw = 3 fig,ax = plt.subplots(figsize = [10,5]) @@ -192,10 +190,10 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05, plot = False if save_fig or return_fig: raise ValueError('When Axes provided, \'save_fig\', and \'return_fig\' must be False.') - - if color_dict is None: + + if color_dict is None: color_dict = _global_vars.aa_colors - + if logo_type == 'ratio_to_random': if random_aa_value is None: raise Exception('If method is specified to be ratio to random, \'random_aa_value\' must be provided.') @@ -213,7 +211,7 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05, norm_zero_pos = {key: zero_pos[key]/sum(zero_pos.values())*max_height for key in zero_pos.keys()} for aa in zero_pos.keys(): height_matrix.loc[0,aa] = norm_zero_pos[aa] - + xlims = ax.get_xlim() ylims = ax.get_ylim() logomaker.Logo(pd.DataFrame(height_matrix.loc[0]).transpose(), ax=ax, color_scheme=zero_pos_color, flip_below=flip_below, font_name=font_name) @@ -224,7 +222,7 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05, ax.set_ylim(ylims) else: ax.axvline(0, color='k', linewidth=3, linestyle=':') - + ax.set_xticks(ticks=sorted(height_matrix.index)) ax.set_xticklabels(labels=['' if not xticks else str(x) if x<=0 else '+'+str(x) for x in sorted(height_matrix.index)], fontsize=xticks_fontsize, weight = 'bold') if yticks: @@ -233,7 +231,7 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05, ax.tick_params(axis='y', which='both', left=False, labelleft=False) if title is not None: ax.set_title(title, fontsize=24) - + if xlabel != False: ax.set_xlabel(xlabel, fontsize=xlabel_size) if ylabel != False: @@ -246,24 +244,24 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05, ax.set_ylabel('Probability',fontsize = ylabel_size) else: ax.set_ylabel(ylabel, fontsize=ylabel_size) - + if save_fig: fig.savefig(save_fig) - + try: fig.tight_layout() except: pass - + if not plot: try: plt.close(fig) except: pass - + if return_fig: return(fig) - + #%% """ @@ -296,27 +294,27 @@ def filter_invalid_subs(data, seq_col, suppress_warnings=True): omitted_enteries : pd.DataFrame Dropped enteries due to invalid sequence. """ - + data_no_na = data[data[seq_col].notna()] if not suppress_warnings and ((len(data)-len(data_no_na))>0): print(str(len(data)-len(data_no_na)) + ' entries were omitted due to empty value in the substrates column.') - + escaped_aa_list = [re.escape(c) for c in _global_vars.valid_aa] data_no_invalid_aa = data_no_na[data_no_na[seq_col].astype(str).str.contains('^['+''.join(escaped_aa_list)+']+$')] if not suppress_warnings and ((len(data_no_na)-len(data_no_invalid_aa))>0): print(str(len(data_no_na)-len(data_no_invalid_aa)) + ' entries were omitted due to invalid amino acids or characters.') - + data_no_even = data_no_invalid_aa[(data_no_invalid_aa[seq_col].str.len())%2 != 0] if not suppress_warnings and ((len(data_no_invalid_aa)-len(data_no_even))>0): print(str(len(data_no_invalid_aa)-len(data_no_even)) + ' entries were omitted due to even length (no central position).') - + data_no_invalid_phos_acc = data_no_even[data_no_even[seq_col].apply(lambda x: x[len(x)//2]).isin(_global_vars.valid_phos_res)] if not suppress_warnings and ((len(data_no_even)-len(data_no_invalid_phos_acc))>0): print(str(len(data_no_even)-len(data_no_invalid_phos_acc)) + ' entries were omitted due to invalid central phosphoacceptor.') omitted_enteries = pd.concat([data, data_no_invalid_phos_acc]) omitted_enteries = omitted_enteries.loc[omitted_enteries.astype(str).drop_duplicates(keep=False).index] #In order to deal with lists in the DataFrame - + return(data_no_invalid_phos_acc, omitted_enteries) @@ -344,7 +342,7 @@ def sequence_to_substrate(seq, pp=False, phos_pos=None, kin_type=None, validate_ substrate : str 15-mer with phosphoacceptor at the center. """ - + if validate_phos_res: if phos_pos is None: if (seq[len(seq)//2].lower() not in ['s','t','y']) | (len(seq) % 2 == 0): @@ -354,27 +352,27 @@ def sequence_to_substrate(seq, pp=False, phos_pos=None, kin_type=None, validate_ if validate_aa: if not _global_vars.valid_aa.issuperset(seq.upper()): raise Exception('Sequence contains invalid amino acids or characters ({}).\nAllowed characters are: {}'.format(seq.upper(),_global_vars.valid_aa)) - + if seq is np.nan: return(None) if phos_pos is None: phos_pos = len(seq)//2 + 1 - + if kin_type is not None: exceptions.check_kin_type(kin_type) phos_acc = seq[phos_pos-1] if phos_acc not in _global_vars.kin_type_phos_acc[kin_type]: raise Exception(f'Mismatch beteween kinase type ({kin_type}) and phosphoacceptor ({phos_acc}): {seq}') - + pad_seq = '_'*7 + seq + '_'*7 substrate = pad_seq[phos_pos-1:phos_pos+14] - + if pp: substrate = ''.join([x.upper() if x not in ['s','t','y'] else x for x in substrate[:7]]) + substrate[7].lower() + ''.join([x.upper() if x not in ['s','t','y'] else x for x in substrate[8:]]) else: substrate = substrate[:7].upper() + substrate[7].lower() + substrate[8:].upper() - + return(substrate) @@ -407,7 +405,7 @@ def sub_binary_matrix(substrates, pp=True, aa=None, pos=None, sub_pos=None, seq_ If input is one substrate - returns single binary martix. If input is list of substrates - returns dataframe with flattened binary matrices. """ - + if isinstance(substrates, str): subs_list = pd.Series([substrates], name='Sequence') elif isinstance(substrates, list): @@ -420,19 +418,19 @@ def sub_binary_matrix(substrates, pp=True, aa=None, pos=None, sub_pos=None, seq_ subs_list = substrates[seq_col] else: raise Exception('Invalid substrates list.') - + if not pp: subs_list = subs_list.apply(lambda x: unprime_substrate(x)) - + if pos is None: pos = list(range(1,max([len(x) for x in subs_list])+1)) if sub_pos is None: sub_pos = range(-7,8) if aa is None: aa = data.get_aa() - + subs_pos_aa = [[str(p)+a for a,p in zip(sub,sub_pos)] for sub in subs_list] - + bin_mat_classes = [str(pos)+a for pos in pos for a in aa] mlb = MultiLabelBinarizer(classes=bin_mat_classes) with warnings.catch_warnings(): @@ -441,13 +439,13 @@ def sub_binary_matrix(substrates, pp=True, aa=None, pos=None, sub_pos=None, seq_ if isinstance(substrates, str): return(pd.DataFrame(bin_mat.reshape(len(pos), len(aa)), index=pos, columns=aa).T) - + if as_dict: bin_df = [pd.DataFrame(bin_mat.reshape(len(pos), len(aa)), index=pos, columns=aa).T for x in bin_mat] return(dict(zip(subs_list,bin_df))) - + sub_mat = pd.DataFrame(bin_mat, index=subs_list, columns=bin_mat_classes) - + return(sub_mat) @@ -465,7 +463,7 @@ def substrate_type(sub): sub_type : str Substrate type ('ser_thr', 'tyrosine', or 'unknown'). """ - + if sub[7].lower() in ['s','t']: sub_type = 'ser_thr' elif sub[7].lower() == 'y': @@ -489,7 +487,7 @@ def unprime_substrate(sub): un_primed_sub : str The input substrate without phospho-residues. """ - + un_primed_sub = sub[:7].upper() + sub[7] + sub[8:].upper() return(un_primed_sub) @@ -501,6 +499,78 @@ def unprime_substrate(sub): ################### """ +def parse_phosphosites(sequence, phosphoacceptor=['S', 'T', 'Y'], pp=False): + """ + Parse protein sequence to identify phosphorylation sites. + + Parameters + ---------- + sequence : str + Protein sequence using one-letter amino acid codes. + phosphoacceptor : List[str], optional + Phosphoacceptors to parse (any combination of 'S', 'T', 'Y'). Default is ['S', 'T', 'Y']. + pp : bool, optional + Phospho-priming. If False, all non-central residues uppercase. + If True, keep S/T/Y case, others uppercase. Default is False. + + Returns + ------- + pd.DataFrame + DataFrame with columns: residue, position, window. + Window is 15-mer centered on phosphosite, padded with '-'. + """ + + if not set(phosphoacceptor).issubset(_global_vars.valid_phos_res): + raise ValueError(f'Phosphoacceptor(s) must be a subset of {_global_vars.valid_phos_res}.') + + # Create regex pattern for phosphoacceptors + pattern = '[' + ''.join([aa.upper() + aa.lower() for aa in phosphoacceptor]) + ']' + + # Find all matches with their positions + matches = [(m.group(), m.start()) for m in re.finditer(pattern, sequence)] + + sites = [] + + for residue, i in matches: + # Create 15-mer window + start_idx = max(0, i - 7) + end_idx = min(len(sequence), i + 8) + window_seq = sequence[start_idx:end_idx] + + # Pad with '_' + upstream_pad = max(0, 7-i) + downstream_pad = max(0, (i+8) - len(sequence)) + raw_window = '_'*upstream_pad + window_seq + '_'*downstream_pad + + # Apply phospho-priming rules + if pp: + # Keep S/T/Y case, others uppercase, central always lowercase + window = ''.join([ + char.upper() if char.upper() not in 'STY' else char + for char in raw_window[:7] + ]) + raw_window[7].lower() + ''.join([ + char.upper() if char.upper() not in 'STY' else char + for char in raw_window[8:] + ]) + else: + # All uppercase except central (lowercase) + window = raw_window[:7].upper() + raw_window[7].lower() + raw_window[8:].upper() + + sites.append({ + 'Residue': residue.lower(), + 'Position': i + 1, + 'Sequence': window + }) + + return pd.DataFrame(sites) + +#%% +""" +################### +# Other utilities # +################### +""" + def list_font_names(): """ Returns a list of valid font_name options for use in sequence logo. @@ -514,7 +584,7 @@ def list_font_names(): fontname : list List of valid font_name names from logomaker. This will vary from system to system. """ - + return logomaker.list_font_names() @@ -534,8 +604,8 @@ def list_series_to_df(subs_list, col_name=None): ------- pd.DataFrame with one column containing the list or pd.Series. """ - + if isinstance(subs_list, pd.Series): return(subs_list.to_frame(name=col_name)) if isinstance(subs_list, list): - return(pd.Series(subs_list).to_frame(name=col_name)) + return(pd.Series(subs_list).to_frame(name=col_name)) \ No newline at end of file From 282d0baaf97e8ed9c14296cdc5d7a6465c9cab71 Mon Sep 17 00:00:00 2001 From: Tomer Yaron-Barir Date: Thu, 26 Jun 2025 19:26:04 -0400 Subject: [PATCH 2/8] Updating titles --- src/kinase_library/modules/enrichment.py | 188 +++++++++++------------ src/kinase_library/utils/utils.py | 6 +- 2 files changed, 97 insertions(+), 97 deletions(-) diff --git a/src/kinase_library/modules/enrichment.py b/src/kinase_library/modules/enrichment.py index 00f14ae..fed5e05 100644 --- a/src/kinase_library/modules/enrichment.py +++ b/src/kinase_library/modules/enrichment.py @@ -28,9 +28,9 @@ #%% """ -########################### +################################ # Differential Phosphorylation # -########################### +################################ """ def dp_regulated_sites(dp_data, lfc_col, lfc_thresh=0, @@ -73,7 +73,7 @@ def dp_regulated_sites(dp_data, lfc_col, lfc_thresh=0, raise ValueError('P-values threshold must be between 0-1.') if not 0<=percent_thresh<=100: raise ValueError('Percent threshold must be between 0-100.') - + all_dp_data = dp_data.copy() if drop_na: dp_data = all_dp_data[~all_dp_data[lfc_col].isna()] @@ -84,13 +84,13 @@ def dp_regulated_sites(dp_data, lfc_col, lfc_thresh=0, print(str(len(all_dp_data)-len(dp_data)) + ' entries were dropped due to empty value in the logFC or p-value columns.') else: print(str(len(all_dp_data)-len(dp_data)) + ' entries were dropped due to empty value in the logFC column.') - + dropped_enteries = pd.concat([all_dp_data, dp_data]) dropped_enteries = dropped_enteries.loc[dropped_enteries.astype(str).drop_duplicates(keep=False).index] #In order to deal with lists in the DataFrame - + if len(dropped_enteries)>0 and not suppress_warnings: print('Use the \'dp_dropped_enteries\' attribute to view dropped enteries due to invalid differential phosphorylation values.') - + if percent_rank is not None: if percent_rank == 'logFC': sorted_dp_data = dp_data.sort_values(lfc_col, ascending=False) @@ -101,7 +101,7 @@ def dp_regulated_sites(dp_data, lfc_col, lfc_thresh=0, else: raise ValueError('percent_rank must be either \'logFC\' or \'pvalue\'.') reg_sites_dict = dict(zip(['upreg','unreg','downreg'], np.split(sorted_dp_data, [int(percent_thresh/100*len(dp_data)), int((1-percent_thresh/100)*len(dp_data))]))) - + else: reg_sites_dict = {} if lfc_thresh == 0: # In case of lfc_thresh of zero, treat lfc of 0 as unregulated @@ -122,7 +122,7 @@ def dp_regulated_sites(dp_data, lfc_col, lfc_thresh=0, reg_sites_dict['upreg'] = dp_data[dp_data[lfc_col]>=lfc_thresh] reg_sites_dict['downreg'] = dp_data[dp_data[lfc_col]<=-lfc_thresh] reg_sites_dict['unreg'] = dp_data[dp_data[lfc_col].abs()=threshold) elif comp_direction == 'lower': pred_kins = (data_values<=threshold) - + kins_subs_dict = {} for kin in tqdm(pred_kins.columns): kins_subs_dict[kin] = pred_kins.loc[pred_kins[kin],kin].index.to_list() - + return(kins_subs_dict) @@ -326,36 +326,36 @@ def combine_mea_enrichment_results(enrichment_results_dict, data_type='kl_object pval_data : pd.DataFrame Dataframe with adjusted p-value enrichment data of all conditions. """ - + if data_type not in ['kl_object','data_frame']: raise ValueError('data_type must be either \'kl_object\' or \'data_frame\'.') - + enrichment_results = list(enrichment_results_dict.values()) conds_list = list(enrichment_results_dict.keys()) - + if data_type == 'data_frame': enrichment_results_tables = enrichment_results else: enrichment_results_tables = [res.enrichment_results for res in enrichment_results] - + index_test = [x.index.to_list() == enrichment_results_tables[0].index.to_list() for x in enrichment_results_tables] if not np.all(index_test): raise ValueError('All enrichment results must have the same kinases enriched.') kinases = enrichment_results_tables[0].index.to_list() - + if pval_col_name is None: if adj_pval: pval_col_name = 'FDR' else: pval_col_name = 'pvalue' - + lff_data = pd.DataFrame(index=kinases, columns=conds_list) pval_data = pd.DataFrame(index=kinases, columns=conds_list) - + for res,cond in zip(enrichment_results_tables,conds_list): lff_data[cond] = res[lff_col_name] pval_data[cond] = res[pval_col_name] - + return(lff_data,pval_data) @@ -443,9 +443,9 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1, enrichment_data = enrichment_data.loc[kinases] if highlight_kins is not None: highlight_kins = [x for x in highlight_kins if x in kinases] - + enrichment_data[pval_col] = enrichment_data[pval_col].astype(float) - + if ignore_depleted: if 'most_sig_direction' in enrichment_data.columns: signed_log2_ff = (enrichment_data['most_sig_direction'].replace({'-': -1, '+': 1}))*enrichment_data['most_sig_log2_freq_factor'] @@ -457,11 +457,11 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1, depleted_kins = (enrichment_data[lff_col] < 0) enrichment_data.loc[depleted_kins, lff_col] = 0 enrichment_data.loc[depleted_kins, pval_col] = 1 - + sig_enriched_data = enrichment_data[(enrichment_data[lff_col] >= sig_lff) & (enrichment_data[pval_col] <= sig_pval)] sig_depleted_data = enrichment_data[(enrichment_data[lff_col] <= -sig_lff) & (enrichment_data[pval_col] <= sig_pval)] non_sig_data = enrichment_data[~((np.abs(enrichment_data[lff_col]) >= sig_lff) & (enrichment_data[pval_col] <= sig_pval))] - + if ax is None: existing_ax = False fig,ax = plt.subplots() @@ -474,7 +474,7 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1, ax.plot(non_sig_data[lff_col],-np.log10(non_sig_data[pval_col]), markerfacecolor='black', linestyle='None', **plot_kwargs) ax.plot(sig_enriched_data[lff_col],-np.log10(sig_enriched_data[pval_col]), markerfacecolor='red', linestyle='None', **plot_kwargs) ax.plot(sig_depleted_data[lff_col],-np.log10(sig_depleted_data[pval_col]), markerfacecolor='blue', linestyle='None', **plot_kwargs) - + if highlight_kins is not None: if isinstance(highlight_kins, str): highlight_kins = [highlight_kins] @@ -483,15 +483,15 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1, raise ValueError(f'Some kinases to highlight are not in the enrichment results ({missing_kinases}).') highlight_data = enrichment_data.loc[highlight_kins] ax.plot(highlight_data[lff_col],-np.log10(highlight_data[pval_col]), markerfacecolor='yellow', linestyle='None', **plot_kwargs) - + if symmetric_xaxis: ax.set_xlim(-abs(max(ax.get_xlim(), key=abs)), abs(max(ax.get_xlim(), key=abs))) - + ax.axhline(-np.log10(sig_pval), ls='--', lw=1, color='k') if sig_lff>0: ax.axvline(sig_lff, ls='--', lw=1, color='k') ax.axvline(-sig_lff, ls='--', lw=1, color='k') - + ax.set_title(title) ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) @@ -502,12 +502,12 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1, ax.set_xticks(range(-xvalues,xvalues+1)) if grid: ax.grid('major') - + if max_window: figManager = fig.canvas.manager figManager.window.showMaximized() fig.tight_layout() - + if label_kins is None: label_kins = list(sig_enriched_data.index) + list(sig_depleted_data.index) if highlight_kins: @@ -526,13 +526,13 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1, ax.annotate(kin_data.name, (kin_data[lff_col],-np.log10(kin_data[pval_col])), xytext=(2,2), textcoords='offset points', fontsize=labels_fontsize) if adjust_labels: adjust_text(labels, ax=ax, arrowprops=dict(arrowstyle='-', color='k', lw=0.5)) - + if save_fig: fig.savefig(save_fig, dpi=1000) - + if not plot and not existing_ax: plt.close(fig) - + if return_fig: return fig @@ -611,20 +611,20 @@ def plot_3x3_volcanos(dp_data, kin_type, kl_method, kl_thresh, dp_lfc_col, dp_lf Optional keyword arguments to be passed to the kinase_enrichment function. plotting_kwargs : dict, optional Optional keyword arguments to be passed to the plot_volcano function. - + Returns ------- If return_fig, the 3x3 figure containing downregulated, upregulated, and combined kinase enrichment volcano plots. """ - + if len(dp_lfc_thresh) != 3: raise ValueError('\'dp_lfc_thresh\' must contain exactly three values.') if dp_pval_col is not None and len(dp_pval_thresh) != 3: raise ValueError('\'dp_pval_thresh\' must contain exactly three values.') - + if seq_col is None: seq_col = _global_vars.default_seq_col - + exceptions.check_kl_method(kl_method) print('Calculating scores for all sites') dp_data_pps = pps.PhosphoProteomics(data=dp_data, seq_col=seq_col, **diff_phos_kwargs) @@ -632,17 +632,17 @@ def plot_3x3_volcanos(dp_data, kin_type, kl_method, kl_thresh, dp_lfc_col, dp_lf scores = dp_data_pps.score(kin_type=kin_type, kinases=kinases, values_only=True, **scoring_kwargs) elif kl_method in ['percentile','percentile_rank']: percentiles = dp_data_pps.percentile(kin_type=kin_type, kinases=kinases, values_only=True, **scoring_kwargs) - + fig = plt.figure(constrained_layout=True) figManager = fig.canvas.manager figManager.window.showMaximized() subfigs = fig.subfigures(nrows=3, ncols=1) - + for i,(lfc,pval) in enumerate(zip(dp_lfc_thresh,dp_pval_thresh)): - + subfigs[i].suptitle(r'$\bf{' + f'DE logFC threshold: {lfc}' + f' / DE p-value threshold: {pval}'*(dp_pval_col is not None) + '}$') ax = subfigs[i].subplots(nrows=1, ncols=3) - + print(f'\nLogFC threshold: {lfc}' + f' / p-value threshold: {pval}'*(dp_pval_col is not None)) diff_phos_data = DiffPhosData(dp_data=dp_data, kin_type=kin_type, lfc_col=dp_lfc_col, lfc_thresh=lfc, @@ -653,27 +653,27 @@ def plot_3x3_volcanos(dp_data, kin_type, kl_method, kl_thresh, dp_lfc_col, dp_lf diff_phos_data.submit_scores(kin_type=kin_type, scores=scores, suppress_messages=suppress_warnings) elif kl_method in ['percentile','percentile_rank']: diff_phos_data.submit_percentiles(kin_type=kin_type, percentiles=percentiles, suppress_messages=suppress_warnings) - + enrich_results = diff_phos_data.kinase_enrichment(kl_method=kl_method, kl_thresh=kl_thresh, **enrichment_kwargs) - + enrich_results.plot_down_up_comb_volcanos(sig_lff=ke_sig_lff, sig_pval=ke_sig_pval, kinases=kinases, plot_cont_kins=plot_cont_kins, highlight_kins=highlight_kins, ignore_depleted=ignore_depleted, label_kins=label_kins, adjust_labels=adjust_labels, labels_fontsize=labels_fontsize, ax=ax, **plotting_kwargs) - + fig.suptitle(title) - + if save_fig: fig.savefig(save_fig, dpi=1000) - + if not plot: plt.close(fig) - + if return_fig: return fig - - + + def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, kinases=None, plot_cont_kins=True, highlight_cont_kins=True, sort_kins_by='family', cond_order=None, only_sig_kins=False, only_sig_conds = False, @@ -692,7 +692,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, Parameters ---------- lff_data : pd.DataFrame - Matrix containing Kinase Library enrichment log frequency factor data with kinases as index and conditions as columns. + Matrix containing Kinase Library enrichment log frequency factor data with kinases as index and conditions as columns. pval_data : pd.DataFrame Matrix containing Kinase Library enrichment p_value data. Index (kinases) and columns (conditions) must be identical to lff_data. cont_kins : pd.DataFrame, optional @@ -708,7 +708,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, highlight_cont_kins : boolean, optional Highlight contradicting kinases. The default is True. sort_kins_by : str, optional - String specifying what to sort the kinases on the figure's x-axes by. If provided, must be 'family' (default) or 'name'. + String specifying what to sort the kinases on the figure's x-axes by. If provided, must be 'family' (default) or 'name'. cond_order : list of str, optional Order in which the conditions will appear on the bubblemap. This list may be a subset of conditions but may not contain extra strings. The default is None, preserving the condition order from the lff_data matrix. only_sig_kins : bool, optional @@ -781,19 +781,19 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, ------- None. """ - + if not ((lff_data.index.equals(pval_data.index)) and (lff_data.columns.equals(pval_data.columns))): raise ValueError('lff_data and pval_data must have the same columns and indices.') - + if (pval_data == 0).any().any(): print('Warning: zeros were detected in pval_data. All zeros were replaced with the non-zero minimal value in pval_data.') pval_data = pval_data.replace(0, pval_data[pval_data>0].min().min()) - + if cont_kins is None: cont_kins = pd.DataFrame(False, index=lff_data.index, columns=lff_data.columns) elif not ((lff_data.index.equals(cont_kins.index)) and (lff_data.columns.equals(cont_kins.columns))): raise ValueError('cont_kins must have the same columns and indices as lff_data and pval_data.') - + if kinases is None: kinases = lff_data.index.to_list() else: @@ -801,7 +801,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, kinases = list(dict.fromkeys(kinases)) lff_data = lff_data.loc[kinases] pval_data = pval_data.loc[kinases] - + if not sort_kins_by: kins_order = kinases elif sort_kins_by == 'family': @@ -811,14 +811,14 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, kins_order = natsorted(kinases) else: raise ValueError('sort_kins_by must be either \'family\', \'name\', or False.') - + if cond_order is None: cond_order = lff_data.columns - + if color_kins_by: exceptions.check_color_kins_method(color_kins_by) label_info_col = color_kins_by.upper() - + kinome_data = data.get_kinase_info(kinases) kin_categories_list = natsorted(kinome_data[label_info_col].unique()) if kin_categories_colors is None: @@ -829,7 +829,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, kin_colors = {} for kin,label_type in zip(kinome_data.index,kinome_data[label_info_col]): kin_colors[kin] = kin_categories_colors[label_type] - + sorted_lff_data = lff_data.loc[kins_order,cond_order] sorted_pval_data = pval_data.loc[kins_order,cond_order] if highlight_cont_kins: @@ -843,7 +843,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, if not plot_cont_kins: sig_lff_data = sig_lff_data.mask(cont_kins) sig_pval_data = sig_pval_data.mask(cont_kins) - + if only_sig_kins: sig_kins = list(sig_data.loc[sig_data.any(axis=1)].index) sig_lff_data = sig_lff_data.loc[sig_kins] @@ -856,7 +856,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, sorted_highlight_data = sorted_highlight_data[sig_conds] minus_log10_pval_data = -np.log10(sig_pval_data.fillna(1)) - + if kin_clust: if cluster_by is None: raise ValueError('If kin_clust is True, cluster_by must be specified.') @@ -875,7 +875,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, sig_lff_data = sig_lff_data.iloc[hierarchy.leaves_list(Z)] minus_log10_pval_data = minus_log10_pval_data.iloc[hierarchy.leaves_list(Z)] sorted_highlight_data = sorted_highlight_data.iloc[hierarchy.leaves_list(Z)] - + if condition_clust: if cluster_by is None: raise ValueError('If condition_clust is True, cluster_by must be specified.') @@ -894,7 +894,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, sig_lff_data = sig_lff_data.iloc[:,hierarchy.leaves_list(Z)] minus_log10_pval_data = minus_log10_pval_data.iloc[:,hierarchy.leaves_list(Z)] sorted_highlight_data = sorted_highlight_data.iloc[:,hierarchy.leaves_list(Z)] - + intensity_matrices = np.array_split(sig_lff_data, num_panels) size_matrices = np.array_split(minus_log10_pval_data, num_panels) highlight_matrices = np.array_split(sorted_highlight_data, num_panels) @@ -902,13 +902,13 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, if lff_clim is None: lff_clim = (min(sig_lff_data.min().min(),0),max(sig_lff_data.max().max(),0)) cnorm = mcol.TwoSlopeNorm(vmin=lff_clim[0], vcenter=0, vmax=lff_clim[1]) - + if max_pval_size is None: max_pval_size = np.ceil(minus_log10_pval_data.max().max()) if max_pval_size < -np.log10(sig_pval): raise ValueError('max_pval_size must be equal or greater than -log10(sig_pval).') pval_slim = (-np.log10(sig_pval), max_pval_size) - + if vertical: intensity_matrices = [mat.transpose() for mat in intensity_matrices] size_matrices = [mat.transpose() for mat in size_matrices] @@ -916,19 +916,19 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, fig = plt.figure(constrained_layout=constrained_layout) subfigs = fig.subfigures(nrows=1, ncols=2, width_ratios=[10, 1]) - + if not vertical: axes = subfigs[0].subplots(nrows=num_panels, ncols=1, squeeze=False).ravel() else: axes = subfigs[0].subplots(nrows=1, ncols=num_panels, squeeze=False).ravel() - + for idx in range(num_panels): intensity_matrix = intensity_matrices[idx].loc[:,::-1] size_matrix = size_matrices[idx].loc[:,::-1] highlight_matrix = highlight_matrices[idx].loc[:,::-1] ax = axes[idx] - + melt_lff = pd.melt(intensity_matrix, ignore_index=False, var_name='condition', value_name='lff').set_index('condition', append=True)[::-1] melt_lff.index.names = ['kinase','condition'] melt_pval = pd.melt(size_matrix, ignore_index=False, var_name='condition', value_name='pval').set_index('condition', append=True)[::-1] @@ -957,13 +957,13 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, else: eps = ((maxy - miny) / (len(ax.get_yticks()) - 1)) / 2 ax.set_ylim(maxy+eps, miny-eps) - + ax.grid(which='major', color='k', linestyle=':') ax.set_axisbelow(True) ax.set_aspect('equal', 'box') ax.tick_params(axis='x', which='major', labelsize=xlabels_size, labelrotation=90) ax.tick_params(axis='y', which='major', labelsize=ylabels_size) - + fig.canvas.draw() axes = subfigs[0].axes for ax in axes: @@ -1001,14 +1001,14 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, for l in ax.get_yticklabels(): label_color = cond_colors[l.get_text()] l.set_color(label_color) - + axes = subfigs[1].subplots(nrows=3, ncols=1, squeeze=False).ravel() - + if family_legned and color_kins_by: patches = [mpatches.Patch(color=x[1], label=x[0]) for x in kin_categories_colors.items()] axes[0].legend(handles=patches, loc='center', title='Family', facecolor='white') axes[0].axis('off') - + if pval_legend: size_legend_data = pd.DataFrame({'x': 0, 'y': 0, 'sizes': np.linspace(pval_slim[0], pval_slim[1], 4, dtype=int)}) sns.scatterplot(x='x', y='y', data=size_legend_data, size='sizes', sizes=bubblesize_range, size_norm=pval_slim, @@ -1022,7 +1022,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, axes[1].legend(handles=handles, labels=[str(10**-x) for x in size_legend_data['sizes'][:3]] + ['<'+str(10**int(-(size_legend_data['sizes'][3])))], title='Adj. p-value', loc='center', labelspacing=pval_legend_spacing, facecolor='white') axes[1].axis('off') - + if lff_cbar: axes[2].set(aspect=10) sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=cnorm) @@ -1031,11 +1031,11 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, cbar.ax.set_title('log2(FF)') else: axes[2].axis('off') - + subfigs[0].suptitle(title, fontsize=16) if max_window: figManager = plt.get_current_fig_manager() figManager.window.showMaximized() - + if save_fig: - fig.savefig(save_fig, dpi=1000) + fig.savefig(save_fig, dpi=1000) \ No newline at end of file diff --git a/src/kinase_library/utils/utils.py b/src/kinase_library/utils/utils.py index 9a2f499..edf0d44 100644 --- a/src/kinase_library/utils/utils.py +++ b/src/kinase_library/utils/utils.py @@ -494,9 +494,9 @@ def unprime_substrate(sub): #%% """ -################### -# Other utilities # -################### +########### +# Protein # +########### """ def parse_phosphosites(sequence, phosphoacceptor=['S', 'T', 'Y'], pp=False): From e72e12c2decec3d8a53c651ca6f3fbee38e7fd2b Mon Sep 17 00:00:00 2001 From: Tomer Yaron-Barir Date: Fri, 27 Jun 2025 11:04:12 -0400 Subject: [PATCH 3/8] Function to score all the possible phosphosites on a protein --- src/kinase_library/__init__.py | 1 + src/kinase_library/modules/scoring.py | 86 +++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 src/kinase_library/modules/scoring.py diff --git a/src/kinase_library/__init__.py b/src/kinase_library/__init__.py index 1c19b1d..0c4eea8 100644 --- a/src/kinase_library/__init__.py +++ b/src/kinase_library/__init__.py @@ -15,6 +15,7 @@ from .utils.utils import * from .modules.data import * +from .modules.scoring import * from .modules.enrichment import * from .logger import TqdmToLoggerWithStatus, logger diff --git a/src/kinase_library/modules/scoring.py b/src/kinase_library/modules/scoring.py new file mode 100644 index 0000000..9448a4a --- /dev/null +++ b/src/kinase_library/modules/scoring.py @@ -0,0 +1,86 @@ +""" +################################ +# The Kinase Library - Scoring # +################################ +""" + +import os +import numpy as np +import pandas as pd + +from ..objects import phosphoproteomics +from ..utils import _global_vars, exceptions, utils + +#%% +""" +################# +# Score Protein # +################# +""" + +def score_protein(seq, kinases=None, phosphoacceptor=['S', 'T', 'Y'], + pp=False, st_fav=True, non_canonical=False, + score_promiscuity_threshold=1, + percentile_promiscuity_threshold=90, + score_round_digits=3, percentile_round_digits=2): + """ + Score all potential phosphosites in a given protein sequence. + + + seq : str + Protein sequence to analyze for phosphorylation sites. + kinases : list, optional + List of specific kinases to consider in the analysis. + If None, all available kinases are used. The default is None. + phosphoacceptor : list, optional + Amino acid residues that can be phosphorylated. The default is ['S', 'T', 'Y']. + pp : bool, optional + Phospho-residues (s/t/y). The default is False. + st_fav : bool, optional + S/T favorability. The default is True. + non_canonical : bool, optional + Return also non-canonical kinases. For tyrosine kinases only. The default is False. + score_promiscuity_threshold : float, optional + Score threshold above which kinases are considered predicted. + percentile_promiscuity_threshold : float, optional + Percentile threshold above which kinases are considered predicted. + score_round_digits : int, optional + Number of decimal digits for score. The default is 4. + percentile_round_digits : int, optional + Number of decimal digits for percentile. The default is 2. + + Raises + ------ + ValueError + If phosphoacceptor residues are not a subset of valid phosphorylatable residues. + + Returns + ------- + results : dict + Dictionary containing prediction results with two keys: + - 'ser_thr': Predictions for serine/threonine kinases + - 'tyrosine': Predictions for tyrosine kinases + Each prediction includes kinase scores, percentiles, ranks, and promiscuity indices. + + """ + + if not set(phosphoacceptor).issubset(_global_vars.valid_phos_res): + raise ValueError(f'Phosphoacceptor(s) must be a subset of {_global_vars.valid_phos_res}.') + + phos_sites = utils.parse_phosphosites(seq, phosphoacceptor=phosphoacceptor, pp=pp) + + pps = phosphoproteomics.PhosphoProteomics(data=phos_sites, seq_col='Sequence', pp=pp, new_seq_phos_res_cols=False, suppress_warnings=False) + + results = {'ser_thr': pps.predict(kin_type='ser_thr', kinases=kinases, st_fav=st_fav, + score_promiscuity_threshold=score_promiscuity_threshold, + percentile_promiscuity_threshold=percentile_promiscuity_threshold, + score_round_digits=percentile_promiscuity_threshold, + percentile_round_digits=percentile_round_digits), + 'tyrosine': pps.predict(kin_type='tyrosine', kinases=kinases, non_canonical=non_canonical, + score_promiscuity_threshold=score_promiscuity_threshold, + percentile_promiscuity_threshold=percentile_promiscuity_threshold, + score_round_digits=percentile_promiscuity_threshold, + percentile_round_digits=percentile_round_digits) + } + + return(results) \ No newline at end of file From 79b3a519c020d0272b20aab143a0764ebecff129 Mon Sep 17 00:00:00 2001 From: Tomer Yaron-Barir Date: Fri, 27 Jun 2025 11:04:33 -0400 Subject: [PATCH 4/8] Bumping version --- src/kinase_library/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kinase_library/__init__.py b/src/kinase_library/__init__.py index 0c4eea8..569361b 100644 --- a/src/kinase_library/__init__.py +++ b/src/kinase_library/__init__.py @@ -24,7 +24,7 @@ #%% -__version__ = "1.4.1" +__version__ = "1.5.0" #%% Loading scored phosphoproteome one time per session From b9e0391fb0636180bd023fc709897f098c862599 Mon Sep 17 00:00:00 2001 From: Tomer Yaron-Barir Date: Fri, 27 Jun 2025 11:08:46 -0400 Subject: [PATCH 5/8] Fixing parameters assignment --- src/kinase_library/modules/scoring.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kinase_library/modules/scoring.py b/src/kinase_library/modules/scoring.py index 9448a4a..88aa6b6 100644 --- a/src/kinase_library/modules/scoring.py +++ b/src/kinase_library/modules/scoring.py @@ -74,12 +74,12 @@ def score_protein(seq, kinases=None, phosphoacceptor=['S', 'T', 'Y'], results = {'ser_thr': pps.predict(kin_type='ser_thr', kinases=kinases, st_fav=st_fav, score_promiscuity_threshold=score_promiscuity_threshold, percentile_promiscuity_threshold=percentile_promiscuity_threshold, - score_round_digits=percentile_promiscuity_threshold, + score_round_digits=score_round_digits, percentile_round_digits=percentile_round_digits), 'tyrosine': pps.predict(kin_type='tyrosine', kinases=kinases, non_canonical=non_canonical, score_promiscuity_threshold=score_promiscuity_threshold, percentile_promiscuity_threshold=percentile_promiscuity_threshold, - score_round_digits=percentile_promiscuity_threshold, + score_round_digits=score_round_digits, percentile_round_digits=percentile_round_digits) } From d929364378220f32c575f6d85838d937a85c1263 Mon Sep 17 00:00:00 2001 From: Tomer Yaron-Barir Date: Fri, 27 Jun 2025 11:11:42 -0400 Subject: [PATCH 6/8] Removing unused imports --- src/kinase_library/modules/scoring.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/kinase_library/modules/scoring.py b/src/kinase_library/modules/scoring.py index 88aa6b6..cb532ca 100644 --- a/src/kinase_library/modules/scoring.py +++ b/src/kinase_library/modules/scoring.py @@ -4,12 +4,8 @@ ################################ """ -import os -import numpy as np -import pandas as pd - from ..objects import phosphoproteomics -from ..utils import _global_vars, exceptions, utils +from ..utils import _global_vars, utils #%% """ From 165a698436f64deb70d4b59b1a297e341947f0b4 Mon Sep 17 00:00:00 2001 From: Tomer Yaron-Barir Date: Fri, 27 Jun 2025 11:24:52 -0400 Subject: [PATCH 7/8] Splitting kinases by kinase type when kinase list is provided --- src/kinase_library/modules/scoring.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/kinase_library/modules/scoring.py b/src/kinase_library/modules/scoring.py index cb532ca..b8f682f 100644 --- a/src/kinase_library/modules/scoring.py +++ b/src/kinase_library/modules/scoring.py @@ -5,7 +5,8 @@ """ from ..objects import phosphoproteomics -from ..utils import _global_vars, utils +from ..utils import _global_vars, exceptions, utils +from ..modules import data #%% """ @@ -62,17 +63,25 @@ def score_protein(seq, kinases=None, phosphoacceptor=['S', 'T', 'Y'], if not set(phosphoacceptor).issubset(_global_vars.valid_phos_res): raise ValueError(f'Phosphoacceptor(s) must be a subset of {_global_vars.valid_phos_res}.') + if kinases is not None: + kinases = [kin.upper() for kin in kinases] + exceptions.check_kin_name(kinases) + ser_thr_kins = [kin for kin in kinases if kin in data.get_kinase_list('ser_thr')] + tyrosine_kins = [kin for kin in kinases if kin in data.get_kinase_list('tyrosine')] + else: + ser_thr_kins = None + tyrosine_kins = None phos_sites = utils.parse_phosphosites(seq, phosphoacceptor=phosphoacceptor, pp=pp) pps = phosphoproteomics.PhosphoProteomics(data=phos_sites, seq_col='Sequence', pp=pp, new_seq_phos_res_cols=False, suppress_warnings=False) - results = {'ser_thr': pps.predict(kin_type='ser_thr', kinases=kinases, st_fav=st_fav, + results = {'ser_thr': pps.predict(kin_type='ser_thr', kinases=ser_thr_kins, st_fav=st_fav, score_promiscuity_threshold=score_promiscuity_threshold, percentile_promiscuity_threshold=percentile_promiscuity_threshold, score_round_digits=score_round_digits, percentile_round_digits=percentile_round_digits), - 'tyrosine': pps.predict(kin_type='tyrosine', kinases=kinases, non_canonical=non_canonical, + 'tyrosine': pps.predict(kin_type='tyrosine', kinases=tyrosine_kins, non_canonical=non_canonical, score_promiscuity_threshold=score_promiscuity_threshold, percentile_promiscuity_threshold=percentile_promiscuity_threshold, score_round_digits=score_round_digits, From eb2aec2505f1c3f426b7e93ffebca01162813ee0 Mon Sep 17 00:00:00 2001 From: Tomer Yaron-Barir Date: Fri, 27 Jun 2025 11:27:45 -0400 Subject: [PATCH 8/8] Fixing documentations --- src/kinase_library/modules/scoring.py | 2 +- .../objects/phosphoproteomics.py | 213 +++++++++--------- src/kinase_library/utils/utils.py | 5 +- 3 files changed, 109 insertions(+), 111 deletions(-) diff --git a/src/kinase_library/modules/scoring.py b/src/kinase_library/modules/scoring.py index b8f682f..f1526db 100644 --- a/src/kinase_library/modules/scoring.py +++ b/src/kinase_library/modules/scoring.py @@ -42,7 +42,7 @@ def score_protein(seq, kinases=None, phosphoacceptor=['S', 'T', 'Y'], percentile_promiscuity_threshold : float, optional Percentile threshold above which kinases are considered predicted. score_round_digits : int, optional - Number of decimal digits for score. The default is 4. + Number of decimal digits for score. The default is 3. percentile_round_digits : int, optional Number of decimal digits for percentile. The default is 2. diff --git a/src/kinase_library/objects/phosphoproteomics.py b/src/kinase_library/objects/phosphoproteomics.py index daede32..8210baf 100644 --- a/src/kinase_library/objects/phosphoproteomics.py +++ b/src/kinase_library/objects/phosphoproteomics.py @@ -67,19 +67,19 @@ class PhosphoProteomics(object): 21390 ESSPILTsFELVKVP 21391 THRRMVVsMPNLQDI """ - + def __init__(self, data, seq_col=None, pad=False, pp=False, drop_invalid_subs=True, new_seq_phos_res_cols=True, suppress_warnings=False): - + if not isinstance(data, pd.DataFrame): raise ValueError('\'data\' must be a pd.DataFrame.') - + if seq_col is None: seq_col = _global_vars.default_seq_col - + if drop_invalid_subs: processed_data,omited_entries = utils.filter_invalid_subs(data=data, seq_col=seq_col, suppress_warnings=suppress_warnings) else: @@ -88,39 +88,39 @@ def __init__(self, data, seq_col=None, self.omited_entries = omited_entries if len(omited_entries)>0 and not suppress_warnings: print('Use the \'omited_entries\' attribute to view dropped enteries due to invalid sequences.') - + subs_list = processed_data[seq_col] if pad: subs_list = processed_data[seq_col].apply(lambda x: '_'*pad[0] + x + '_'*pad[1]) subs_list = subs_list.apply(utils.sequence_to_substrate, pp=pp, validate_phos_res=drop_invalid_subs, validate_aa=drop_invalid_subs) - + phos_res = subs_list.str.lower().str[7] - + if new_seq_phos_res_cols: processed_data = processed_data.rename({_global_vars.default_seq_col: 'ORIGINAL_'+_global_vars.default_seq_col, 'phos_res': 'original_phos_res'}, axis=1) processed_data['phos_res'] = phos_res processed_data[_global_vars.default_seq_col] = subs_list - + self.data = processed_data self.original_data = data self.seq_col = seq_col self.substrates = processed_data[_global_vars.default_seq_col] self.phos_res = processed_data['phos_res'] self.pp = pp - + self.ser_thr_data = processed_data[processed_data['phos_res'].isin(['S','T','s','t'])] self.ser_thr_substrates = self.ser_thr_data[_global_vars.default_seq_col] self._ser_thr_phos_res = self.ser_thr_data['phos_res'] self.tyrosine_data = processed_data[processed_data['phos_res'].isin(['Y','y'])] self.tyrosine_substrates = self.tyrosine_data[_global_vars.default_seq_col] self._tyrosine_phos_res = self.tyrosine_data['phos_res'] - - + + @classmethod def from_file(cls, data_file, seq_col=None, pad=False, pp=False, drop_invalid_subs=True, new_seq_phos_res_cols=True, suppress_warnings=False, **file_args): """ Create PhosphoProteomics object from file. - + Parameters ---------- data_file : str @@ -140,12 +140,12 @@ def from_file(cls, data_file, seq_col=None, pad=False, pp=False, drop_invalid_su Do not print warnings. The default is False. **file_args : args Key arguments for pd.read_csv(). - + Returns ------- pps : kl.PhosphoProteomics PhosphoProteomics object with the data from the file. - + Examples ------- >>> pps = kl.PhosphoProteomics(data_file='./../databases/substrates/Kinase_Substrate_Dataset_count_07_2021.txt', skiprows=3) @@ -175,9 +175,9 @@ def from_file(cls, data_file, seq_col=None, pad=False, pp=False, drop_invalid_su 21390 ESSPILTsFELVKVP 21391 THRRMVVsMPNLQDI """ - + file_type = data_file.split('.')[-1] - + if file_type == 'parquet': data = pq.read_table(data_file).to_pandas() elif file_type in ['xlsx','xls']: @@ -186,19 +186,19 @@ def from_file(cls, data_file, seq_col=None, pad=False, pp=False, drop_invalid_su data = pd.read_csv(data_file, **file_args) else: data = pd.read_csv(data_file, sep = '\t', **file_args) - + if seq_col is None: seq_col = _global_vars.default_seq_col - + pps = cls(data, seq_col=seq_col, pad=pad, pp=pp, drop_invalid_subs=drop_invalid_subs, new_seq_phos_res_cols=new_seq_phos_res_cols, suppress_warnings=suppress_warnings) - + return(pps) - - + + def _calculate_subs_binary_matrix(self, kin_type=['ser_thr','tyrosine'], pp=None, pos=None): """ Making a binary matrix for a substrate. - + Parameters ---------- kin_type : str or list, optional @@ -212,32 +212,32 @@ def _calculate_subs_binary_matrix(self, kin_type=['ser_thr','tyrosine'], pp=None ------- Setting self.*kin_type*_bin_matrix attribute for binary matrix. """ - + if isinstance(kin_type, str): kin_type = [kin_type] - + if pp is None: pp = self.pp - + for kt in kin_type: exceptions.check_kin_type(kt) - + aa_labels = data.get_aa() if pos is None: pos = data.get_positions(kt) - + substrates = getattr(self, kt + '_substrates') - + subs_mat = utils.sub_binary_matrix(substrates, aa=aa_labels, pos=pos, pp=pp) setattr(self, '_' + kt + '_bin_matrix', subs_mat) - - + + def score(self, kin_type=None, kinases=None, st_fav=True, non_canonical=False, values_only=False, log2_score=True, pos=None, round_digits=3, return_values=True): """ Calculate score of the phosphoproteomics data for the given kinases. - + Score is being computed in a vectorized way: 1. Making binary matrix for the substrates. 2. Converting kinase matrix (norm-scaled) to log2 @@ -273,30 +273,30 @@ def score(self, kin_type=None, kinases=None, st_fav=True, * additional column with the -/+7 amino acids substrate * scores for all specificed kinases """ - + if all(v is None for v in [kin_type, kinases]): raise ValueError('Either list of kinases or kinase type must be provided.') - + if kinases is None: kinases = data.get_kinase_list(kin_type, non_canonical=non_canonical) elif isinstance(kinases, str): kinases = [kinases] - + kinases = [x.upper() for x in kinases] exceptions.check_kin_name(kinases) - + if kin_type is None: kin_type = data.get_kinase_type(kinases[0]) else: exceptions.check_kin_type(kin_type) exceptions.check_kin_list_type(kinases, kin_type=kin_type) - + print('Scoring '+str(len(getattr(self,kin_type+'_substrates')))+' '+kin_type+' substrates') logger.info('Scoring '+str(len(getattr(self,kin_type+'_substrates')))+' '+kin_type+' substrates') if not hasattr(self, '_' + kin_type + '_bin_matrix'): self._calculate_subs_binary_matrix(kin_type=kin_type, pp=self.pp, pos=pos) subs_bin_mat = getattr(self, '_' + kin_type + '_bin_matrix') - + # Using table with all the matrices concatenated (log2) kin_mat_log2 = data.get_multiple_matrices(kinases, kin_type=kin_type, mat_type='log2', pos=pos) @@ -308,29 +308,29 @@ def score(self, kin_type=None, kinases=None, st_fav=True, st_fav_scores_log2 = np.log2(st_fav_scores) score_log2 = score_log2 + st_fav_scores_log2 score = np.power(2,score_log2) - + if log2_score: score_output = score_log2 else: score_output = score - + score_output = score_output.round(round_digits) score_rank_output = score_output.rank(method='min', ascending=False, axis=1).astype(int) data_index = getattr(self, kin_type + '_data').index data_score_output = pd.concat([getattr(self, kin_type + '_data').reset_index(drop=True),score_output.reset_index(drop=True)], axis=1) data_score_output.index = data_index - + setattr(self, kin_type+'_scores', score_output) setattr(self, kin_type+'_score_ranks', score_rank_output) setattr(self, kin_type+'_scored_kins', kinases) - + if return_values: if values_only: return(score_output) return(data_score_output) - - + + def percentile(self, kin_type=None, kinases=None, st_fav=True, non_canonical=False, subs_scores=None, subs_scores_format=None, @@ -339,7 +339,7 @@ def percentile(self, kin_type=None, kinases=None, round_digits=2, return_values=True): """ Calculate the percentile score of the phosphoproteomics data for the given kinases. - + After score is being computed, the percentile of that score is being computed based on a basal scored phosphoproteome. @@ -381,24 +381,24 @@ def percentile(self, kin_type=None, kinases=None, * additional column with the -/+7 amino acids substrate * percentiles for all specificed kinases """ - + if all(v is None for v in [kin_type, kinases]): raise ValueError('Either list of kinases or kinase type must be provided.') - + if kinases is None: kinases = data.get_kinase_list(kin_type, non_canonical=non_canonical) elif isinstance(kinases, str): kinases = [kinases] - + kinases = [x.upper() for x in kinases] exceptions.check_kin_name(kinases) - + if kin_type is None: kin_type = data.get_kinase_type(kinases[0]) else: exceptions.check_kin_type(kin_type) exceptions.check_kin_list_type(kinases, kin_type=kin_type) - + percent_output = [] if subs_scores is None: @@ -413,12 +413,12 @@ def percentile(self, kin_type=None, kinases=None, raise ValueError('Please specify the format of input score data (\'subs_scores_format\').') elif subs_scores_format not in ['linear','log2']: raise ValueError('Please provide valid value for \'subs_scores_format\': \'linear\' or \'log2\'.') - + if (subs_scores_format == 'linear'): score = np.log2(subs_scores) else: score = subs_scores.copy() - + if len(score) == 0: # Data is empty - return empty dataframe percent_output = score.copy() setattr(self, kin_type+'_percentiles', percent_output) @@ -429,18 +429,18 @@ def percentile(self, kin_type=None, kinases=None, if values_only: return(percent_output) return(data_percent_output) - + if customized_scored_phosprot is not None: all_scored_phosprot = customized_scored_phosprot else: all_scored_phosprot = core.ScoredPhosphoProteome(phosprot_name=_global_vars.phosprot_name, phosprot_path=phosprot_path) - + if kin_type is None: kin_type = data.get_kinase_type(kinases[0]) else: exceptions.check_kin_type(kin_type) exceptions.check_kin_list_type(kinases, kin_type=kin_type) - + if kin_type == 'ser_thr': scored_phosprot = all_scored_phosprot.ser_thr_scores elif kin_type == 'tyrosine': @@ -448,35 +448,35 @@ def percentile(self, kin_type=None, kinases=None, else: raise ValueError('Wrong kinase type.') scored_phosprot = scored_phosprot.loc[:,kinases] # only for requested kinases if subset - + # If scored phopshoproteome is linear values - converting it to log2 values if not all_scored_phosprot.log2_values: scored_phosprot = np.log2(scored_phosprot) - + print('Calculating percentile for '+str(len(getattr(self,kin_type+'_substrates')))+' '+kin_type+' substrates') logger.info('Calculating percentile for '+str(len(getattr(self,kin_type+'_substrates')))+' '+kin_type+' substrates') percent_output = scored_phosprot.progress_apply(lambda x: x.sort_values().searchsorted(score[x.name], side='right'))/len(scored_phosprot)*100 percent_output.index = score.index - + percent_output = percent_output.round(round_digits) percent_rank_output = percent_output.rank(method='min', ascending=False, axis=1).astype(int) - + data_index = getattr(self, kin_type + '_data').index data_percent_output = pd.concat([getattr(self, kin_type + '_data').reset_index(drop=True),percent_output.reset_index(drop=True)], axis=1) data_percent_output.index = data_index - + setattr(self, kin_type+'_percentiles', percent_output) setattr(self, kin_type+'_percentile_ranks', percent_rank_output) setattr(self, kin_type+'_percentiled_kins', kinases) - + self.phosprot_name = _global_vars.phosprot_name - + if return_values: if values_only: return(percent_output) return(data_percent_output) - - + + def rank(self, metric, kin_type=None, kinases=None, st_fav=True, non_canonical=False, pos=None, rank_kinases=None, values_only=False, @@ -518,7 +518,7 @@ def rank(self, metric, kin_type=None, kinases=None, ranks : pd.DataFrame Ranks of the kinases based on the specified scoring metric. """ - + if all(v is None for v in [kin_type, kinases]): raise ValueError('Either list of kinases or kinase type must be provided.') if kinases is None: @@ -543,23 +543,23 @@ def rank(self, metric, kin_type=None, kinases=None, exceptions.check_kin_list_type(rank_kinases, kin_type=kin_type) exceptions.check_kin_list_type(kinases, kin_type=kin_type) exceptions.check_scoring_metric(metric) - + if metric == 'score': self.score(kin_type=kin_type, kinases=rank_kinases, st_fav=st_fav, non_canonical=non_canonical, return_values=False, pos=pos, round_digits=score_round_digits) elif metric == 'percentile': self.percentile(kin_type=kin_type, kinases=rank_kinases, st_fav=st_fav, non_canonical=non_canonical, return_values=False, pos=pos, round_digits=percentile_round_digits) - + rank_output = getattr(self, kin_type+'_'+metric+'_ranks')[kinases] - + data_index = getattr(self, kin_type + '_data').index data_rank_output = pd.concat([getattr(self, kin_type + '_data').reset_index(drop=True),rank_output.reset_index(drop=True)], axis=1) data_rank_output.index = data_index - + if values_only: return(rank_output) return(data_rank_output) - + def predict(self, metric=['score','percentile'], kin_type=None, kinases=None, st_fav=True, non_canonical=False, values_only=False, score_promiscuity_threshold=1, percentile_promiscuity_threshold=90, @@ -589,7 +589,7 @@ def predict(self, metric=['score','percentile'], kin_type=None, kinases=None, pos : list, optional List of positions to use in the matrix rows. The default is None. score_round_digits : int, optional - Number of decimal digits for score. The default is 4. + Number of decimal digits for score. The default is 3. percentile_round_digits : int, optional Number of decimal digits for percentile. The default is 2. @@ -620,7 +620,7 @@ def predict(self, metric=['score','percentile'], kin_type=None, kinases=None, if isinstance(metric, str): metric = [metric] - + prediction_output = pd.DataFrame(index=getattr(self, kin_type+'_substrates')) if 'score' in metric: @@ -633,7 +633,7 @@ def predict(self, metric=['score','percentile'], kin_type=None, kinases=None, percentiles = getattr(self, kin_type+'_percentiles')[kinases] percent_promis = self.promiscuity_index(kin_type=kin_type, kinases=kinases, metric='percentile', threshold=percentile_promiscuity_threshold, pos=pos, st_fav=st_fav, non_canonical=non_canonical, values_only=True) prediction_output = pd.concat([prediction_output, percent_promis], axis=1) - + for kin in kinases: if 'score' in metric: score_df = pd.DataFrame({kin+'_score': scores[kin], kin+'_score_rank': score_ranks[kin]}) @@ -642,16 +642,16 @@ def predict(self, metric=['score','percentile'], kin_type=None, kinases=None, if 'percentile' in metric: percentile_df = pd.DataFrame({kin+'_percentile': percentiles[kin], kin+'_percentile_rank': percentile_ranks[kin]}) prediction_output = pd.concat([prediction_output, percentile_df], axis=1) - + data_index = getattr(self, kin_type + '_data').index data_prediction_output = pd.concat([getattr(self, kin_type + '_data').reset_index(drop=True),prediction_output.reset_index(drop=True)], axis=1) data_prediction_output.index = data_index - + if values_only: return(prediction_output) return(data_prediction_output) - - + + def promiscuity_index(self, kin_type=None, kinases=None, metric='percentile', threshold=90, pos=None, st_fav=True, non_canonical=False, @@ -689,7 +689,7 @@ def promiscuity_index(self, kin_type=None, kinases=None, None. """ - + if all(v is None for v in [kin_type, kinases]): raise ValueError('Either list of kinases or kinase type must be provided.') if kinases is None: @@ -702,28 +702,28 @@ def promiscuity_index(self, kin_type=None, kinases=None, else: exceptions.check_kin_type(kin_type) exceptions.check_kin_list_type(kinases, kin_type=kin_type) - + if not hasattr(self, kin_type+'_'+metric+'s'): if metric == 'score': self.score(kin_type=kin_type, kinases=kinases, st_fav=st_fav, non_canonical=non_canonical, pos=pos, return_values=False) elif metric == 'percentile': self.percentile(kin_type=kin_type, kinases=kinases, st_fav=st_fav, non_canonical=non_canonical, pos=pos, return_values=False) - + metric_data = getattr(self, kin_type+'_'+metric+'s') promis_idx = (metric_data >= threshold).sum(axis=1) promis_idx.name = metric.capitalize() + ' Promiscuity Index' - + setattr(self, kin_type+'_'+metric+'_'+'promiscuity_index', promis_idx) - + data_index = getattr(self, kin_type + '_data').index data_promis_output = pd.concat([getattr(self, kin_type + '_data').reset_index(drop=True),promis_idx.reset_index(drop=True)], axis=1) data_promis_output.index = data_index - + if values_only: return(promis_idx) return(data_promis_output) - - + + def submit_scores(self, kin_type, scores, suppress_messages=False): """ Submitting scores for the substrates. @@ -743,30 +743,30 @@ def submit_scores(self, kin_type, scores, suppress_messages=False): ------- None. """ - + exceptions.check_kin_type(kin_type) if ~(scores.columns.isin(data.get_kinase_list(kin_type, non_canonical=True)).all()): raise ValueError(f'Score columns must contain only valid {kin_type} kinases. Use kl.get_kinase_list() to get the list of valid kinases.') - + data_subs = getattr(self, kin_type + '_substrates') scores_unique = scores[~scores.index.duplicated(keep='first')] - + if not set(data_subs) <= set(scores_unique.index): raise ValueError('Scores must be provided for all substrates in the data.') - + if scores_unique.isna().any().any(): raise ValueError('Some score values are missing.') - + subs_scores = scores_unique.loc[data_subs] - + score_rank = subs_scores.rank(method='min', ascending=False, axis=1).astype(int) setattr(self, kin_type+'_scores', subs_scores) setattr(self, kin_type+'_score_ranks', score_rank) - + if not suppress_messages: print('Scores submitted successfully.') - - + + def submit_percentiles(self, kin_type, percentiles, phosprot_name=None, suppress_messages=False): """ Submitting percentiles for the substrates. @@ -788,7 +788,7 @@ def submit_percentiles(self, kin_type, percentiles, phosprot_name=None, suppress ------- None. """ - + exceptions.check_kin_type(kin_type) if ~(percentiles.columns.isin(data.get_kinase_list(kin_type, non_canonical=True)).all()): raise ValueError(f'Percentile columns must contain only valid {kin_type} kinases. Use kl.get_kinase_list() to get the list of valid kinases.') @@ -797,27 +797,27 @@ def submit_percentiles(self, kin_type, percentiles, phosprot_name=None, suppress data_subs = getattr(self, kin_type + '_substrates') percentiles_unique = percentiles[~percentiles.index.duplicated(keep='first')] - + if not set(data_subs) <= set(percentiles_unique.index): raise ValueError('Percentiles must be provided for all substrates in the data.') - + if percentiles_unique.isna().any().any(): raise ValueError('Some percentile values are missing.') - + subs_percentiles = percentiles_unique.loc[data_subs] - + percentile_rank = subs_percentiles.rank(method='min', ascending=False, axis=1).astype(int) setattr(self, kin_type+'_percentiles', subs_percentiles) setattr(self, kin_type+'_percentile_ranks', percentile_rank) - + if phosprot_name is None: phosprot_name = _global_vars.phosprot_name self.phosprot_name = phosprot_name - + if not suppress_messages: print('Percentiles submitted successfully.') - - + + def merge_data_scores(self, kin_type, score_type): """ Merging phosphoproteome data and score data. @@ -834,13 +834,12 @@ def merge_data_scores(self, kin_type, score_type): merged_data : dataframe Merged dataframe of the phosphoproteome data and score data. """ - + exceptions.check_kin_type(kin_type) exceptions.check_score_type(score_type) - + data = getattr(self, kin_type+'_data').set_index(_global_vars.default_seq_col, drop=False) scores = getattr(self, kin_type+'_'+score_type) merged_data = pd.concat([data,scores], axis=1) - - return(merged_data) - \ No newline at end of file + + return(merged_data) \ No newline at end of file diff --git a/src/kinase_library/utils/utils.py b/src/kinase_library/utils/utils.py index edf0d44..1e37294 100644 --- a/src/kinase_library/utils/utils.py +++ b/src/kinase_library/utils/utils.py @@ -9,7 +9,6 @@ import pandas as pd import matplotlib.pyplot as plt from .. import logomaker -from tqdm import tqdm from sklearn.preprocessing import MultiLabelBinarizer import warnings @@ -516,8 +515,8 @@ def parse_phosphosites(sequence, phosphoacceptor=['S', 'T', 'Y'], pp=False): Returns ------- pd.DataFrame - DataFrame with columns: residue, position, window. - Window is 15-mer centered on phosphosite, padded with '-'. + DataFrame with columns: Residue, Position, Sequence. + Sequence is 15-mer centered on phosphosite, padded with '_'. """ if not set(phosphoacceptor).issubset(_global_vars.valid_phos_res):