From 892ee823197f08e558e9e7904a36295067cfa599 Mon Sep 17 00:00:00 2001
From: Tomer Yaron-Barir <tmy2110@cumc.columbia.edu>
Date: Wed, 25 Jun 2025 18:55:46 -0400
Subject: [PATCH 1/8] Creating parsing function for a sequence

---
 src/kinase_library/utils/utils.py | 160 +++++++++++++++++++++---------
 1 file changed, 115 insertions(+), 45 deletions(-)

diff --git a/src/kinase_library/utils/utils.py b/src/kinase_library/utils/utils.py
index 98610f8..9a2f499 100644
--- a/src/kinase_library/utils/utils.py
+++ b/src/kinase_library/utils/utils.py
@@ -3,7 +3,6 @@
 # The Kinase Library - Utilities #
 ##################################
 """
-import os
 import re
 import string
 import numpy as np
@@ -11,7 +10,6 @@
 import matplotlib.pyplot as plt
 from .. import logomaker
 from tqdm import tqdm
-from Bio import SeqIO
 from sklearn.preprocessing import MultiLabelBinarizer
 import warnings
 
@@ -58,7 +56,7 @@ def matrix_to_df(mat, kin_type=None, pp=True, k_mod=False, mat_type='log2', cols
     df_mat : pd.DataFrame
         Matrix as Pandas dataframe.
     """
-    
+
     if cols is None:
         if mat_type == 'densitometry':
             cols = [str(i) for i in range(mat.shape[1])]
@@ -69,13 +67,13 @@ def matrix_to_df(mat, kin_type=None, pp=True, k_mod=False, mat_type='log2', cols
                 cols = _global_vars.aa_phos
             else:
                 cols = _global_vars.aa_all_ptm
-                
+
     if rows is None:
         if mat_type == 'densitometry':
             rows = list(string.ascii_uppercase[:mat.shape[1]])
         elif (mat_type in _global_vars.valid_mat_types) and (kin_type in _global_vars.valid_kin_types):
                 rows = data.get_positions(kin_type)
-    
+
     if len(rows) != mat.shape[0]:
         if rows is None:
             raise Exception('As default, {} matrices must be of shape of {}'.format(kin_type,(len(rows),len(cols))))
@@ -86,9 +84,9 @@ def matrix_to_df(mat, kin_type=None, pp=True, k_mod=False, mat_type='log2', cols
             raise Exception('As default, {} matrices must be of shape of {}'.format(kin_type,(len(rows),len(cols))))
         else:
             raise Exception('Length of provided columns ({}) does not match the number of columns in the provided matrix ({})'.format(len(cols),mat.shape[1]))
-    
+
     df_mat = pd.DataFrame(mat, index=rows, columns=cols)
-    
+
     return(df_mat)
 
 
@@ -106,7 +104,7 @@ def flatten_matrix(matrix):
     flat_matrix : np.ndarray
         Flatten matrix.
     """
-    
+
     flat_matrix = matrix.transpose().values.reshape(matrix.shape[0] * matrix.shape[1],1).transpose()
     return (flat_matrix)
 
@@ -180,10 +178,10 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05,
     fig : plt.figure()
         If specified, returning the figure object with the sequence logo.
     """
-    
+
     if font_name == 'Arial Rounded MT Bold' and font_name not in logomaker.list_font_names():
         font_name = 'sans'
-    
+
     if ax is None:
         zero_lw = 3
         fig,ax = plt.subplots(figsize = [10,5])
@@ -192,10 +190,10 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05,
         plot = False
         if save_fig or return_fig:
             raise ValueError('When Axes provided, \'save_fig\', and \'return_fig\' must be False.')
-    
-    if color_dict is None: 
+
+    if color_dict is None:
         color_dict = _global_vars.aa_colors
-    
+
     if logo_type == 'ratio_to_random':
         if random_aa_value is None:
             raise Exception('If method is specified to be ratio to random, \'random_aa_value\' must be provided.')
@@ -213,7 +211,7 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05,
         norm_zero_pos = {key: zero_pos[key]/sum(zero_pos.values())*max_height for key in zero_pos.keys()}
         for aa in zero_pos.keys():
             height_matrix.loc[0,aa] = norm_zero_pos[aa]
-    
+
         xlims = ax.get_xlim()
         ylims = ax.get_ylim()
         logomaker.Logo(pd.DataFrame(height_matrix.loc[0]).transpose(), ax=ax, color_scheme=zero_pos_color, flip_below=flip_below, font_name=font_name)
@@ -224,7 +222,7 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05,
             ax.set_ylim(ylims)
     else:
         ax.axvline(0, color='k', linewidth=3, linestyle=':')
-    
+
     ax.set_xticks(ticks=sorted(height_matrix.index))
     ax.set_xticklabels(labels=['' if not xticks else str(x) if x<=0 else '+'+str(x) for x in sorted(height_matrix.index)], fontsize=xticks_fontsize, weight = 'bold')
     if yticks:
@@ -233,7 +231,7 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05,
         ax.tick_params(axis='y', which='both', left=False, labelleft=False)
     if title is not None:
         ax.set_title(title, fontsize=24)
-    
+
     if xlabel != False:
         ax.set_xlabel(xlabel, fontsize=xlabel_size)
     if ylabel != False:
@@ -246,24 +244,24 @@ def make_seq_logo(matrix, logo_type='ratio_to_median', random_aa_value=0.05,
                 ax.set_ylabel('Probability',fontsize = ylabel_size)
         else:
             ax.set_ylabel(ylabel, fontsize=ylabel_size)
-    
+
     if save_fig:
         fig.savefig(save_fig)
-    
+
     try:
         fig.tight_layout()
     except:
         pass
-    
+
     if not plot:
         try:
             plt.close(fig)
         except:
             pass
-    
+
     if return_fig:
         return(fig)
-    
+
 
 #%%
 """
@@ -296,27 +294,27 @@ def filter_invalid_subs(data, seq_col, suppress_warnings=True):
     omitted_enteries : pd.DataFrame
         Dropped enteries due to invalid sequence.
     """
-    
+
     data_no_na = data[data[seq_col].notna()]
     if not suppress_warnings and ((len(data)-len(data_no_na))>0):
         print(str(len(data)-len(data_no_na)) + ' entries were omitted due to empty value in the substrates column.')
-    
+
     escaped_aa_list = [re.escape(c) for c in _global_vars.valid_aa]
     data_no_invalid_aa = data_no_na[data_no_na[seq_col].astype(str).str.contains('^['+''.join(escaped_aa_list)+']+$')]
     if not suppress_warnings and ((len(data_no_na)-len(data_no_invalid_aa))>0):
         print(str(len(data_no_na)-len(data_no_invalid_aa)) + ' entries were omitted due to invalid amino acids or characters.')
-    
+
     data_no_even = data_no_invalid_aa[(data_no_invalid_aa[seq_col].str.len())%2 != 0]
     if not suppress_warnings and ((len(data_no_invalid_aa)-len(data_no_even))>0):
         print(str(len(data_no_invalid_aa)-len(data_no_even)) + ' entries were omitted due to even length (no central position).')
-    
+
     data_no_invalid_phos_acc = data_no_even[data_no_even[seq_col].apply(lambda x: x[len(x)//2]).isin(_global_vars.valid_phos_res)]
     if not suppress_warnings and ((len(data_no_even)-len(data_no_invalid_phos_acc))>0):
         print(str(len(data_no_even)-len(data_no_invalid_phos_acc)) + ' entries were omitted due to invalid central phosphoacceptor.')
 
     omitted_enteries = pd.concat([data, data_no_invalid_phos_acc])
     omitted_enteries = omitted_enteries.loc[omitted_enteries.astype(str).drop_duplicates(keep=False).index] #In order to deal with lists in the DataFrame
-    
+
     return(data_no_invalid_phos_acc, omitted_enteries)
 
 
@@ -344,7 +342,7 @@ def sequence_to_substrate(seq, pp=False, phos_pos=None, kin_type=None, validate_
     substrate : str
         15-mer with phosphoacceptor at the center.
     """
-    
+
     if validate_phos_res:
         if phos_pos is None:
             if (seq[len(seq)//2].lower() not in ['s','t','y']) | (len(seq) % 2 == 0):
@@ -354,27 +352,27 @@ def sequence_to_substrate(seq, pp=False, phos_pos=None, kin_type=None, validate_
     if validate_aa:
         if not _global_vars.valid_aa.issuperset(seq.upper()):
             raise Exception('Sequence contains invalid amino acids or characters ({}).\nAllowed characters are: {}'.format(seq.upper(),_global_vars.valid_aa))
-    
+
     if seq is np.nan:
         return(None)
 
     if phos_pos is None:
         phos_pos = len(seq)//2 + 1
-    
+
     if kin_type is not None:
         exceptions.check_kin_type(kin_type)
         phos_acc = seq[phos_pos-1]
         if phos_acc not in _global_vars.kin_type_phos_acc[kin_type]:
             raise Exception(f'Mismatch beteween kinase type ({kin_type}) and phosphoacceptor ({phos_acc}): {seq}')
-        
+
     pad_seq = '_'*7 + seq + '_'*7
     substrate = pad_seq[phos_pos-1:phos_pos+14]
-    
+
     if pp:
         substrate = ''.join([x.upper() if x not in ['s','t','y'] else x for x in substrate[:7]]) + substrate[7].lower() + ''.join([x.upper() if x not in ['s','t','y'] else x for x in substrate[8:]])
     else:
         substrate = substrate[:7].upper() + substrate[7].lower() + substrate[8:].upper()
-    
+
     return(substrate)
 
 
@@ -407,7 +405,7 @@ def sub_binary_matrix(substrates, pp=True, aa=None, pos=None, sub_pos=None, seq_
         If input is one substrate - returns single binary martix.
         If input is list of substrates - returns dataframe with flattened binary matrices.
     """
-    
+
     if isinstance(substrates, str):
         subs_list = pd.Series([substrates], name='Sequence')
     elif isinstance(substrates, list):
@@ -420,19 +418,19 @@ def sub_binary_matrix(substrates, pp=True, aa=None, pos=None, sub_pos=None, seq_
         subs_list = substrates[seq_col]
     else:
         raise Exception('Invalid substrates list.')
-        
+
     if not pp:
         subs_list = subs_list.apply(lambda x: unprime_substrate(x))
-    
+
     if pos is None:
         pos = list(range(1,max([len(x) for x in subs_list])+1))
     if sub_pos is None:
         sub_pos = range(-7,8)
     if aa is None:
         aa = data.get_aa()
-    
+
     subs_pos_aa = [[str(p)+a for a,p in zip(sub,sub_pos)] for sub in subs_list]
-    
+
     bin_mat_classes = [str(pos)+a for pos in pos for a in aa]
     mlb = MultiLabelBinarizer(classes=bin_mat_classes)
     with warnings.catch_warnings():
@@ -441,13 +439,13 @@ def sub_binary_matrix(substrates, pp=True, aa=None, pos=None, sub_pos=None, seq_
 
     if isinstance(substrates, str):
         return(pd.DataFrame(bin_mat.reshape(len(pos), len(aa)), index=pos, columns=aa).T)
-    
+
     if as_dict:
         bin_df = [pd.DataFrame(bin_mat.reshape(len(pos), len(aa)), index=pos, columns=aa).T for x in bin_mat]
         return(dict(zip(subs_list,bin_df)))
-    
+
     sub_mat = pd.DataFrame(bin_mat, index=subs_list, columns=bin_mat_classes)
-    
+
     return(sub_mat)
 
 
@@ -465,7 +463,7 @@ def substrate_type(sub):
     sub_type : str
         Substrate type ('ser_thr', 'tyrosine', or 'unknown').
     """
-    
+
     if sub[7].lower() in ['s','t']:
         sub_type = 'ser_thr'
     elif sub[7].lower() == 'y':
@@ -489,7 +487,7 @@ def unprime_substrate(sub):
     un_primed_sub : str
         The input substrate without phospho-residues.
     """
-        
+
     un_primed_sub = sub[:7].upper() + sub[7] + sub[8:].upper()
     return(un_primed_sub)
 
@@ -501,6 +499,78 @@ def unprime_substrate(sub):
 ###################
 """
 
+def parse_phosphosites(sequence, phosphoacceptor=['S', 'T', 'Y'], pp=False):
+    """
+    Parse protein sequence to identify phosphorylation sites.
+
+    Parameters
+    ----------
+    sequence : str
+        Protein sequence using one-letter amino acid codes.
+    phosphoacceptor : List[str], optional
+        Phosphoacceptors to parse (any combination of 'S', 'T', 'Y'). Default is ['S', 'T', 'Y'].
+    pp : bool, optional
+        Phospho-priming. If False, all non-central residues uppercase.
+        If True, keep S/T/Y case, others uppercase. Default is False.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with columns: residue, position, window.
+        Window is 15-mer centered on phosphosite, padded with '-'.
+    """
+
+    if not set(phosphoacceptor).issubset(_global_vars.valid_phos_res):
+        raise ValueError(f'Phosphoacceptor(s) must be a subset of {_global_vars.valid_phos_res}.')
+
+    # Create regex pattern for phosphoacceptors
+    pattern = '[' + ''.join([aa.upper() + aa.lower() for aa in phosphoacceptor]) + ']'
+
+    # Find all matches with their positions
+    matches = [(m.group(), m.start()) for m in re.finditer(pattern, sequence)]
+
+    sites = []
+
+    for residue, i in matches:
+        # Create 15-mer window
+        start_idx = max(0, i - 7)
+        end_idx = min(len(sequence), i + 8)
+        window_seq = sequence[start_idx:end_idx]
+
+        # Pad with '_'
+        upstream_pad = max(0, 7-i)
+        downstream_pad = max(0, (i+8) - len(sequence))
+        raw_window = '_'*upstream_pad + window_seq + '_'*downstream_pad
+
+        # Apply phospho-priming rules
+        if pp:
+            # Keep S/T/Y case, others uppercase, central always lowercase
+            window = ''.join([
+                char.upper() if char.upper() not in 'STY' else char
+                for char in raw_window[:7]
+            ]) + raw_window[7].lower() + ''.join([
+                char.upper() if char.upper() not in 'STY' else char
+                for char in raw_window[8:]
+            ])
+        else:
+            # All uppercase except central (lowercase)
+            window = raw_window[:7].upper() + raw_window[7].lower() + raw_window[8:].upper()
+
+        sites.append({
+            'Residue': residue.lower(),
+            'Position': i + 1,
+            'Sequence': window
+        })
+
+    return pd.DataFrame(sites)
+
+#%%
+"""
+###################
+# Other utilities #
+###################
+"""
+
 def list_font_names():
     """
     Returns a list of valid font_name options for use in sequence logo.
@@ -514,7 +584,7 @@ def list_font_names():
     fontname : list
         List of valid font_name names from logomaker. This will vary from system to system.
     """
-    
+
     return logomaker.list_font_names()
 
 
@@ -534,8 +604,8 @@ def list_series_to_df(subs_list, col_name=None):
     -------
     pd.DataFrame with one column containing the list or pd.Series.
     """
-    
+
     if isinstance(subs_list, pd.Series):
             return(subs_list.to_frame(name=col_name))
     if isinstance(subs_list, list):
-            return(pd.Series(subs_list).to_frame(name=col_name))
+            return(pd.Series(subs_list).to_frame(name=col_name))
\ No newline at end of file

From 282d0baaf97e8ed9c14296cdc5d7a6465c9cab71 Mon Sep 17 00:00:00 2001
From: Tomer Yaron-Barir <tmy2110@cumc.columbia.edu>
Date: Thu, 26 Jun 2025 19:26:04 -0400
Subject: [PATCH 2/8] Updating titles

---
 src/kinase_library/modules/enrichment.py | 188 +++++++++++------------
 src/kinase_library/utils/utils.py        |   6 +-
 2 files changed, 97 insertions(+), 97 deletions(-)

diff --git a/src/kinase_library/modules/enrichment.py b/src/kinase_library/modules/enrichment.py
index 00f14ae..fed5e05 100644
--- a/src/kinase_library/modules/enrichment.py
+++ b/src/kinase_library/modules/enrichment.py
@@ -28,9 +28,9 @@
 
 #%%
 """
-###########################
+################################
 # Differential Phosphorylation #
-###########################
+################################
 """
 
 def dp_regulated_sites(dp_data, lfc_col, lfc_thresh=0,
@@ -73,7 +73,7 @@ def dp_regulated_sites(dp_data, lfc_col, lfc_thresh=0,
         raise ValueError('P-values threshold must be between 0-1.')
     if not 0<=percent_thresh<=100:
         raise ValueError('Percent threshold must be between 0-100.')
-        
+
     all_dp_data = dp_data.copy()
     if drop_na:
         dp_data = all_dp_data[~all_dp_data[lfc_col].isna()]
@@ -84,13 +84,13 @@ def dp_regulated_sites(dp_data, lfc_col, lfc_thresh=0,
                 print(str(len(all_dp_data)-len(dp_data)) + ' entries were dropped due to empty value in the logFC or p-value columns.')
             else:
                 print(str(len(all_dp_data)-len(dp_data)) + ' entries were dropped due to empty value in the logFC column.')
-    
+
     dropped_enteries = pd.concat([all_dp_data, dp_data])
     dropped_enteries = dropped_enteries.loc[dropped_enteries.astype(str).drop_duplicates(keep=False).index] #In order to deal with lists in the DataFrame
-    
+
     if len(dropped_enteries)>0 and not suppress_warnings:
         print('Use the \'dp_dropped_enteries\' attribute to view dropped enteries due to invalid differential phosphorylation values.')
-    
+
     if percent_rank is not None:
         if percent_rank == 'logFC':
             sorted_dp_data = dp_data.sort_values(lfc_col, ascending=False)
@@ -101,7 +101,7 @@ def dp_regulated_sites(dp_data, lfc_col, lfc_thresh=0,
         else:
             raise ValueError('percent_rank must be either \'logFC\' or \'pvalue\'.')
         reg_sites_dict = dict(zip(['upreg','unreg','downreg'], np.split(sorted_dp_data, [int(percent_thresh/100*len(dp_data)), int((1-percent_thresh/100)*len(dp_data))])))
-    
+
     else:
         reg_sites_dict = {}
         if lfc_thresh == 0: # In case of lfc_thresh of zero, treat lfc of 0 as unregulated
@@ -122,7 +122,7 @@ def dp_regulated_sites(dp_data, lfc_col, lfc_thresh=0,
                 reg_sites_dict['upreg'] = dp_data[dp_data[lfc_col]>=lfc_thresh]
                 reg_sites_dict['downreg'] = dp_data[dp_data[lfc_col]<=-lfc_thresh]
                 reg_sites_dict['unreg'] = dp_data[dp_data[lfc_col].abs()<lfc_thresh]
-    
+
     return(dp_data, reg_sites_dict, dropped_enteries)
 
 #%%
@@ -148,7 +148,7 @@ def combine_binary_enrichment_results(enrichment_results_dict, data_type='kl_obj
     pval_col_name : str, optional
         Adjusted p-value column name. The default is None.
         If None, will be set to 'fisher_adj_pval'.
-    
+
     Raises
     ------
     ValueError
@@ -164,35 +164,35 @@ def combine_binary_enrichment_results(enrichment_results_dict, data_type='kl_obj
 
     if data_type not in ['kl_object','data_frame']:
         raise ValueError('data_type must be either \'kl_object\' or \'data_frame\'.')
-    
+
     enrichment_results = list(enrichment_results_dict.values())
     conds_list = list(enrichment_results_dict.keys())
-    
+
     if data_type == 'data_frame':
         enrichment_results_tables = enrichment_results
     else:
         enrichment_results_tables = [res.enrichment_results for res in enrichment_results]
-        
+
     if lff_col_name is None:
         lff_col_name = 'log2_freq_factor'
     if pval_col_name is None:
         pval_col_name = 'fisher_adj_pval'
-    
+
     index_test = [x.index.to_list() == enrichment_results_tables[0].index.to_list() for x in enrichment_results_tables]
     if not np.all(index_test):
         raise ValueError('All enrichment results must have the same kinases enriched.')
     kinases = enrichment_results_tables[0].index.to_list()
-    
+
     lff_data = pd.DataFrame(index=kinases, columns=conds_list)
     pval_data = pd.DataFrame(index=kinases, columns=conds_list)
-    
+
     for res,cond in zip(enrichment_results_tables,conds_list):
         lff_data[cond] = res[lff_col_name]
         pval_data[cond] = res[pval_col_name]
-    
+
     return(lff_data,pval_data)
-    
-    
+
+
 def combine_diff_phos_enrichment_results(enrichment_results_dict, enrichment_type='combined', data_type='kl_object',
                                         lff_col_name=None, pval_col_name=None, cont_kins_col_name=None):
     """
@@ -234,10 +234,10 @@ def combine_diff_phos_enrichment_results(enrichment_results_dict, enrichment_typ
     exceptions.check_dp_enrichment_type(enrichment_type)
     if data_type not in ['kl_object','data_frame']:
         raise ValueError('data_type must be either \'kl_object\' or \'data_frame\'.')
-    
+
     enrichment_results = list(enrichment_results_dict.values())
     conds_list = list(enrichment_results_dict.keys())
-    
+
     if data_type == 'data_frame':
         enrichment_results_tables = enrichment_results
     else:
@@ -249,22 +249,22 @@ def combine_diff_phos_enrichment_results(enrichment_results_dict, enrichment_typ
             enrichment_results_tables = [getattr(res,enrichment_type+'_enrichment_results').enrichment_results for res in enrichment_results]
         else:
             enrichment_results_tables = [res.combined_enrichment_results for res in enrichment_results]
-    
+
     if lff_col_name is None:
         lff_col_name = 'most_sig_'*(enrichment_type=='combined') + 'log2_freq_factor'
     if pval_col_name is None:
         pval_col_name = 'most_sig_'*(enrichment_type=='combined') + 'fisher_adj_pval'
-    
+
     index_test = [x.index.to_list() == enrichment_results_tables[0].index.to_list() for x in enrichment_results_tables]
     if not np.all(index_test):
         raise ValueError('All enrichment results must have the same kinases enriched.')
     kinases = enrichment_results_tables[0].index.to_list()
-    
+
     lff_data = pd.DataFrame(index=kinases, columns=conds_list)
     pval_data = pd.DataFrame(index=kinases, columns=conds_list)
     if enrichment_type == 'combined':
         cont_kins_data = pd.DataFrame(False, index=kinases, columns=conds_list)
-    
+
     for res,cond in zip(enrichment_results_tables,conds_list):
         lff_data[cond] = res[lff_col_name]
         pval_data[cond] = res[pval_col_name]
@@ -278,19 +278,19 @@ def combine_diff_phos_enrichment_results(enrichment_results_dict, enrichment_typ
 
 
 def create_kin_sub_sets(data_values, threshold, comp_direction):
-    
+
     if comp_direction not in ['higher','lower']:
         raise ValueError('\'comp_direction\' must be either \'higher\' or \'lower\'.')
-    
+
     if comp_direction == 'higher':
         pred_kins = (data_values>=threshold)
     elif comp_direction == 'lower':
         pred_kins = (data_values<=threshold)
-    
+
     kins_subs_dict = {}
     for kin in tqdm(pred_kins.columns):
         kins_subs_dict[kin] = pred_kins.loc[pred_kins[kin],kin].index.to_list()
-    
+
     return(kins_subs_dict)
 
 
@@ -326,36 +326,36 @@ def combine_mea_enrichment_results(enrichment_results_dict, data_type='kl_object
     pval_data : pd.DataFrame
         Dataframe with adjusted p-value enrichment data of all conditions.
     """
-    
+
     if data_type not in ['kl_object','data_frame']:
         raise ValueError('data_type must be either \'kl_object\' or \'data_frame\'.')
-    
+
     enrichment_results = list(enrichment_results_dict.values())
     conds_list = list(enrichment_results_dict.keys())
-    
+
     if data_type == 'data_frame':
         enrichment_results_tables = enrichment_results
     else:
         enrichment_results_tables = [res.enrichment_results for res in enrichment_results]
-    
+
     index_test = [x.index.to_list() == enrichment_results_tables[0].index.to_list() for x in enrichment_results_tables]
     if not np.all(index_test):
         raise ValueError('All enrichment results must have the same kinases enriched.')
     kinases = enrichment_results_tables[0].index.to_list()
-    
+
     if pval_col_name is None:
         if adj_pval:
             pval_col_name = 'FDR'
         else:
             pval_col_name = 'pvalue'
-    
+
     lff_data = pd.DataFrame(index=kinases, columns=conds_list)
     pval_data = pd.DataFrame(index=kinases, columns=conds_list)
-    
+
     for res,cond in zip(enrichment_results_tables,conds_list):
         lff_data[cond] = res[lff_col_name]
         pval_data[cond] = res[pval_col_name]
-    
+
     return(lff_data,pval_data)
 
 
@@ -443,9 +443,9 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1,
         enrichment_data = enrichment_data.loc[kinases]
         if highlight_kins is not None:
             highlight_kins = [x for x in highlight_kins if x in kinases]
-    
+
     enrichment_data[pval_col] = enrichment_data[pval_col].astype(float)
-    
+
     if ignore_depleted:
         if 'most_sig_direction' in enrichment_data.columns:
             signed_log2_ff = (enrichment_data['most_sig_direction'].replace({'-': -1, '+': 1}))*enrichment_data['most_sig_log2_freq_factor']
@@ -457,11 +457,11 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1,
             depleted_kins = (enrichment_data[lff_col] < 0)
             enrichment_data.loc[depleted_kins, lff_col] = 0
             enrichment_data.loc[depleted_kins, pval_col] = 1
-    
+
     sig_enriched_data = enrichment_data[(enrichment_data[lff_col] >= sig_lff) & (enrichment_data[pval_col] <= sig_pval)]
     sig_depleted_data = enrichment_data[(enrichment_data[lff_col] <= -sig_lff) & (enrichment_data[pval_col] <= sig_pval)]
     non_sig_data = enrichment_data[~((np.abs(enrichment_data[lff_col]) >= sig_lff) & (enrichment_data[pval_col] <= sig_pval))]
-    
+
     if ax is None:
         existing_ax = False
         fig,ax = plt.subplots()
@@ -474,7 +474,7 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1,
     ax.plot(non_sig_data[lff_col],-np.log10(non_sig_data[pval_col]), markerfacecolor='black', linestyle='None', **plot_kwargs)
     ax.plot(sig_enriched_data[lff_col],-np.log10(sig_enriched_data[pval_col]), markerfacecolor='red', linestyle='None', **plot_kwargs)
     ax.plot(sig_depleted_data[lff_col],-np.log10(sig_depleted_data[pval_col]), markerfacecolor='blue', linestyle='None', **plot_kwargs)
-    
+
     if highlight_kins is not None:
         if isinstance(highlight_kins, str):
             highlight_kins = [highlight_kins]
@@ -483,15 +483,15 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1,
             raise ValueError(f'Some kinases to highlight are not in the enrichment results ({missing_kinases}).')
         highlight_data = enrichment_data.loc[highlight_kins]
         ax.plot(highlight_data[lff_col],-np.log10(highlight_data[pval_col]), markerfacecolor='yellow', linestyle='None', **plot_kwargs)
-    
+
     if symmetric_xaxis:
         ax.set_xlim(-abs(max(ax.get_xlim(), key=abs)), abs(max(ax.get_xlim(), key=abs)))
-    
+
     ax.axhline(-np.log10(sig_pval), ls='--', lw=1, color='k')
     if sig_lff>0:
         ax.axvline(sig_lff, ls='--', lw=1, color='k')
         ax.axvline(-sig_lff, ls='--', lw=1, color='k')
-    
+
     ax.set_title(title)
     ax.set_xlabel(xlabel)
     ax.set_ylabel(ylabel)
@@ -502,12 +502,12 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1,
     ax.set_xticks(range(-xvalues,xvalues+1))
     if grid:
         ax.grid('major')
-        
+
     if max_window:
         figManager = fig.canvas.manager
         figManager.window.showMaximized()
         fig.tight_layout()
-    
+
     if label_kins is None:
         label_kins = list(sig_enriched_data.index) + list(sig_depleted_data.index)
         if highlight_kins:
@@ -526,13 +526,13 @@ def plot_volcano(enrichment_data, sig_lff=0, sig_pval=0.1,
                 ax.annotate(kin_data.name, (kin_data[lff_col],-np.log10(kin_data[pval_col])), xytext=(2,2), textcoords='offset points', fontsize=labels_fontsize)
         if adjust_labels:
             adjust_text(labels, ax=ax, arrowprops=dict(arrowstyle='-', color='k', lw=0.5))
-    
+
     if save_fig:
         fig.savefig(save_fig, dpi=1000)
-        
+
     if not plot and not existing_ax:
         plt.close(fig)
-            
+
     if return_fig:
         return fig
 
@@ -611,20 +611,20 @@ def plot_3x3_volcanos(dp_data, kin_type, kl_method, kl_thresh, dp_lfc_col, dp_lf
         Optional keyword arguments to be passed to the kinase_enrichment function.
     plotting_kwargs : dict, optional
         Optional keyword arguments to be passed to the plot_volcano function.
-        
+
     Returns
     -------
     If return_fig, the 3x3 figure containing downregulated, upregulated, and combined kinase enrichment volcano plots.
     """
-    
+
     if len(dp_lfc_thresh) != 3:
         raise ValueError('\'dp_lfc_thresh\' must contain exactly three values.')
     if dp_pval_col is not None and len(dp_pval_thresh) != 3:
         raise ValueError('\'dp_pval_thresh\' must contain exactly three values.')
-    
+
     if seq_col is None:
         seq_col = _global_vars.default_seq_col
-    
+
     exceptions.check_kl_method(kl_method)
     print('Calculating scores for all sites')
     dp_data_pps = pps.PhosphoProteomics(data=dp_data, seq_col=seq_col, **diff_phos_kwargs)
@@ -632,17 +632,17 @@ def plot_3x3_volcanos(dp_data, kin_type, kl_method, kl_thresh, dp_lfc_col, dp_lf
         scores = dp_data_pps.score(kin_type=kin_type, kinases=kinases, values_only=True, **scoring_kwargs)
     elif kl_method in ['percentile','percentile_rank']:
         percentiles = dp_data_pps.percentile(kin_type=kin_type, kinases=kinases, values_only=True, **scoring_kwargs)
-        
+
     fig = plt.figure(constrained_layout=True)
     figManager = fig.canvas.manager
     figManager.window.showMaximized()
     subfigs = fig.subfigures(nrows=3, ncols=1)
-        
+
     for i,(lfc,pval) in enumerate(zip(dp_lfc_thresh,dp_pval_thresh)):
-        
+
         subfigs[i].suptitle(r'$\bf{' + f'DE logFC threshold: {lfc}' + f' / DE p-value threshold: {pval}'*(dp_pval_col is not None) + '}$')
         ax = subfigs[i].subplots(nrows=1, ncols=3)
-        
+
         print(f'\nLogFC threshold: {lfc}' + f' / p-value threshold: {pval}'*(dp_pval_col is not None))
         diff_phos_data = DiffPhosData(dp_data=dp_data, kin_type=kin_type,
                                     lfc_col=dp_lfc_col, lfc_thresh=lfc,
@@ -653,27 +653,27 @@ def plot_3x3_volcanos(dp_data, kin_type, kl_method, kl_thresh, dp_lfc_col, dp_lf
             diff_phos_data.submit_scores(kin_type=kin_type, scores=scores, suppress_messages=suppress_warnings)
         elif kl_method in ['percentile','percentile_rank']:
             diff_phos_data.submit_percentiles(kin_type=kin_type, percentiles=percentiles, suppress_messages=suppress_warnings)
-        
+
         enrich_results = diff_phos_data.kinase_enrichment(kl_method=kl_method, kl_thresh=kl_thresh,
                                                        **enrichment_kwargs)
-            
+
         enrich_results.plot_down_up_comb_volcanos(sig_lff=ke_sig_lff, sig_pval=ke_sig_pval, kinases=kinases,
                                                   plot_cont_kins=plot_cont_kins, highlight_kins=highlight_kins, ignore_depleted=ignore_depleted,
                                                   label_kins=label_kins, adjust_labels=adjust_labels, labels_fontsize=labels_fontsize, ax=ax,
                                                   **plotting_kwargs)
-    
+
     fig.suptitle(title)
-    
+
     if save_fig:
         fig.savefig(save_fig, dpi=1000)
-        
+
     if not plot:
         plt.close(fig)
-            
+
     if return_fig:
         return fig
-    
-    
+
+
 def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1, kinases=None,
                    plot_cont_kins=True, highlight_cont_kins=True, sort_kins_by='family',
                    cond_order=None, only_sig_kins=False, only_sig_conds = False,
@@ -692,7 +692,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
     Parameters
     ----------
     lff_data : pd.DataFrame
-        Matrix containing Kinase Library enrichment log frequency factor data with kinases as index and conditions as columns. 
+        Matrix containing Kinase Library enrichment log frequency factor data with kinases as index and conditions as columns.
     pval_data : pd.DataFrame
         Matrix containing Kinase Library enrichment p_value data. Index (kinases) and columns (conditions) must be identical to lff_data.
     cont_kins : pd.DataFrame, optional
@@ -708,7 +708,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
     highlight_cont_kins : boolean, optional
         Highlight contradicting kinases. The default is True.
     sort_kins_by : str, optional
-        String specifying what to sort the kinases on the figure's x-axes by. If provided, must be 'family' (default) or 'name'. 
+        String specifying what to sort the kinases on the figure's x-axes by. If provided, must be 'family' (default) or 'name'.
     cond_order : list of str, optional
         Order in which the conditions will appear on the bubblemap. This list may be a subset of conditions but may not contain extra strings. The default is None, preserving the condition order from the lff_data matrix.
     only_sig_kins : bool, optional
@@ -781,19 +781,19 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
     -------
     None.
     """
-    
+
     if not ((lff_data.index.equals(pval_data.index)) and (lff_data.columns.equals(pval_data.columns))):
         raise ValueError('lff_data and pval_data must have the same columns and indices.')
-    
+
     if (pval_data == 0).any().any():
         print('Warning: zeros were detected in pval_data. All zeros were replaced with the non-zero minimal value in pval_data.')
         pval_data = pval_data.replace(0, pval_data[pval_data>0].min().min())
-    
+
     if cont_kins is None:
         cont_kins = pd.DataFrame(False, index=lff_data.index, columns=lff_data.columns)
     elif not ((lff_data.index.equals(cont_kins.index)) and (lff_data.columns.equals(cont_kins.columns))):
         raise ValueError('cont_kins must have the same columns and indices as lff_data and pval_data.')
-    
+
     if kinases is None:
         kinases = lff_data.index.to_list()
     else:
@@ -801,7 +801,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
             kinases = list(dict.fromkeys(kinases))
     lff_data = lff_data.loc[kinases]
     pval_data = pval_data.loc[kinases]
-    
+
     if not sort_kins_by:
         kins_order = kinases
     elif sort_kins_by == 'family':
@@ -811,14 +811,14 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
         kins_order = natsorted(kinases)
     else:
         raise ValueError('sort_kins_by must be either \'family\', \'name\', or False.')
-    
+
     if cond_order is None:
         cond_order = lff_data.columns
-    
+
     if color_kins_by:
         exceptions.check_color_kins_method(color_kins_by)
         label_info_col = color_kins_by.upper()
-        
+
         kinome_data = data.get_kinase_info(kinases)
         kin_categories_list = natsorted(kinome_data[label_info_col].unique())
         if kin_categories_colors is None:
@@ -829,7 +829,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
         kin_colors = {}
         for kin,label_type in zip(kinome_data.index,kinome_data[label_info_col]):
             kin_colors[kin] = kin_categories_colors[label_type]
-    
+
     sorted_lff_data = lff_data.loc[kins_order,cond_order]
     sorted_pval_data = pval_data.loc[kins_order,cond_order]
     if highlight_cont_kins:
@@ -843,7 +843,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
     if not plot_cont_kins:
         sig_lff_data = sig_lff_data.mask(cont_kins)
         sig_pval_data = sig_pval_data.mask(cont_kins)
-    
+
     if only_sig_kins:
         sig_kins = list(sig_data.loc[sig_data.any(axis=1)].index)
         sig_lff_data = sig_lff_data.loc[sig_kins]
@@ -856,7 +856,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
         sorted_highlight_data = sorted_highlight_data[sig_conds]
 
     minus_log10_pval_data = -np.log10(sig_pval_data.fillna(1))
-    
+
     if kin_clust:
         if cluster_by is None:
             raise ValueError('If kin_clust is True, cluster_by must be specified.')
@@ -875,7 +875,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
             sig_lff_data = sig_lff_data.iloc[hierarchy.leaves_list(Z)]
             minus_log10_pval_data = minus_log10_pval_data.iloc[hierarchy.leaves_list(Z)]
             sorted_highlight_data = sorted_highlight_data.iloc[hierarchy.leaves_list(Z)]
-    
+
     if condition_clust:
         if cluster_by is None:
             raise ValueError('If condition_clust is True, cluster_by must be specified.')
@@ -894,7 +894,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
             sig_lff_data = sig_lff_data.iloc[:,hierarchy.leaves_list(Z)]
             minus_log10_pval_data = minus_log10_pval_data.iloc[:,hierarchy.leaves_list(Z)]
             sorted_highlight_data = sorted_highlight_data.iloc[:,hierarchy.leaves_list(Z)]
-    
+
     intensity_matrices = np.array_split(sig_lff_data, num_panels)
     size_matrices = np.array_split(minus_log10_pval_data, num_panels)
     highlight_matrices = np.array_split(sorted_highlight_data, num_panels)
@@ -902,13 +902,13 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
     if lff_clim is None:
         lff_clim = (min(sig_lff_data.min().min(),0),max(sig_lff_data.max().max(),0))
     cnorm = mcol.TwoSlopeNorm(vmin=lff_clim[0], vcenter=0, vmax=lff_clim[1])
-    
+
     if max_pval_size is None:
         max_pval_size = np.ceil(minus_log10_pval_data.max().max())
     if max_pval_size < -np.log10(sig_pval):
         raise ValueError('max_pval_size must be equal or greater than -log10(sig_pval).')
     pval_slim = (-np.log10(sig_pval), max_pval_size)
-    
+
     if vertical:
         intensity_matrices = [mat.transpose() for mat in intensity_matrices]
         size_matrices = [mat.transpose() for mat in size_matrices]
@@ -916,19 +916,19 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
 
     fig = plt.figure(constrained_layout=constrained_layout)
     subfigs = fig.subfigures(nrows=1, ncols=2, width_ratios=[10, 1])
-    
+
     if not vertical:
         axes = subfigs[0].subplots(nrows=num_panels, ncols=1, squeeze=False).ravel()
     else:
         axes = subfigs[0].subplots(nrows=1, ncols=num_panels, squeeze=False).ravel()
-    
+
     for idx in range(num_panels):
         intensity_matrix = intensity_matrices[idx].loc[:,::-1]
         size_matrix = size_matrices[idx].loc[:,::-1]
         highlight_matrix = highlight_matrices[idx].loc[:,::-1]
 
         ax = axes[idx]
-        
+
         melt_lff = pd.melt(intensity_matrix, ignore_index=False, var_name='condition', value_name='lff').set_index('condition', append=True)[::-1]
         melt_lff.index.names = ['kinase','condition']
         melt_pval = pd.melt(size_matrix, ignore_index=False, var_name='condition', value_name='pval').set_index('condition', append=True)[::-1]
@@ -957,13 +957,13 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
         else:
             eps = ((maxy - miny) / (len(ax.get_yticks()) - 1)) / 2
         ax.set_ylim(maxy+eps, miny-eps)
-        
+
         ax.grid(which='major', color='k', linestyle=':')
         ax.set_axisbelow(True)
         ax.set_aspect('equal', 'box')
         ax.tick_params(axis='x', which='major', labelsize=xlabels_size, labelrotation=90)
         ax.tick_params(axis='y', which='major', labelsize=ylabels_size)
-        
+
     fig.canvas.draw()
     axes = subfigs[0].axes
     for ax in axes:
@@ -1001,14 +1001,14 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
                 for l in ax.get_yticklabels():
                     label_color = cond_colors[l.get_text()]
                     l.set_color(label_color)
-    
+
     axes = subfigs[1].subplots(nrows=3, ncols=1, squeeze=False).ravel()
-    
+
     if family_legned and color_kins_by:
         patches = [mpatches.Patch(color=x[1], label=x[0]) for x in kin_categories_colors.items()]
         axes[0].legend(handles=patches, loc='center', title='Family', facecolor='white')
     axes[0].axis('off')
-    
+
     if pval_legend:
         size_legend_data = pd.DataFrame({'x': 0, 'y': 0, 'sizes': np.linspace(pval_slim[0], pval_slim[1], 4, dtype=int)})
         sns.scatterplot(x='x', y='y', data=size_legend_data, size='sizes', sizes=bubblesize_range, size_norm=pval_slim,
@@ -1022,7 +1022,7 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
         axes[1].legend(handles=handles, labels=[str(10**-x) for x in size_legend_data['sizes'][:3]] + ['<'+str(10**int(-(size_legend_data['sizes'][3])))],
                        title='Adj. p-value', loc='center', labelspacing=pval_legend_spacing, facecolor='white')
     axes[1].axis('off')
-    
+
     if lff_cbar:
         axes[2].set(aspect=10)
         sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=cnorm)
@@ -1031,11 +1031,11 @@ def plot_bubblemap(lff_data, pval_data, cont_kins=None, sig_lff=0, sig_pval=0.1,
         cbar.ax.set_title('log2(FF)')
     else:
         axes[2].axis('off')
-    
+
     subfigs[0].suptitle(title, fontsize=16)
     if max_window:
         figManager = plt.get_current_fig_manager()
         figManager.window.showMaximized()
-    
+
     if save_fig:
-        fig.savefig(save_fig, dpi=1000)
+        fig.savefig(save_fig, dpi=1000)
\ No newline at end of file
diff --git a/src/kinase_library/utils/utils.py b/src/kinase_library/utils/utils.py
index 9a2f499..edf0d44 100644
--- a/src/kinase_library/utils/utils.py
+++ b/src/kinase_library/utils/utils.py
@@ -494,9 +494,9 @@ def unprime_substrate(sub):
 
 #%%
 """
-###################
-# Other utilities #
-###################
+###########
+# Protein #
+###########
 """
 
 def parse_phosphosites(sequence, phosphoacceptor=['S', 'T', 'Y'], pp=False):

From e72e12c2decec3d8a53c651ca6f3fbee38e7fd2b Mon Sep 17 00:00:00 2001
From: Tomer Yaron-Barir <tmy2110@cumc.columbia.edu>
Date: Fri, 27 Jun 2025 11:04:12 -0400
Subject: [PATCH 3/8] Function to score all the possible phosphosites on a
 protein

---
 src/kinase_library/__init__.py        |  1 +
 src/kinase_library/modules/scoring.py | 86 +++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 src/kinase_library/modules/scoring.py

diff --git a/src/kinase_library/__init__.py b/src/kinase_library/__init__.py
index 1c19b1d..0c4eea8 100644
--- a/src/kinase_library/__init__.py
+++ b/src/kinase_library/__init__.py
@@ -15,6 +15,7 @@
 from .utils.utils import *
 
 from .modules.data import *
+from .modules.scoring import *
 from .modules.enrichment import *
 
 from .logger import TqdmToLoggerWithStatus, logger
diff --git a/src/kinase_library/modules/scoring.py b/src/kinase_library/modules/scoring.py
new file mode 100644
index 0000000..9448a4a
--- /dev/null
+++ b/src/kinase_library/modules/scoring.py
@@ -0,0 +1,86 @@
+"""
+################################
+# The Kinase Library - Scoring #
+################################
+"""
+
+import os
+import numpy as np
+import pandas as pd
+
+from ..objects import phosphoproteomics
+from ..utils import _global_vars, exceptions, utils
+
+#%%
+"""
+#################
+# Score Protein #
+#################
+"""
+
+def score_protein(seq, kinases=None, phosphoacceptor=['S', 'T', 'Y'],
+                   pp=False, st_fav=True, non_canonical=False,
+                   score_promiscuity_threshold=1,
+                   percentile_promiscuity_threshold=90,
+                   score_round_digits=3, percentile_round_digits=2):
+    """
+    Score all potential phosphosites in a given protein sequence.
+
+
+    seq : str
+        Protein sequence to analyze for phosphorylation sites.
+    kinases : list, optional
+        List of specific kinases to consider in the analysis.
+        If None, all available kinases are used. The default is None.
+    phosphoacceptor : list, optional
+        Amino acid residues that can be phosphorylated. The default is ['S', 'T', 'Y'].
+    pp : bool, optional
+        Phospho-residues (s/t/y). The default is False.
+    st_fav : bool, optional
+        S/T favorability. The default is True.
+    non_canonical : bool, optional
+        Return also non-canonical kinases. For tyrosine kinases only. The default is False.
+    score_promiscuity_threshold : float, optional
+        Score threshold above which kinases are considered predicted.
+    percentile_promiscuity_threshold : float, optional
+        Percentile threshold above which kinases are considered predicted.
+    score_round_digits : int, optional
+        Number of decimal digits for score. The default is 4.
+    percentile_round_digits : int, optional
+        Number of decimal digits for percentile. The default is 2.
+
+    Raises
+    ------
+    ValueError
+        If phosphoacceptor residues are not a subset of valid phosphorylatable residues.
+
+    Returns
+    -------
+    results : dict
+        Dictionary containing prediction results with two keys:
+        - 'ser_thr': Predictions for serine/threonine kinases
+        - 'tyrosine': Predictions for tyrosine kinases
+        Each prediction includes kinase scores, percentiles, ranks, and promiscuity indices.
+
+    """
+
+    if not set(phosphoacceptor).issubset(_global_vars.valid_phos_res):
+        raise ValueError(f'Phosphoacceptor(s) must be a subset of {_global_vars.valid_phos_res}.')
+
+    phos_sites = utils.parse_phosphosites(seq, phosphoacceptor=phosphoacceptor, pp=pp)
+
+    pps = phosphoproteomics.PhosphoProteomics(data=phos_sites, seq_col='Sequence', pp=pp, new_seq_phos_res_cols=False, suppress_warnings=False)
+
+    results = {'ser_thr': pps.predict(kin_type='ser_thr', kinases=kinases, st_fav=st_fav,
+                                      score_promiscuity_threshold=score_promiscuity_threshold,
+                                      percentile_promiscuity_threshold=percentile_promiscuity_threshold,
+                                      score_round_digits=percentile_promiscuity_threshold,
+                                      percentile_round_digits=percentile_round_digits),
+               'tyrosine': pps.predict(kin_type='tyrosine', kinases=kinases, non_canonical=non_canonical,
+                                       score_promiscuity_threshold=score_promiscuity_threshold,
+                                       percentile_promiscuity_threshold=percentile_promiscuity_threshold,
+                                       score_round_digits=percentile_promiscuity_threshold,
+                                       percentile_round_digits=percentile_round_digits)
+               }
+
+    return(results)
\ No newline at end of file

From 79b3a519c020d0272b20aab143a0764ebecff129 Mon Sep 17 00:00:00 2001
From: Tomer Yaron-Barir <tmy2110@cumc.columbia.edu>
Date: Fri, 27 Jun 2025 11:04:33 -0400
Subject: [PATCH 4/8] Bumping version

---
 src/kinase_library/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/kinase_library/__init__.py b/src/kinase_library/__init__.py
index 0c4eea8..569361b 100644
--- a/src/kinase_library/__init__.py
+++ b/src/kinase_library/__init__.py
@@ -24,7 +24,7 @@
 
 #%%
 
-__version__ = "1.4.1"
+__version__ = "1.5.0"
 
 #%% Loading scored phosphoproteome one time per session
 

From b9e0391fb0636180bd023fc709897f098c862599 Mon Sep 17 00:00:00 2001
From: Tomer Yaron-Barir <tmy2110@cumc.columbia.edu>
Date: Fri, 27 Jun 2025 11:08:46 -0400
Subject: [PATCH 5/8] Fixing parameters assignment

---
 src/kinase_library/modules/scoring.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/kinase_library/modules/scoring.py b/src/kinase_library/modules/scoring.py
index 9448a4a..88aa6b6 100644
--- a/src/kinase_library/modules/scoring.py
+++ b/src/kinase_library/modules/scoring.py
@@ -74,12 +74,12 @@ def score_protein(seq, kinases=None, phosphoacceptor=['S', 'T', 'Y'],
     results = {'ser_thr': pps.predict(kin_type='ser_thr', kinases=kinases, st_fav=st_fav,
                                       score_promiscuity_threshold=score_promiscuity_threshold,
                                       percentile_promiscuity_threshold=percentile_promiscuity_threshold,
-                                      score_round_digits=percentile_promiscuity_threshold,
+                                      score_round_digits=score_round_digits,
                                       percentile_round_digits=percentile_round_digits),
                'tyrosine': pps.predict(kin_type='tyrosine', kinases=kinases, non_canonical=non_canonical,
                                        score_promiscuity_threshold=score_promiscuity_threshold,
                                        percentile_promiscuity_threshold=percentile_promiscuity_threshold,
-                                       score_round_digits=percentile_promiscuity_threshold,
+                                       score_round_digits=score_round_digits,
                                        percentile_round_digits=percentile_round_digits)
                }
 

From d929364378220f32c575f6d85838d937a85c1263 Mon Sep 17 00:00:00 2001
From: Tomer Yaron-Barir <tmy2110@cumc.columbia.edu>
Date: Fri, 27 Jun 2025 11:11:42 -0400
Subject: [PATCH 6/8] Removing unused imports

---
 src/kinase_library/modules/scoring.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/kinase_library/modules/scoring.py b/src/kinase_library/modules/scoring.py
index 88aa6b6..cb532ca 100644
--- a/src/kinase_library/modules/scoring.py
+++ b/src/kinase_library/modules/scoring.py
@@ -4,12 +4,8 @@
 ################################
 """
 
-import os
-import numpy as np
-import pandas as pd
-
 from ..objects import phosphoproteomics
-from ..utils import _global_vars, exceptions, utils
+from ..utils import _global_vars, utils
 
 #%%
 """

From 165a698436f64deb70d4b59b1a297e341947f0b4 Mon Sep 17 00:00:00 2001
From: Tomer Yaron-Barir <tmy2110@cumc.columbia.edu>
Date: Fri, 27 Jun 2025 11:24:52 -0400
Subject: [PATCH 7/8] Splitting kinases by kinase type when kinase list is
 provided

---
 src/kinase_library/modules/scoring.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/kinase_library/modules/scoring.py b/src/kinase_library/modules/scoring.py
index cb532ca..b8f682f 100644
--- a/src/kinase_library/modules/scoring.py
+++ b/src/kinase_library/modules/scoring.py
@@ -5,7 +5,8 @@
 """
 
 from ..objects import phosphoproteomics
-from ..utils import _global_vars, utils
+from ..utils import _global_vars, exceptions, utils
+from ..modules import data
 
 #%%
 """
@@ -62,17 +63,25 @@ def score_protein(seq, kinases=None, phosphoacceptor=['S', 'T', 'Y'],
 
     if not set(phosphoacceptor).issubset(_global_vars.valid_phos_res):
         raise ValueError(f'Phosphoacceptor(s) must be a subset of {_global_vars.valid_phos_res}.')
+    if kinases is not None:
+        kinases = [kin.upper() for kin in kinases]
+        exceptions.check_kin_name(kinases)
+        ser_thr_kins = [kin for kin in kinases if kin in data.get_kinase_list('ser_thr')]
+        tyrosine_kins = [kin for kin in kinases if kin in data.get_kinase_list('tyrosine')]
+    else:
+        ser_thr_kins = None
+        tyrosine_kins = None
 
     phos_sites = utils.parse_phosphosites(seq, phosphoacceptor=phosphoacceptor, pp=pp)
 
     pps = phosphoproteomics.PhosphoProteomics(data=phos_sites, seq_col='Sequence', pp=pp, new_seq_phos_res_cols=False, suppress_warnings=False)
 
-    results = {'ser_thr': pps.predict(kin_type='ser_thr', kinases=kinases, st_fav=st_fav,
+    results = {'ser_thr': pps.predict(kin_type='ser_thr', kinases=ser_thr_kins, st_fav=st_fav,
                                       score_promiscuity_threshold=score_promiscuity_threshold,
                                       percentile_promiscuity_threshold=percentile_promiscuity_threshold,
                                       score_round_digits=score_round_digits,
                                       percentile_round_digits=percentile_round_digits),
-               'tyrosine': pps.predict(kin_type='tyrosine', kinases=kinases, non_canonical=non_canonical,
+               'tyrosine': pps.predict(kin_type='tyrosine', kinases=tyrosine_kins, non_canonical=non_canonical,
                                        score_promiscuity_threshold=score_promiscuity_threshold,
                                        percentile_promiscuity_threshold=percentile_promiscuity_threshold,
                                        score_round_digits=score_round_digits,

From eb2aec2505f1c3f426b7e93ffebca01162813ee0 Mon Sep 17 00:00:00 2001
From: Tomer Yaron-Barir <tmy2110@cumc.columbia.edu>
Date: Fri, 27 Jun 2025 11:27:45 -0400
Subject: [PATCH 8/8] Fixing documentations

---
 src/kinase_library/modules/scoring.py         |   2 +-
 .../objects/phosphoproteomics.py              | 213 +++++++++---------
 src/kinase_library/utils/utils.py             |   5 +-
 3 files changed, 109 insertions(+), 111 deletions(-)

diff --git a/src/kinase_library/modules/scoring.py b/src/kinase_library/modules/scoring.py
index b8f682f..f1526db 100644
--- a/src/kinase_library/modules/scoring.py
+++ b/src/kinase_library/modules/scoring.py
@@ -42,7 +42,7 @@ def score_protein(seq, kinases=None, phosphoacceptor=['S', 'T', 'Y'],
     percentile_promiscuity_threshold : float, optional
         Percentile threshold above which kinases are considered predicted.
     score_round_digits : int, optional
-        Number of decimal digits for score. The default is 4.
+        Number of decimal digits for score. The default is 3.
     percentile_round_digits : int, optional
         Number of decimal digits for percentile. The default is 2.
 
diff --git a/src/kinase_library/objects/phosphoproteomics.py b/src/kinase_library/objects/phosphoproteomics.py
index daede32..8210baf 100644
--- a/src/kinase_library/objects/phosphoproteomics.py
+++ b/src/kinase_library/objects/phosphoproteomics.py
@@ -67,19 +67,19 @@ class PhosphoProteomics(object):
         21390    ESSPILTsFELVKVP
         21391    THRRMVVsMPNLQDI
     """
-        
+
     def __init__(self, data, seq_col=None,
                  pad=False, pp=False,
                  drop_invalid_subs=True,
                  new_seq_phos_res_cols=True,
                  suppress_warnings=False):
-        
+
         if not isinstance(data, pd.DataFrame):
             raise ValueError('\'data\' must be a pd.DataFrame.')
-        
+
         if seq_col is None:
             seq_col = _global_vars.default_seq_col
-        
+
         if drop_invalid_subs:
             processed_data,omited_entries = utils.filter_invalid_subs(data=data, seq_col=seq_col, suppress_warnings=suppress_warnings)
         else:
@@ -88,39 +88,39 @@ def __init__(self, data, seq_col=None,
         self.omited_entries = omited_entries
         if len(omited_entries)>0 and not suppress_warnings:
             print('Use the \'omited_entries\' attribute to view dropped enteries due to invalid sequences.')
-        
+
         subs_list = processed_data[seq_col]
         if pad:
             subs_list = processed_data[seq_col].apply(lambda x: '_'*pad[0] + x + '_'*pad[1])
         subs_list = subs_list.apply(utils.sequence_to_substrate, pp=pp, validate_phos_res=drop_invalid_subs, validate_aa=drop_invalid_subs)
-        
+
         phos_res = subs_list.str.lower().str[7]
-        
+
         if new_seq_phos_res_cols:
             processed_data = processed_data.rename({_global_vars.default_seq_col: 'ORIGINAL_'+_global_vars.default_seq_col, 'phos_res': 'original_phos_res'}, axis=1)
         processed_data['phos_res'] = phos_res
         processed_data[_global_vars.default_seq_col] = subs_list
-        
+
         self.data = processed_data
         self.original_data = data
         self.seq_col = seq_col
         self.substrates = processed_data[_global_vars.default_seq_col]
         self.phos_res = processed_data['phos_res']
         self.pp = pp
-        
+
         self.ser_thr_data = processed_data[processed_data['phos_res'].isin(['S','T','s','t'])]
         self.ser_thr_substrates = self.ser_thr_data[_global_vars.default_seq_col]
         self._ser_thr_phos_res = self.ser_thr_data['phos_res']
         self.tyrosine_data = processed_data[processed_data['phos_res'].isin(['Y','y'])]
         self.tyrosine_substrates = self.tyrosine_data[_global_vars.default_seq_col]
         self._tyrosine_phos_res = self.tyrosine_data['phos_res']
-    
-    
+
+
     @classmethod
     def from_file(cls, data_file, seq_col=None, pad=False, pp=False, drop_invalid_subs=True, new_seq_phos_res_cols=True, suppress_warnings=False, **file_args):
         """
         Create PhosphoProteomics object from file.
-    
+
         Parameters
         ----------
         data_file : str
@@ -140,12 +140,12 @@ def from_file(cls, data_file, seq_col=None, pad=False, pp=False, drop_invalid_su
             Do not print warnings. The default is False.
         **file_args : args
             Key arguments for pd.read_csv().
-        
+
         Returns
         -------
         pps : kl.PhosphoProteomics
             PhosphoProteomics object with the data from the file.
-        
+
         Examples
         -------
         >>> pps = kl.PhosphoProteomics(data_file='./../databases/substrates/Kinase_Substrate_Dataset_count_07_2021.txt', skiprows=3)
@@ -175,9 +175,9 @@ def from_file(cls, data_file, seq_col=None, pad=False, pp=False, drop_invalid_su
             21390    ESSPILTsFELVKVP
             21391    THRRMVVsMPNLQDI
         """
-        
+
         file_type = data_file.split('.')[-1]
-                    
+
         if file_type == 'parquet':
             data = pq.read_table(data_file).to_pandas()
         elif file_type in ['xlsx','xls']:
@@ -186,19 +186,19 @@ def from_file(cls, data_file, seq_col=None, pad=False, pp=False, drop_invalid_su
             data = pd.read_csv(data_file, **file_args)
         else:
             data = pd.read_csv(data_file, sep = '\t', **file_args)
-        
+
         if seq_col is None:
             seq_col = _global_vars.default_seq_col
-            
+
         pps = cls(data, seq_col=seq_col, pad=pad, pp=pp, drop_invalid_subs=drop_invalid_subs, new_seq_phos_res_cols=new_seq_phos_res_cols, suppress_warnings=suppress_warnings)
-        
+
         return(pps)
-    
-        
+
+
     def _calculate_subs_binary_matrix(self, kin_type=['ser_thr','tyrosine'], pp=None, pos=None):
         """
         Making a binary matrix for a substrate.
-    
+
         Parameters
         ----------
         kin_type : str or list, optional
@@ -212,32 +212,32 @@ def _calculate_subs_binary_matrix(self, kin_type=['ser_thr','tyrosine'], pp=None
         -------
         Setting self.*kin_type*_bin_matrix attribute for binary matrix.
         """
-        
+
         if isinstance(kin_type, str):
             kin_type = [kin_type]
-            
+
         if pp is None:
             pp = self.pp
-        
+
         for kt in kin_type:
             exceptions.check_kin_type(kt)
-            
+
             aa_labels = data.get_aa()
             if pos is None:
                 pos = data.get_positions(kt)
-            
+
             substrates = getattr(self, kt + '_substrates')
-            
+
             subs_mat = utils.sub_binary_matrix(substrates, aa=aa_labels, pos=pos, pp=pp)
             setattr(self, '_' + kt + '_bin_matrix', subs_mat)
-    
-    
+
+
     def score(self, kin_type=None, kinases=None, st_fav=True,
               non_canonical=False, values_only=False, log2_score=True,
               pos=None, round_digits=3, return_values=True):
         """
         Calculate score of the phosphoproteomics data for the given kinases.
-        
+
         Score is being computed in a vectorized way:
             1. Making binary matrix for the substrates.
             2. Converting kinase matrix (norm-scaled) to log2
@@ -273,30 +273,30 @@ def score(self, kin_type=None, kinases=None, st_fav=True,
                 * additional column with the -/+7 amino acids substrate
                 * scores for all specificed kinases
         """
-        
+
         if all(v is None for v in [kin_type, kinases]):
             raise ValueError('Either list of kinases or kinase type must be provided.')
-        
+
         if kinases is None:
             kinases = data.get_kinase_list(kin_type, non_canonical=non_canonical)
         elif isinstance(kinases, str):
             kinases = [kinases]
-        
+
         kinases = [x.upper() for x in kinases]
         exceptions.check_kin_name(kinases)
-        
+
         if kin_type is None:
             kin_type = data.get_kinase_type(kinases[0])
         else:
             exceptions.check_kin_type(kin_type)
         exceptions.check_kin_list_type(kinases, kin_type=kin_type)
-        
+
         print('Scoring '+str(len(getattr(self,kin_type+'_substrates')))+' '+kin_type+' substrates')
         logger.info('Scoring '+str(len(getattr(self,kin_type+'_substrates')))+' '+kin_type+' substrates')
         if not hasattr(self, '_' + kin_type + '_bin_matrix'):
             self._calculate_subs_binary_matrix(kin_type=kin_type, pp=self.pp, pos=pos)
         subs_bin_mat = getattr(self, '_' + kin_type + '_bin_matrix')
-        
+
         # Using table with all the matrices concatenated (log2)
         kin_mat_log2 = data.get_multiple_matrices(kinases, kin_type=kin_type, mat_type='log2', pos=pos)
 
@@ -308,29 +308,29 @@ def score(self, kin_type=None, kinases=None, st_fav=True,
             st_fav_scores_log2 = np.log2(st_fav_scores)
             score_log2 = score_log2 + st_fav_scores_log2
         score = np.power(2,score_log2)
-        
+
         if log2_score:
             score_output = score_log2
         else:
             score_output = score
-        
+
         score_output = score_output.round(round_digits)
         score_rank_output = score_output.rank(method='min', ascending=False, axis=1).astype(int)
 
         data_index = getattr(self, kin_type + '_data').index
         data_score_output = pd.concat([getattr(self, kin_type + '_data').reset_index(drop=True),score_output.reset_index(drop=True)], axis=1)
         data_score_output.index = data_index
-        
+
         setattr(self, kin_type+'_scores', score_output)
         setattr(self, kin_type+'_score_ranks', score_rank_output)
         setattr(self, kin_type+'_scored_kins', kinases)
-        
+
         if return_values:
             if values_only:
                 return(score_output)
             return(data_score_output)
-    
-    
+
+
     def percentile(self, kin_type=None, kinases=None,
                    st_fav=True, non_canonical=False,
                    subs_scores=None, subs_scores_format=None,
@@ -339,7 +339,7 @@ def percentile(self, kin_type=None, kinases=None,
                    round_digits=2, return_values=True):
         """
         Calculate the percentile score of the phosphoproteomics data for the given kinases.
-        
+
         After score is being computed, the percentile of that score is being
         computed based on a basal scored phosphoproteome.
 
@@ -381,24 +381,24 @@ def percentile(self, kin_type=None, kinases=None,
                 * additional column with the -/+7 amino acids substrate
                 * percentiles for all specificed kinases
         """
-        
+
         if all(v is None for v in [kin_type, kinases]):
             raise ValueError('Either list of kinases or kinase type must be provided.')
-        
+
         if kinases is None:
             kinases = data.get_kinase_list(kin_type, non_canonical=non_canonical)
         elif isinstance(kinases, str):
             kinases = [kinases]
-        
+
         kinases = [x.upper() for x in kinases]
         exceptions.check_kin_name(kinases)
-        
+
         if kin_type is None:
             kin_type = data.get_kinase_type(kinases[0])
         else:
             exceptions.check_kin_type(kin_type)
         exceptions.check_kin_list_type(kinases, kin_type=kin_type)
-        
+
         percent_output = []
 
         if subs_scores is None:
@@ -413,12 +413,12 @@ def percentile(self, kin_type=None, kinases=None,
                 raise ValueError('Please specify the format of input score data (\'subs_scores_format\').')
             elif subs_scores_format not in ['linear','log2']:
                 raise ValueError('Please provide valid value for \'subs_scores_format\': \'linear\' or \'log2\'.')
-            
+
             if (subs_scores_format == 'linear'):
                 score = np.log2(subs_scores)
             else:
                 score = subs_scores.copy()
-        
+
         if len(score) == 0: # Data is empty - return empty dataframe
             percent_output = score.copy()
             setattr(self, kin_type+'_percentiles', percent_output)
@@ -429,18 +429,18 @@ def percentile(self, kin_type=None, kinases=None,
                 if values_only:
                     return(percent_output)
                 return(data_percent_output)
-        
+
         if customized_scored_phosprot is not None:
             all_scored_phosprot = customized_scored_phosprot
         else:
             all_scored_phosprot = core.ScoredPhosphoProteome(phosprot_name=_global_vars.phosprot_name, phosprot_path=phosprot_path)
-        
+
         if kin_type is None:
             kin_type = data.get_kinase_type(kinases[0])
         else:
             exceptions.check_kin_type(kin_type)
         exceptions.check_kin_list_type(kinases, kin_type=kin_type)
-        
+
         if kin_type == 'ser_thr':
             scored_phosprot = all_scored_phosprot.ser_thr_scores
         elif kin_type == 'tyrosine':
@@ -448,35 +448,35 @@ def percentile(self, kin_type=None, kinases=None,
         else:
             raise ValueError('Wrong kinase type.')
         scored_phosprot = scored_phosprot.loc[:,kinases] # only for requested kinases if subset
-        
+
         # If scored phopshoproteome is linear values - converting it to log2 values
         if not all_scored_phosprot.log2_values:
             scored_phosprot = np.log2(scored_phosprot)
-        
+
         print('Calculating percentile for '+str(len(getattr(self,kin_type+'_substrates')))+' '+kin_type+' substrates')
         logger.info('Calculating percentile for '+str(len(getattr(self,kin_type+'_substrates')))+' '+kin_type+' substrates')
         percent_output = scored_phosprot.progress_apply(lambda x: x.sort_values().searchsorted(score[x.name], side='right'))/len(scored_phosprot)*100
         percent_output.index = score.index
-        
+
         percent_output = percent_output.round(round_digits)
         percent_rank_output = percent_output.rank(method='min', ascending=False, axis=1).astype(int)
-        
+
         data_index = getattr(self, kin_type + '_data').index
         data_percent_output = pd.concat([getattr(self, kin_type + '_data').reset_index(drop=True),percent_output.reset_index(drop=True)], axis=1)
         data_percent_output.index = data_index
-        
+
         setattr(self, kin_type+'_percentiles', percent_output)
         setattr(self, kin_type+'_percentile_ranks', percent_rank_output)
         setattr(self, kin_type+'_percentiled_kins', kinases)
-        
+
         self.phosprot_name = _global_vars.phosprot_name
-        
+
         if return_values:
             if values_only:
                 return(percent_output)
             return(data_percent_output)
-    
-    
+
+
     def rank(self, metric, kin_type=None, kinases=None,
              st_fav=True, non_canonical=False,
              pos=None, rank_kinases=None, values_only=False,
@@ -518,7 +518,7 @@ def rank(self, metric, kin_type=None, kinases=None,
         ranks : pd.DataFrame
             Ranks of the kinases based on the specified scoring metric.
         """
-        
+
         if all(v is None for v in [kin_type, kinases]):
             raise ValueError('Either list of kinases or kinase type must be provided.')
         if kinases is None:
@@ -543,23 +543,23 @@ def rank(self, metric, kin_type=None, kinases=None,
         exceptions.check_kin_list_type(rank_kinases, kin_type=kin_type)
         exceptions.check_kin_list_type(kinases, kin_type=kin_type)
         exceptions.check_scoring_metric(metric)
-        
+
         if metric == 'score':
             self.score(kin_type=kin_type, kinases=rank_kinases, st_fav=st_fav, non_canonical=non_canonical, return_values=False, pos=pos, round_digits=score_round_digits)
         elif metric == 'percentile':
             self.percentile(kin_type=kin_type, kinases=rank_kinases, st_fav=st_fav, non_canonical=non_canonical, return_values=False, pos=pos, round_digits=percentile_round_digits)
-        
+
         rank_output = getattr(self, kin_type+'_'+metric+'_ranks')[kinases]
-        
+
         data_index = getattr(self, kin_type + '_data').index
         data_rank_output = pd.concat([getattr(self, kin_type + '_data').reset_index(drop=True),rank_output.reset_index(drop=True)], axis=1)
         data_rank_output.index = data_index
-        
+
         if values_only:
             return(rank_output)
         return(data_rank_output)
 
-    
+
     def predict(self, metric=['score','percentile'], kin_type=None, kinases=None,
                 st_fav=True, non_canonical=False, values_only=False,
                 score_promiscuity_threshold=1, percentile_promiscuity_threshold=90,
@@ -589,7 +589,7 @@ def predict(self, metric=['score','percentile'], kin_type=None, kinases=None,
         pos : list, optional
             List of positions to use in the matrix rows. The default is None.
         score_round_digits : int, optional
-            Number of decimal digits for score. The default is 4.
+            Number of decimal digits for score. The default is 3.
         percentile_round_digits : int, optional
             Number of decimal digits for percentile. The default is 2.
 
@@ -620,7 +620,7 @@ def predict(self, metric=['score','percentile'], kin_type=None, kinases=None,
 
         if isinstance(metric, str):
             metric = [metric]
-        
+
         prediction_output = pd.DataFrame(index=getattr(self, kin_type+'_substrates'))
 
         if 'score' in metric:
@@ -633,7 +633,7 @@ def predict(self, metric=['score','percentile'], kin_type=None, kinases=None,
             percentiles = getattr(self, kin_type+'_percentiles')[kinases]
             percent_promis = self.promiscuity_index(kin_type=kin_type, kinases=kinases, metric='percentile', threshold=percentile_promiscuity_threshold, pos=pos, st_fav=st_fav, non_canonical=non_canonical, values_only=True)
             prediction_output = pd.concat([prediction_output, percent_promis], axis=1)
-        
+
         for kin in kinases:
             if 'score' in metric:
                 score_df = pd.DataFrame({kin+'_score': scores[kin], kin+'_score_rank': score_ranks[kin]})
@@ -642,16 +642,16 @@ def predict(self, metric=['score','percentile'], kin_type=None, kinases=None,
             if 'percentile' in metric:
                 percentile_df = pd.DataFrame({kin+'_percentile': percentiles[kin], kin+'_percentile_rank': percentile_ranks[kin]})
                 prediction_output = pd.concat([prediction_output, percentile_df], axis=1)
-        
+
         data_index = getattr(self, kin_type + '_data').index
         data_prediction_output = pd.concat([getattr(self, kin_type + '_data').reset_index(drop=True),prediction_output.reset_index(drop=True)], axis=1)
         data_prediction_output.index = data_index
-        
+
         if values_only:
             return(prediction_output)
         return(data_prediction_output)
-    
-    
+
+
     def promiscuity_index(self, kin_type=None, kinases=None,
                           metric='percentile', threshold=90, pos=None,
                           st_fav=True, non_canonical=False,
@@ -689,7 +689,7 @@ def promiscuity_index(self, kin_type=None, kinases=None,
         None.
 
         """
-        
+
         if all(v is None for v in [kin_type, kinases]):
             raise ValueError('Either list of kinases or kinase type must be provided.')
         if kinases is None:
@@ -702,28 +702,28 @@ def promiscuity_index(self, kin_type=None, kinases=None,
         else:
             exceptions.check_kin_type(kin_type)
         exceptions.check_kin_list_type(kinases, kin_type=kin_type)
-        
+
         if not hasattr(self, kin_type+'_'+metric+'s'):
             if metric == 'score':
                 self.score(kin_type=kin_type, kinases=kinases, st_fav=st_fav, non_canonical=non_canonical, pos=pos, return_values=False)
             elif metric == 'percentile':
                 self.percentile(kin_type=kin_type, kinases=kinases, st_fav=st_fav, non_canonical=non_canonical, pos=pos, return_values=False)
-        
+
         metric_data = getattr(self, kin_type+'_'+metric+'s')
         promis_idx = (metric_data >= threshold).sum(axis=1)
         promis_idx.name = metric.capitalize() + ' Promiscuity Index'
-        
+
         setattr(self, kin_type+'_'+metric+'_'+'promiscuity_index', promis_idx)
-        
+
         data_index = getattr(self, kin_type + '_data').index
         data_promis_output = pd.concat([getattr(self, kin_type + '_data').reset_index(drop=True),promis_idx.reset_index(drop=True)], axis=1)
         data_promis_output.index = data_index
-        
+
         if values_only:
             return(promis_idx)
         return(data_promis_output)
-        
-    
+
+
     def submit_scores(self, kin_type, scores, suppress_messages=False):
         """
         Submitting scores for the substrates.
@@ -743,30 +743,30 @@ def submit_scores(self, kin_type, scores, suppress_messages=False):
         -------
         None.
         """
-        
+
         exceptions.check_kin_type(kin_type)
         if ~(scores.columns.isin(data.get_kinase_list(kin_type, non_canonical=True)).all()):
             raise ValueError(f'Score columns must contain only valid {kin_type} kinases. Use kl.get_kinase_list() to get the list of valid kinases.')
-        
+
         data_subs = getattr(self, kin_type + '_substrates')
         scores_unique = scores[~scores.index.duplicated(keep='first')]
-        
+
         if not set(data_subs) <= set(scores_unique.index):
             raise ValueError('Scores must be provided for all substrates in the data.')
-        
+
         if scores_unique.isna().any().any():
             raise ValueError('Some score values are missing.')
-            
+
         subs_scores = scores_unique.loc[data_subs]
-        
+
         score_rank = subs_scores.rank(method='min', ascending=False, axis=1).astype(int)
         setattr(self, kin_type+'_scores', subs_scores)
         setattr(self, kin_type+'_score_ranks', score_rank)
-        
+
         if not suppress_messages:
             print('Scores submitted successfully.')
-        
-        
+
+
     def submit_percentiles(self, kin_type, percentiles, phosprot_name=None, suppress_messages=False):
         """
         Submitting percentiles for the substrates.
@@ -788,7 +788,7 @@ def submit_percentiles(self, kin_type, percentiles, phosprot_name=None, suppress
         -------
         None.
         """
-        
+
         exceptions.check_kin_type(kin_type)
         if ~(percentiles.columns.isin(data.get_kinase_list(kin_type, non_canonical=True)).all()):
             raise ValueError(f'Percentile columns must contain only valid {kin_type} kinases. Use kl.get_kinase_list() to get the list of valid kinases.')
@@ -797,27 +797,27 @@ def submit_percentiles(self, kin_type, percentiles, phosprot_name=None, suppress
 
         data_subs = getattr(self, kin_type + '_substrates')
         percentiles_unique = percentiles[~percentiles.index.duplicated(keep='first')]
-        
+
         if not set(data_subs) <= set(percentiles_unique.index):
             raise ValueError('Percentiles must be provided for all substrates in the data.')
-        
+
         if percentiles_unique.isna().any().any():
             raise ValueError('Some percentile values are missing.')
-        
+
         subs_percentiles = percentiles_unique.loc[data_subs]
-        
+
         percentile_rank = subs_percentiles.rank(method='min', ascending=False, axis=1).astype(int)
         setattr(self, kin_type+'_percentiles', subs_percentiles)
         setattr(self, kin_type+'_percentile_ranks', percentile_rank)
-        
+
         if phosprot_name is None:
             phosprot_name = _global_vars.phosprot_name
         self.phosprot_name = phosprot_name
-        
+
         if not suppress_messages:
             print('Percentiles submitted successfully.')
-    
-    
+
+
     def merge_data_scores(self, kin_type, score_type):
         """
         Merging phosphoproteome data and score data.
@@ -834,13 +834,12 @@ def merge_data_scores(self, kin_type, score_type):
         merged_data : dataframe
             Merged dataframe of the phosphoproteome data and score data.
         """
-        
+
         exceptions.check_kin_type(kin_type)
         exceptions.check_score_type(score_type)
-        
+
         data = getattr(self, kin_type+'_data').set_index(_global_vars.default_seq_col, drop=False)
         scores = getattr(self, kin_type+'_'+score_type)
         merged_data = pd.concat([data,scores], axis=1)
-        
-        return(merged_data)
-        
\ No newline at end of file
+
+        return(merged_data)
\ No newline at end of file
diff --git a/src/kinase_library/utils/utils.py b/src/kinase_library/utils/utils.py
index edf0d44..1e37294 100644
--- a/src/kinase_library/utils/utils.py
+++ b/src/kinase_library/utils/utils.py
@@ -9,7 +9,6 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 from .. import logomaker
-from tqdm import tqdm
 from sklearn.preprocessing import MultiLabelBinarizer
 import warnings
 
@@ -516,8 +515,8 @@ def parse_phosphosites(sequence, phosphoacceptor=['S', 'T', 'Y'], pp=False):
     Returns
     -------
     pd.DataFrame
-        DataFrame with columns: residue, position, window.
-        Window is 15-mer centered on phosphosite, padded with '-'.
+        DataFrame with columns: Residue, Position, Sequence.
+        Sequence is 15-mer centered on phosphosite, padded with '_'.
     """
 
     if not set(phosphoacceptor).issubset(_global_vars.valid_phos_res):