From f5af1ce502c8805933c163cd7e537bf28be292f7 Mon Sep 17 00:00:00 2001 From: Du Phan Date: Wed, 30 Dec 2020 12:22:10 +0100 Subject: [PATCH 1/4] add dku_model_accessor --- dkulib/dku_model_accessor/__init__.py | 3 + dkulib/dku_model_accessor/constants.py | 22 +++ dkulib/dku_model_accessor/model_accessor.py | 136 +++++++++++++++++++ dkulib/dku_model_accessor/model_metadata.py | 60 ++++++++ dkulib/dku_model_accessor/preprocessing.py | 118 ++++++++++++++++ dkulib/dku_model_accessor/surrogate_model.py | 41 ++++++ 6 files changed, 380 insertions(+) create mode 100644 dkulib/dku_model_accessor/__init__.py create mode 100644 dkulib/dku_model_accessor/constants.py create mode 100644 dkulib/dku_model_accessor/model_accessor.py create mode 100644 dkulib/dku_model_accessor/model_metadata.py create mode 100644 dkulib/dku_model_accessor/preprocessing.py create mode 100644 dkulib/dku_model_accessor/surrogate_model.py diff --git a/dkulib/dku_model_accessor/__init__.py b/dkulib/dku_model_accessor/__init__.py new file mode 100644 index 0000000..738d379 --- /dev/null +++ b/dkulib/dku_model_accessor/__init__.py @@ -0,0 +1,3 @@ +from dku_model_accessor.model_accessor import ModelAccessor +from dku_model_accessor.model_metadata import get_model_handler +from dku_model_accessor.constants import DkuModelAccessorConstants diff --git a/dkulib/dku_model_accessor/constants.py b/dkulib/dku_model_accessor/constants.py new file mode 100644 index 0000000..8328ad1 --- /dev/null +++ b/dkulib/dku_model_accessor/constants.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- + +class DkuModelAccessorConstants(object): + MODEL_ID = 'model_id' + VERSION_ID = 'version_id' + REGRRSSION_TYPE = 'REGRESSION' + DKU_MULTICLASS_CLASSIF = 'MULTICLASS' + DKU_BINARY_CLASSIF = 'BINARY_CLASSIFICATION' + CLASSIFICATION_TYPE = 'CLASSIFICATION' + CLUSTERING_TYPE = 'CLUSTERING' + MAX_NUM_ROW = 10000000 + CUMULATIVE_PERCENTAGE_THRESHOLD = 90 + SURROGATE_TARGET = "_dku_predicted_label_" + FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD 
= 95 + CUMULATIVE_IMPORTANCE = 'cumulative_importance' + FEATURE = 'feature' + IMPORTANCE = 'importance' + RANK = 'rank' + CLASS = 'class' + PERCENTAGE = 'percentage' + DKU_XGBOOST_CLASSIF = 'XGBOOST_CLASSIFICATION' + DKU_XGBOOST_REGRESSION = 'XGBOOST_REGRESSION' diff --git a/dkulib/dku_model_accessor/model_accessor.py b/dkulib/dku_model_accessor/model_accessor.py new file mode 100644 index 0000000..f08e1ad --- /dev/null +++ b/dkulib/dku_model_accessor/model_accessor.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- +import logging +import pandas as pd +from dku_model_accessor.constants import DkuModelAccessorConstants +from dku_model_accessor.surrogate_model import SurrogateModel +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, \ + GradientBoostingRegressor, ExtraTreesClassifier, ExtraTreesRegressor +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor + +logger = logging.getLogger(__name__) + +ALGORITHMS_WITH_VARIABLE_IMPORTANCE = [RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, + GradientBoostingRegressor, ExtraTreesClassifier, ExtraTreesRegressor, + DecisionTreeClassifier, DecisionTreeRegressor] + + +class ModelAccessor(object): + """ + Wrapper for our internal object PredictionModelInformationHandler + """ + def __init__(self, model_handler=None): + """ + model_handler: PredictionModelInformationHandler object + """ + self.model_handler = model_handler + + def get_prediction_type(self): + """ + Wrap the prediction type accessor of the model + """ + if self.model_handler.get_prediction_type() in [DkuModelAccessorConstants.DKU_BINARY_CLASSIF, DkuModelAccessorConstants.DKU_MULTICLASS_CLASSIF]: + return DkuModelAccessorConstants.CLASSIFICATION_TYPE + elif DkuModelAccessorConstants.REGRRSSION_TYPE == self.model_handler.get_prediction_type(): + return DkuModelAccessorConstants.REGRRSSION_TYPE + else: + return DkuModelAccessorConstants.CLUSTERING_TYPE + + def 
get_target_variable(self): + """ + Return the name of the target variable + """ + return self.model_handler.get_target_variable() + + def get_original_test_df(self, limit=DkuModelAccessorConstants.MAX_NUM_ROW): + try: + full_test_df = self.model_handler.get_test_df()[0] + test_df = full_test_df[:limit] + logger.info('Loading {}/{} rows of the original test set'.format(len(test_df), len(full_test_df))) + return test_df + except Exception as e: + logger.warning('Can not retrieve original test set: {}. The plugin will take the whole original dataset.'.format(e)) + full_test_df = self.model_handler.get_full_df()[0] + test_df = full_test_df[:limit] + logger.info('Loading {}/{} rows of the whole original test set'.format(len(test_df), len(full_test_df))) + return test_df + + def get_train_df(self, limit=DkuModelAccessorConstants.MAX_NUM_ROW): + full_train_df = self.model_handler.get_train_df()[0] + train_df = full_train_df[:limit] + logger.info('Loading {}/{} rows of the original train set'.format(len(train_df), len(full_train_df))) + return train_df + + def get_per_feature(self): + return self.model_handler.get_per_feature() + + def get_predictor(self): + return self.model_handler.get_predictor() + + def get_feature_importance(self,cumulative_percentage_threshold=DkuModelAccessorConstants.FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD): + """ + :param cumulative_percentage_threshold: only return the top n features whose sum of importance reaches this threshold + :return: + """ + if self._algorithm_is_tree_based(): + predictor = self.get_predictor() + clf = predictor._clf + feature_names = predictor.get_features() + feature_importances = clf.feature_importances_ + + else: # use surrogate model + logger.info('Fitting surrogate model ...') + surrogate_model = SurrogateModel(self.get_prediction_type()) + original_test_df = self.get_original_test_df() + predictions_on_original_test_df = self.get_predictor().predict(original_test_df) + surrogate_df = 
original_test_df[self.get_selected_features()] + surrogate_df[DkuModelAccessorConstants.SURROGATE_TARGET] = predictions_on_original_test_df['prediction'] + surrogate_model.fit(surrogate_df, DkuModelAccessorConstants.SURROGATE_TARGET) + feature_names = surrogate_model.get_features() + feature_importances = surrogate_model.clf.feature_importances_ + + feature_importance = [] + for feature_name, feat_importance in zip(feature_names, feature_importances): + feature_importance.append({ + DkuModelAccessorConstants.FEATURE: feature_name, + DkuModelAccessorConstants.IMPORTANCE: 100 * feat_importance / sum(feature_importances) + }) + + dfx = pd.DataFrame(feature_importance).sort_values(by=DkuModelAccessorConstants.IMPORTANCE, + ascending=False).reset_index(drop=True) + dfx[DkuModelAccessorConstants.CUMULATIVE_IMPORTANCE] = dfx[DkuModelAccessorConstants.IMPORTANCE].cumsum() + dfx_top = dfx.loc[dfx[DkuModelAccessorConstants.CUMULATIVE_IMPORTANCE] <= cumulative_percentage_threshold] + return dfx_top.rename_axis(DkuModelAccessorConstants.RANK).reset_index().set_index( + DkuModelAccessorConstants.FEATURE) + + def get_selected_features(self): + """ + Return only features used in the model + """ + selected_features = [] + for feat, feat_info in self.get_per_feature().items(): + if feat_info.get('role') == 'INPUT': + selected_features.append(feat) + return selected_features + + def get_selected_and_rejected_features(self): + """ + Return all features in the input dataset except the target + """ + selected_features = [] + for feat, feat_info in self.get_per_feature().items(): + if feat_info.get('role') in ['INPUT', 'REJECT']: + selected_features.append(feat) + return selected_features + + def predict(self, df): + return self.get_predictor().predict(df) + + def _algorithm_is_tree_based(self): + predictor = self.get_predictor() + algo = predictor._clf + for algorithm in ALGORITHMS_WITH_VARIABLE_IMPORTANCE: + if isinstance(algo, algorithm): + return True + elif 
predictor.params.modeling_params.get('algorithm') in [DkuModelAccessorConstants.DKU_XGBOOST_CLASSIF, DkuModelAccessorConstants.DKU_XGBOOST_REGRESSION]: + return True + return False diff --git a/dkulib/dku_model_accessor/model_metadata.py b/dkulib/dku_model_accessor/model_metadata.py new file mode 100644 index 0000000..cb56035 --- /dev/null +++ b/dkulib/dku_model_accessor/model_metadata.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +import json +import os +import sys +from dataiku.doctor.posttraining.model_information_handler import PredictionModelInformationHandler + + +def get_model_handler(model, version_id=None): + """ + model: a dku saved model returned by dataiku.Model(model_id) + version_id: if None, the active one is chosen + """ + saved_model_version_id = _get_saved_model_version_id(model, version_id) + return _get_model_info_handler(saved_model_version_id) + + +def _get_saved_model_version_id(model, version_id=None): + model_def = model.get_definition() + if version_id is None: + version_id = model_def.get('activeVersion') + saved_model_version_id = 'S-{0}-{1}-{2}'.format(model_def.get('projectKey'), model_def.get('id'), version_id) + return saved_model_version_id + + +def _get_model_info_handler(saved_model_version_id): + infos = saved_model_version_id.split("-") + if len(infos) != 4 or infos[0] != "S": + raise ValueError("Invalid saved model id") + pkey = infos[1] + model_id = infos[2] + version_id = infos[3] + + datadir_path = os.environ['DIP_HOME'] + version_folder = os.path.join(datadir_path, "saved_models", pkey, model_id, "versions", version_id) + + # Loading and resolving paths in split_desc + split_folder = os.path.join(version_folder, "split") + with open(os.path.join(split_folder, "split.json")) as split_file: + split_desc = json.load(split_file) + + path_field_names = ["trainPath", "testPath", "fullPath"] + for field_name in path_field_names: + if split_desc.get(field_name, None) is not None: + split_desc[field_name] = os.path.join(split_folder, 
split_desc[field_name]) + + with open(os.path.join(version_folder, "core_params.json")) as core_params_file: + core_params = json.load(core_params_file) + + try: + return PredictionModelInformationHandler(split_desc, core_params, version_folder, version_folder) + except Exception as e: + from future.utils import raise_ + if "ordinal not in range(128)" in str(e): + raise_(Exception, "The plugin only supports python3, cannot load a python2 model. Original error: {}".format(e), sys.exc_info()[2]) + elif str(e) == "non-string names in Numpy dtype unpickling": + raise_(Exception, "The plugin is using a python2 code-env, cannot load a python3 model. Original error: {}".format(e), sys.exc_info()[2]) + elif str(e) == "Using saved models in python recipes is limited to models trained using the python engine": + raise_(Exception, "The plugin does not support Clustering model.", sys.exc_info()[2]) + else: + raise_(Exception, "Fail to load saved model: {}".format(e), sys.exc_info()[2]) diff --git a/dkulib/dku_model_accessor/preprocessing.py b/dkulib/dku_model_accessor/preprocessing.py new file mode 100644 index 0000000..eb05970 --- /dev/null +++ b/dkulib/dku_model_accessor/preprocessing.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +import sys +import random +from collections import Counter +from datetime import datetime +import logging +import numpy as np + +logger = logging.getLogger(__name__) +EPOCH = datetime(1900, 1, 1) + + +class Preprocessor(object): + """ + An object that replicates default behavior of doctor + """ + + def __init__(self, df=None, target=None): + self.df = df + self.target = target + self._categorical_features = [] + self._numerical_features = [] + self._text_features = [] + + def check(self): + if self.df is None: + raise ValueError('df is not specified.') + if self.target is None: + raise ValueError('target is not specified.') + + def get_processed_train_test(self): + self._categorical_features = [x for x in self._get_categorical_features() if x != 
self.target] + self._numerical_features = self._get_numerical_features() + self._text_features = self._get_text_features() + self._parse_data() + raw_train, raw_test = self._get_train_test_set() + imputed_train, imputed_test = self._impute(raw_train, raw_test) + dummy_values_dict = self._select_dummy_values(imputed_train, self._categorical_features) + final_train = self._dummy_encode(imputed_train, dummy_values_dict) + final_test = self._dummy_encode(imputed_test, dummy_values_dict) + return final_train, final_test + + def _parse_data(self): + def _datetime_to_epoch(series): + return (series - EPOCH) / np.timedelta64(1, 's') + + for feature in self._categorical_features: + self.df[feature] = self.df[feature].apply(self._coerce_to_unicode) + for feature in self._text_features: + self.df[feature] = self.df[feature].apply(self._coerce_to_unicode) + for feature in self._numerical_features: + if self.df[feature].dtype == np.dtype('M8[ns]'): + self.df[feature] = _datetime_to_epoch(self.df[feature]) + else: + self.df[feature] = self.df[feature].astype('double') + + def _get_numerical_features(self): + return self.df.select_dtypes(include=['number', 'M8[ns]']).columns.tolist() + + def _get_categorical_features(self): + return self.df.select_dtypes(include=['object', 'category']).columns.tolist() + + def _get_text_features(self): + return [] + + def _coerce_to_unicode(self, x): + if sys.version_info < (3, 0): + if isinstance(x, str): + return unicode(x, 'utf-8') + else: + return unicode(x) + else: + return str(x) + + def _select_dummy_values(self, dfx, features, LIMIT_DUMMIES = 100): + dummy_values = {} + for feature in features: + values = [ + value + for (value, _) in Counter(dfx[feature]).most_common(LIMIT_DUMMIES) + ] + dummy_values[feature] = values + return dummy_values + + def _get_train_test_set(self, prop=0.8, seed=1234): + k = int(self.df.shape[0] * prop) + random.seed(seed) + sampler = random.sample(self.df.index.tolist(), k) + train = self.df.loc[sampler] + test 
= self.df[~self.df.index.isin(sampler)] + return train, test + + def _impute(self, df_train, df_test): + for feature in self._numerical_features: + v = df_train[feature].mean() + df_train[feature] = df_train[feature].fillna(v) + df_test[feature] = df_test[feature].fillna(v) + logger.info('Imputed missing values in feature %s with value %s' % (feature, self._coerce_to_unicode(v))) + + for feature in self._categorical_features: + v = 'NULL_CATEGORY' + df_train[feature] = df_train[feature].fillna(v) + df_test[feature] = df_test[feature].fillna(v) + logger.info('Imputed missing values in feature %s with value %s' % (feature, self._coerce_to_unicode(v))) + + return df_train, df_test + + def _dummy_encode(self, dfx, dummy_values_dict): + dfx_copy = dfx.copy() + for (feature, dummy_values) in dummy_values_dict.items(): + for dummy_value in dummy_values: + #TODO add dummy:N/A and dummy:_Others_ + dummy_name = u'dummy:%s:%s' % (feature, self._coerce_to_unicode(dummy_value)) + dfx_copy[dummy_name] = (dfx_copy[feature] == dummy_value).astype(float) + del dfx_copy[feature] + logger.info('Dummy-encoded feature %s' % feature) + + return dfx_copy diff --git a/dkulib/dku_model_accessor/surrogate_model.py b/dkulib/dku_model_accessor/surrogate_model.py new file mode 100644 index 0000000..e51f182 --- /dev/null +++ b/dkulib/dku_model_accessor/surrogate_model.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +import logging +from dku_model_accessor.constants import DkuModelAccessorConstants +from dku_model_accessor.preprocessing import Preprocessor +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +logger = logging.getLogger(__name__) + + +class SurrogateModel(object): + """ + In case the chosen saved model uses a non-tree based algorithm (and thus does not have feature importance), we fit this surrogate model + on top of the prediction of the former one to be able to retrieve the feature importance information. 
+ """ + + def __init__(self, prediction_type): + self.check(prediction_type) + self.feature_names = None + self.target = None + self.prediction_type = prediction_type + # TODO should we define some params of RF to avoid long computation ? + if prediction_type == DkuModelAccessorConstants.CLASSIFICATION_TYPE: + self.clf = RandomForestClassifier(random_state=1407) + else: + self.clf = RandomForestRegressor(random_state=1407) + + def check(self, prediction_type): + if prediction_type not in [DkuModelAccessorConstants.CLASSIFICATION_TYPE, + DkuModelAccessorConstants.REGRRSSION_TYPE]: + raise ValueError('Prediction type must either be CLASSIFICATION or REGRESSION.') + + def get_features(self): + return self.feature_names + + def fit(self, df, target): + preprocessor = Preprocessor(df, target) + train, test = preprocessor.get_processed_train_test() + train_X = train.drop(target, axis=1) + train_Y = train[target] + self.clf.fit(train_X, train_Y) + self.feature_names = train_X.columns From fba767f3ea0bf78c58b611451d244b4a84c6512a Mon Sep 17 00:00:00 2001 From: Du Phan Date: Wed, 30 Dec 2020 12:22:23 +0100 Subject: [PATCH 2/4] add .idea to .gitingore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 23338a3..c6e347d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ plugenv +.idea/ # Byte-compiled / optimized / DLL files __pycache__/ @@ -138,4 +139,4 @@ dmypy.json .pytype/ # Cython debug symbols -cython_debug/ \ No newline at end of file +cython_debug/ From 7a00fbe63132862fca206fa5c9eb6bb1fdc791c7 Mon Sep 17 00:00:00 2001 From: Du Phan Date: Wed, 30 Dec 2020 12:26:44 +0100 Subject: [PATCH 3/4] add requirements --- dkulib/dku_model_accessor/requirements.txt | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 dkulib/dku_model_accessor/requirements.txt diff --git a/dkulib/dku_model_accessor/requirements.txt b/dkulib/dku_model_accessor/requirements.txt new file mode 100644 index 0000000..5fcf95b 
--- /dev/null +++ b/dkulib/dku_model_accessor/requirements.txt @@ -0,0 +1,8 @@ +flask==1.1.2 +future==0.18.2 +joblib==0.14.1 +numpy==1.16.6 +simplejson==3.17.2 +scikit-learn==0.20.2 +scipy==1.1.0 +xgboost==0.81 \ No newline at end of file From 6b97942234e61c24ad1d3bf4bb966a66d6d9a9d2 Mon Sep 17 00:00:00 2001 From: Du Phan Date: Wed, 30 Dec 2020 12:45:49 +0100 Subject: [PATCH 4/4] add readme --- README.md | 3 ++ dkulib/dku_model_accessor/README.md | 42 ++++++++++++++++++++++++++ dkulib/dku_model_accessor/constants.py | 2 +- 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 dkulib/dku_model_accessor/README.md diff --git a/README.md b/README.md index 219b1ad..daa0746 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,9 @@ This repo contains reusable code for DSS Plugin development. - [dku_config](https://github.com/dataiku/dss-plugin-dkulib/tree/main/dkulib/dku_config) - Gives the ability to check forms parameters in backend and to display understandable messages if fails. +- [dku_model_accessor](https://github.com/dataiku/dss-plugin-dkulib/tree/main/dkulib/dku_model_accessor) - +Gives the ability to interact with saved model data. + ## Licence This library is distributed under the Apache License version 2.0 \ No newline at end of file diff --git a/dkulib/dku_model_accessor/README.md b/dkulib/dku_model_accessor/README.md new file mode 100644 index 0000000..324ab36 --- /dev/null +++ b/dkulib/dku_model_accessor/README.md @@ -0,0 +1,42 @@ +# DKU Model Accessor + +## Description +This lib provides tools to interact with DSS saved model data (for example, getting the original train/test set). + +It includes a surrogate model and a doctor-like default preprocessor, which make it possible to retrieve the feature importance of any non-tree-based model. + +It uses an internal API, `dataiku.doctor.posttraining.model_information_handler.PredictionModelInformationHandler` (merci mamène Coni), so beware of future API breakage. 
+ + +## Examples + + +```python +from dku_model_accessor import get_model_handler, ModelAccessor + +model_id = 'XQyU0TO0' +model = dataiku.Model(model_id) +model_handler = get_model_handler(model) +model_accessor = ModelAccessor(model_handler) + +original_test_set = model_accessor.get_original_test_df() +feature_importance = model_accessor.get_feature_importance() # works for any model +selected_features = model_accessor.get_selected_features() +``` + +## Projects using the library + +Don't hesitate to check these plugins using the library for more examples: + +- [dss-plugin-model-drift](https://github.com/dataiku/dss-plugin-model-drift) +- [dss-plugin-model-fairness-report](https://github.com/dataiku/dss-plugin-model-fairness-report) +- [dss-plugin-model-error-analysis](https://github.com/dataiku/dss-plugin-model-error-analysis) + +## Version + +- Version: 0.1.0 +- State: Supported + +## Credit + +Library created by Du Phan. \ No newline at end of file diff --git a/dkulib/dku_model_accessor/constants.py b/dkulib/dku_model_accessor/constants.py index 8328ad1..8758205 100644 --- a/dkulib/dku_model_accessor/constants.py +++ b/dkulib/dku_model_accessor/constants.py @@ -8,7 +8,7 @@ class DkuModelAccessorConstants(object): DKU_BINARY_CLASSIF = 'BINARY_CLASSIFICATION' CLASSIFICATION_TYPE = 'CLASSIFICATION' CLUSTERING_TYPE = 'CLUSTERING' - MAX_NUM_ROW = 10000000 + MAX_NUM_ROW = 1000000 CUMULATIVE_PERCENTAGE_THRESHOLD = 90 SURROGATE_TARGET = "_dku_predicted_label_" FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD = 95