From f5af1ce502c8805933c163cd7e537bf28be292f7 Mon Sep 17 00:00:00 2001
From: Du Phan <du.phan@dataiku.com>
Date: Wed, 30 Dec 2020 12:22:10 +0100
Subject: [PATCH 1/4] add dku_model_accessor

---
 dkulib/dku_model_accessor/__init__.py        |   3 +
 dkulib/dku_model_accessor/constants.py       |  22 +++
 dkulib/dku_model_accessor/model_accessor.py  | 136 +++++++++++++++++++
 dkulib/dku_model_accessor/model_metadata.py  |  60 ++++++++
 dkulib/dku_model_accessor/preprocessing.py   | 118 ++++++++++++++++
 dkulib/dku_model_accessor/surrogate_model.py |  41 ++++++
 6 files changed, 380 insertions(+)
 create mode 100644 dkulib/dku_model_accessor/__init__.py
 create mode 100644 dkulib/dku_model_accessor/constants.py
 create mode 100644 dkulib/dku_model_accessor/model_accessor.py
 create mode 100644 dkulib/dku_model_accessor/model_metadata.py
 create mode 100644 dkulib/dku_model_accessor/preprocessing.py
 create mode 100644 dkulib/dku_model_accessor/surrogate_model.py

diff --git a/dkulib/dku_model_accessor/__init__.py b/dkulib/dku_model_accessor/__init__.py
new file mode 100644
index 0000000..738d379
--- /dev/null
+++ b/dkulib/dku_model_accessor/__init__.py
@@ -0,0 +1,3 @@
+from dku_model_accessor.model_accessor import ModelAccessor
+from dku_model_accessor.model_metadata import get_model_handler
+from dku_model_accessor.constants import DkuModelAccessorConstants
diff --git a/dkulib/dku_model_accessor/constants.py b/dkulib/dku_model_accessor/constants.py
new file mode 100644
index 0000000..8328ad1
--- /dev/null
+++ b/dkulib/dku_model_accessor/constants.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+
+class DkuModelAccessorConstants(object):
+    MODEL_ID = 'model_id'
+    VERSION_ID = 'version_id'
+    REGRRSSION_TYPE = 'REGRESSION'
+    DKU_MULTICLASS_CLASSIF = 'MULTICLASS'
+    DKU_BINARY_CLASSIF = 'BINARY_CLASSIFICATION'
+    CLASSIFICATION_TYPE = 'CLASSIFICATION'
+    CLUSTERING_TYPE = 'CLUSTERING'
+    MAX_NUM_ROW = 10000000
+    CUMULATIVE_PERCENTAGE_THRESHOLD = 90
+    SURROGATE_TARGET = "_dku_predicted_label_"
+    FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD = 95
+    CUMULATIVE_IMPORTANCE = 'cumulative_importance'
+    FEATURE = 'feature'
+    IMPORTANCE = 'importance'
+    RANK = 'rank'
+    CLASS = 'class'
+    PERCENTAGE = 'percentage'
+    DKU_XGBOOST_CLASSIF = 'XGBOOST_CLASSIFICATION'
+    DKU_XGBOOST_REGRESSION = 'XGBOOST_REGRESSION'
diff --git a/dkulib/dku_model_accessor/model_accessor.py b/dkulib/dku_model_accessor/model_accessor.py
new file mode 100644
index 0000000..f08e1ad
--- /dev/null
+++ b/dkulib/dku_model_accessor/model_accessor.py
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+import logging
+import pandas as pd
+from dku_model_accessor.constants import DkuModelAccessorConstants
+from dku_model_accessor.surrogate_model import SurrogateModel
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, \
+    GradientBoostingRegressor, ExtraTreesClassifier, ExtraTreesRegressor
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+
+logger = logging.getLogger(__name__)
+
+ALGORITHMS_WITH_VARIABLE_IMPORTANCE = [RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier,
+                                       GradientBoostingRegressor, ExtraTreesClassifier, ExtraTreesRegressor,
+                                       DecisionTreeClassifier, DecisionTreeRegressor]
+
+
+class ModelAccessor(object):
+    """
+    Wrapper for our internal object PredictionModelInformationHandler
+    """
+    def __init__(self, model_handler=None):
+        """
+        model_handler: PredictionModelInformationHandler object
+        """
+        self.model_handler = model_handler
+
+    def get_prediction_type(self):
+        """
+        Wrap the prediction type accessor of the model
+        """
+        if self.model_handler.get_prediction_type() in [DkuModelAccessorConstants.DKU_BINARY_CLASSIF, DkuModelAccessorConstants.DKU_MULTICLASS_CLASSIF]:
+            return DkuModelAccessorConstants.CLASSIFICATION_TYPE
+        elif DkuModelAccessorConstants.REGRRSSION_TYPE == self.model_handler.get_prediction_type():
+            return DkuModelAccessorConstants.REGRRSSION_TYPE
+        else:
+            return DkuModelAccessorConstants.CLUSTERING_TYPE
+
+    def get_target_variable(self):
+        """
+        Return the name of the target variable
+        """
+        return self.model_handler.get_target_variable()
+
+    def get_original_test_df(self, limit=DkuModelAccessorConstants.MAX_NUM_ROW):
+        """Return at most `limit` rows of the test set, or of the full dataset when the test set cannot be loaded."""
+        source = 'original test set'
+        try:
+            full_df = self.model_handler.get_test_df()[0]
+        except Exception as e:
+            logger.warning('Can not retrieve original test set: {}. The plugin will take the whole original dataset.'.format(e))
+            full_df = self.model_handler.get_full_df()[0]
+            source = 'whole original dataset'  # the fallback is not a test set, so do not log it as one
+        test_df = full_df[:limit]
+        logger.info('Loading {}/{} rows of the {}'.format(len(test_df), len(full_df), source))
+        return test_df
+
+    def get_train_df(self, limit=DkuModelAccessorConstants.MAX_NUM_ROW):
+        full_train_df = self.model_handler.get_train_df()[0]
+        train_df = full_train_df[:limit]
+        logger.info('Loading {}/{} rows of the original train set'.format(len(train_df), len(full_train_df)))
+        return train_df
+
+    def get_per_feature(self):
+        return self.model_handler.get_per_feature()
+
+    def get_predictor(self):
+        return self.model_handler.get_predictor()
+
+    def get_feature_importance(self, cumulative_percentage_threshold=DkuModelAccessorConstants.FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD):
+        """
+        :param cumulative_percentage_threshold: only return the top n features whose sum of importance reaches this threshold
+        :return: DataFrame indexed by feature name, with rank, importance (in percent) and cumulative importance columns
+        """
+        if self._algorithm_is_tree_based():
+            predictor = self.get_predictor()
+            clf = predictor._clf
+            feature_names = predictor.get_features()
+            feature_importances = clf.feature_importances_
+
+        else:  # use surrogate model
+            logger.info('Fitting surrogate model ...')
+            surrogate_model = SurrogateModel(self.get_prediction_type())
+            original_test_df = self.get_original_test_df()
+            predictions_on_original_test_df = self.get_predictor().predict(original_test_df)
+            # Explicit copy: the target column added below must not be written onto a selection of the test set
+            surrogate_df = original_test_df[self.get_selected_features()].copy()
+            surrogate_df[DkuModelAccessorConstants.SURROGATE_TARGET] = predictions_on_original_test_df['prediction']
+            surrogate_model.fit(surrogate_df, DkuModelAccessorConstants.SURROGATE_TARGET)
+            feature_names = surrogate_model.get_features()
+            feature_importances = surrogate_model.clf.feature_importances_
+
+        total_importance = sum(feature_importances)  # loop-invariant: computed once, not once per feature
+        feature_importance = [{
+            DkuModelAccessorConstants.FEATURE: feature_name,
+            DkuModelAccessorConstants.IMPORTANCE: 100 * feat_importance / total_importance
+        } for feature_name, feat_importance in zip(feature_names, feature_importances)]
+
+        dfx = pd.DataFrame(feature_importance).sort_values(by=DkuModelAccessorConstants.IMPORTANCE,
+                                                           ascending=False).reset_index(drop=True)
+        dfx[DkuModelAccessorConstants.CUMULATIVE_IMPORTANCE] = dfx[DkuModelAccessorConstants.IMPORTANCE].cumsum()
+        dfx_top = dfx.loc[dfx[DkuModelAccessorConstants.CUMULATIVE_IMPORTANCE] <= cumulative_percentage_threshold]
+        return dfx_top.rename_axis(DkuModelAccessorConstants.RANK).reset_index().set_index(
+            DkuModelAccessorConstants.FEATURE)
+
+    def get_selected_features(self):
+        """
+        Return only features used in the model
+        """
+        selected_features = []
+        for feat, feat_info in self.get_per_feature().items():
+            if feat_info.get('role') == 'INPUT':
+                selected_features.append(feat)
+        return selected_features
+
+    def get_selected_and_rejected_features(self):
+        """
+        Return all features in the input dataset except the target
+        """
+        selected_features = []
+        for feat, feat_info in self.get_per_feature().items():
+            if feat_info.get('role') in ['INPUT', 'REJECT']:
+                selected_features.append(feat)
+        return selected_features
+
+    def predict(self, df):
+        return self.get_predictor().predict(df)
+
+    def _algorithm_is_tree_based(self):
+        """Return True if the modeling params name an XGBoost algorithm or the estimator is in ALGORITHMS_WITH_VARIABLE_IMPORTANCE."""
+        predictor = self.get_predictor()
+        # The XGBoost test does not depend on the sklearn class, so it is checked once instead of on every loop turn.
+        xgboost_algorithms = (DkuModelAccessorConstants.DKU_XGBOOST_CLASSIF,
+                              DkuModelAccessorConstants.DKU_XGBOOST_REGRESSION)
+        if predictor.params.modeling_params.get('algorithm') in xgboost_algorithms:
+            return True
+        return isinstance(predictor._clf, tuple(ALGORITHMS_WITH_VARIABLE_IMPORTANCE))
diff --git a/dkulib/dku_model_accessor/model_metadata.py b/dkulib/dku_model_accessor/model_metadata.py
new file mode 100644
index 0000000..cb56035
--- /dev/null
+++ b/dkulib/dku_model_accessor/model_metadata.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+import json
+import os
+import sys
+from dataiku.doctor.posttraining.model_information_handler import PredictionModelInformationHandler
+
+
+def get_model_handler(model, version_id=None):
+    """
+    model: a dku saved model returned by dataiku.Model(model_id)
+    version_id: if None, the active one is chosen
+    """
+    saved_model_version_id = _get_saved_model_version_id(model, version_id)
+    return _get_model_info_handler(saved_model_version_id)
+
+
+def _get_saved_model_version_id(model, version_id=None):
+    model_def = model.get_definition()
+    if version_id is None:
+        version_id = model_def.get('activeVersion')
+    saved_model_version_id = 'S-{0}-{1}-{2}'.format(model_def.get('projectKey'), model_def.get('id'), version_id)
+    return saved_model_version_id
+
+
+def _get_model_info_handler(saved_model_version_id):
+    infos = saved_model_version_id.split("-")
+    if len(infos) != 4 or infos[0] != "S":
+        raise ValueError("Invalid saved model id")
+    pkey = infos[1]
+    model_id = infos[2]
+    version_id = infos[3]
+
+    datadir_path = os.environ['DIP_HOME']
+    version_folder = os.path.join(datadir_path, "saved_models", pkey, model_id, "versions", version_id)
+
+    # Loading and resolving paths in split_desc
+    split_folder = os.path.join(version_folder, "split")
+    with open(os.path.join(split_folder, "split.json")) as split_file:
+        split_desc = json.load(split_file)
+
+    path_field_names = ["trainPath", "testPath", "fullPath"]
+    for field_name in path_field_names:
+        if split_desc.get(field_name, None) is not None:
+            split_desc[field_name] = os.path.join(split_folder, split_desc[field_name])
+
+    with open(os.path.join(version_folder, "core_params.json")) as core_params_file:
+        core_params = json.load(core_params_file)
+
+    try:
+        return PredictionModelInformationHandler(split_desc, core_params, version_folder, version_folder)
+    except Exception as e:
+        from future.utils import raise_
+        if "ordinal not in range(128)" in str(e):
+            raise_(Exception, "The plugin only supports python3, cannot load a python2 model. Original error: {}".format(e), sys.exc_info()[2])
+        elif str(e) == "non-string names in Numpy dtype unpickling":
+            raise_(Exception, "The plugin is using a python2 code-env, cannot load a python3 model. Original error: {}".format(e), sys.exc_info()[2])
+        elif str(e) == "Using saved models in python recipes is limited to models trained using the python engine":
+            raise_(Exception, "The plugin does not support Clustering model.", sys.exc_info()[2])
+        else:
+            raise_(Exception, "Fail to load saved model: {}".format(e), sys.exc_info()[2])
diff --git a/dkulib/dku_model_accessor/preprocessing.py b/dkulib/dku_model_accessor/preprocessing.py
new file mode 100644
index 0000000..eb05970
--- /dev/null
+++ b/dkulib/dku_model_accessor/preprocessing.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+import sys
+import random
+from collections import Counter
+from datetime import datetime
+import logging
+import numpy as np
+
+logger = logging.getLogger(__name__)
+EPOCH = datetime(1900, 1, 1)
+
+
+class Preprocessor(object):
+    """
+    An object that replicates default behavior of doctor
+    """
+
+    def __init__(self, df=None, target=None):
+        self.df = df
+        self.target = target
+        self._categorical_features = []
+        self._numerical_features = []
+        self._text_features = []
+
+    def check(self):
+        if self.df is None:
+            raise ValueError('df is not specified.')
+        if self.target is None:
+            raise ValueError('target is not specified.')
+
+    def get_processed_train_test(self):
+        self._categorical_features = [x for x in self._get_categorical_features() if x != self.target]
+        self._numerical_features = self._get_numerical_features()
+        self._text_features = self._get_text_features()
+        self._parse_data()
+        raw_train, raw_test = self._get_train_test_set()
+        imputed_train, imputed_test = self._impute(raw_train, raw_test)
+        dummy_values_dict = self._select_dummy_values(imputed_train, self._categorical_features)
+        final_train = self._dummy_encode(imputed_train, dummy_values_dict)
+        final_test = self._dummy_encode(imputed_test, dummy_values_dict)
+        return final_train, final_test
+
+    def _parse_data(self):
+        def _datetime_to_epoch(series):
+            return (series - EPOCH) / np.timedelta64(1, 's')
+
+        for feature in self._categorical_features:
+            self.df[feature] = self.df[feature].apply(self._coerce_to_unicode)
+        for feature in self._text_features:
+            self.df[feature] = self.df[feature].apply(self._coerce_to_unicode)
+        for feature in self._numerical_features:
+            if self.df[feature].dtype == np.dtype('M8[ns]'):
+                self.df[feature] = _datetime_to_epoch(self.df[feature])
+            else:
+                self.df[feature] = self.df[feature].astype('double')
+
+    def _get_numerical_features(self):
+        return self.df.select_dtypes(include=['number', 'M8[ns]']).columns.tolist()
+
+    def _get_categorical_features(self):
+        return self.df.select_dtypes(include=['object', 'category']).columns.tolist()
+
+    def _get_text_features(self):
+        return []
+
+    def _coerce_to_unicode(self, x):
+        if sys.version_info < (3, 0):
+            if isinstance(x, str):
+                return unicode(x, 'utf-8')
+            else:
+                return unicode(x)
+        else:
+            return str(x)
+
+    def _select_dummy_values(self, dfx, features, LIMIT_DUMMIES = 100):
+        dummy_values = {}
+        for feature in features:
+            values = [
+                value
+                for (value, _) in Counter(dfx[feature]).most_common(LIMIT_DUMMIES)
+            ]
+            dummy_values[feature] = values
+        return dummy_values
+
+    def _get_train_test_set(self, prop=0.8, seed=1234):
+        k = int(self.df.shape[0] * prop)
+        random.seed(seed)
+        sampler = random.sample(self.df.index.tolist(), k)
+        train = self.df.loc[sampler]
+        test = self.df[~self.df.index.isin(sampler)]
+        return train, test
+
+    def _impute(self, df_train, df_test):
+        for feature in self._numerical_features:
+            v = df_train[feature].mean()
+            df_train[feature] = df_train[feature].fillna(v)
+            df_test[feature] = df_test[feature].fillna(v)
+            logger.info('Imputed missing values in feature %s with value %s' % (feature, self._coerce_to_unicode(v)))
+
+        for feature in self._categorical_features:
+            v = 'NULL_CATEGORY'
+            df_train[feature] = df_train[feature].fillna(v)
+            df_test[feature] = df_test[feature].fillna(v)
+            logger.info('Imputed missing values in feature %s with value %s' % (feature, self._coerce_to_unicode(v)))
+
+        return df_train, df_test
+
+    def _dummy_encode(self, dfx, dummy_values_dict):
+        dfx_copy = dfx.copy()
+        for (feature, dummy_values) in dummy_values_dict.items():
+            for dummy_value in dummy_values:
+                #TODO add dummy:N/A and dummy:_Others_
+                dummy_name = u'dummy:%s:%s' % (feature, self._coerce_to_unicode(dummy_value))
+                dfx_copy[dummy_name] = (dfx_copy[feature] == dummy_value).astype(float)
+            del dfx_copy[feature]
+            logger.info('Dummy-encoded feature %s' % feature)
+
+        return dfx_copy
diff --git a/dkulib/dku_model_accessor/surrogate_model.py b/dkulib/dku_model_accessor/surrogate_model.py
new file mode 100644
index 0000000..e51f182
--- /dev/null
+++ b/dkulib/dku_model_accessor/surrogate_model.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+import logging
+from dku_model_accessor.constants import DkuModelAccessorConstants
+from dku_model_accessor.preprocessing import Preprocessor
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+logger = logging.getLogger(__name__)
+
+
+class SurrogateModel(object):
+    """
+    In case the chosen saved model uses a non-tree based algorithm (and thus does not have feature importance), we fit this surrogate model
+    on top of the prediction of the former one to be able to retrieve the feature importance information.
+    """
+
+    def __init__(self, prediction_type):
+        self.check(prediction_type)
+        self.feature_names = None
+        self.target = None
+        self.prediction_type = prediction_type
+        # TODO should we define some params of RF to avoid long computation ?
+        if prediction_type == DkuModelAccessorConstants.CLASSIFICATION_TYPE:
+            self.clf = RandomForestClassifier(random_state=1407)
+        else:
+            self.clf = RandomForestRegressor(random_state=1407)
+
+    def check(self, prediction_type):
+        if prediction_type not in [DkuModelAccessorConstants.CLASSIFICATION_TYPE,
+                                   DkuModelAccessorConstants.REGRRSSION_TYPE]:
+            raise ValueError('Prediction type must either be CLASSIFICATION or REGRESSION.')
+
+    def get_features(self):
+        return self.feature_names
+
+    def fit(self, df, target):
+        preprocessor = Preprocessor(df, target)
+        train, test = preprocessor.get_processed_train_test()
+        train_X = train.drop(target, axis=1)
+        train_Y = train[target]
+        self.clf.fit(train_X, train_Y)
+        self.feature_names = train_X.columns

From fba767f3ea0bf78c58b611451d244b4a84c6512a Mon Sep 17 00:00:00 2001
From: Du Phan <du.phan@dataiku.com>
Date: Wed, 30 Dec 2020 12:22:23 +0100
Subject: [PATCH 2/4] add .idea to .gitignore

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 23338a3..c6e347d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 plugenv
+.idea/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -138,4 +139,4 @@ dmypy.json
 .pytype/
 
 # Cython debug symbols
-cython_debug/
\ No newline at end of file
+cython_debug/

From 7a00fbe63132862fca206fa5c9eb6bb1fdc791c7 Mon Sep 17 00:00:00 2001
From: Du Phan <du.phan@dataiku.com>
Date: Wed, 30 Dec 2020 12:26:44 +0100
Subject: [PATCH 3/4] add requirements

---
 dkulib/dku_model_accessor/requirements.txt | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 dkulib/dku_model_accessor/requirements.txt

diff --git a/dkulib/dku_model_accessor/requirements.txt b/dkulib/dku_model_accessor/requirements.txt
new file mode 100644
index 0000000..5fcf95b
--- /dev/null
+++ b/dkulib/dku_model_accessor/requirements.txt
@@ -0,0 +1,8 @@
+flask==1.1.2
+future==0.18.2
+joblib==0.14.1
+numpy==1.16.6
+simplejson==3.17.2
+scikit-learn==0.20.2
+scipy==1.1.0
+xgboost==0.81
\ No newline at end of file

From 6b97942234e61c24ad1d3bf4bb966a66d6d9a9d2 Mon Sep 17 00:00:00 2001
From: Du Phan <du.phan@dataiku.com>
Date: Wed, 30 Dec 2020 12:45:49 +0100
Subject: [PATCH 4/4] add readme

---
 README.md                              |  3 ++
 dkulib/dku_model_accessor/README.md    | 42 ++++++++++++++++++++++++++
 dkulib/dku_model_accessor/constants.py |  2 +-
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 dkulib/dku_model_accessor/README.md

diff --git a/README.md b/README.md
index 219b1ad..daa0746 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,9 @@ This repo contains reusable code for DSS Plugin development.
 - [dku_config](https://github.com/dataiku/dss-plugin-dkulib/tree/main/dkulib/dku_config) - 
 Gives the ability to check forms parameters in backend and to display understandable messages if fails.
 
+- [dku_model_accessor](https://github.com/dataiku/dss-plugin-dkulib/tree/main/dkulib/dku_model_accessor) - 
+Gives the ability to interact with saved model data.
+
 ## Licence
 
 This library is distributed under the Apache License version 2.0
\ No newline at end of file
diff --git a/dkulib/dku_model_accessor/README.md b/dkulib/dku_model_accessor/README.md
new file mode 100644
index 0000000..324ab36
--- /dev/null
+++ b/dkulib/dku_model_accessor/README.md
@@ -0,0 +1,42 @@
+# DKU Model Accessor
+
+## Description
+This library provides tools to interact with DSS saved model data (for example, retrieving the original train/test sets).
+
+It includes a surrogate model and a doctor-like default preprocessor, which make it possible to retrieve the feature importance of any non-tree-based model.
+
+It uses an internal API, `dataiku.doctor.posttraining.model_information_handler.PredictionModelInformationHandler` (merci mamène Coni), so beware of future API breakage.
+
+
+## Examples
+
+
+```python
+import dataiku
+from dku_model_accessor import get_model_handler, ModelAccessor
+model_id = 'XQyU0TO0'
+model = dataiku.Model(model_id)
+model_handler = get_model_handler(model)
+model_accessor = ModelAccessor(model_handler)
+
+original_test_set = model_accessor.get_original_test_df()
+feature_importance = model_accessor.get_feature_importance() # works for any model
+selected_features = model_accessor.get_selected_features()
+```
+
+## Projects using the library
+
+For more examples, check out these plugins that use the library:
+
+- [dss-plugin-model-drift](https://github.com/dataiku/dss-plugin-model-drift)
+- [dss-plugin-model-fairness-report](https://github.com/dataiku/dss-plugin-model-fairness-report)
+- [dss-plugin-model-error-analysis](https://github.com/dataiku/dss-plugin-model-error-analysis)
+
+## Version
+
+- Version: 0.1.0
+- State: <span style="color:green">Supported</span>
+
+## Credit
+
+Library created by Du Phan.
\ No newline at end of file
diff --git a/dkulib/dku_model_accessor/constants.py b/dkulib/dku_model_accessor/constants.py
index 8328ad1..8758205 100644
--- a/dkulib/dku_model_accessor/constants.py
+++ b/dkulib/dku_model_accessor/constants.py
@@ -8,7 +8,7 @@ class DkuModelAccessorConstants(object):
     DKU_BINARY_CLASSIF = 'BINARY_CLASSIFICATION'
     CLASSIFICATION_TYPE = 'CLASSIFICATION'
     CLUSTERING_TYPE = 'CLUSTERING'
-    MAX_NUM_ROW = 10000000
+    MAX_NUM_ROW = 1000000
     CUMULATIVE_PERCENTAGE_THRESHOLD = 90
     SURROGATE_TARGET = "_dku_predicted_label_"
     FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD = 95