From 3c21a6904c0f3ccd7eff81930128bbc8ce0321b6 Mon Sep 17 00:00:00 2001 From: AllenBaranov Date: Mon, 8 Dec 2025 01:06:55 -0500 Subject: [PATCH 1/2] Added 1D K-means binning as an alternate discretization strategy --- sigllm/core.py | 18 ++- .../pipelines/detector/mistral_detector.json | 5 +- ...rimitives.transformation.Float2Scalar.json | 22 ++- ...rimitives.transformation.Scalar2Float.json | 12 +- sigllm/primitives/transformation.py | 136 ++++++++++++++---- 5 files changed, 141 insertions(+), 52 deletions(-) diff --git a/sigllm/core.py b/sigllm/core.py index 0008002..1059d94 100644 --- a/sigllm/core.py +++ b/sigllm/core.py @@ -15,7 +15,7 @@ LOGGER = logging.getLogger(__name__) INTERVAL_PRIMITIVE = 'mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1' -DECIMAL_PRIMITIVE = 'sigllm.primitives.transformation.Float2Scalar#1' +FLOAT2SCALAR_PRIMITIVE = 'sigllm.primitives.transformation.Float2Scalar#1' WINDOW_SIZE_PRIMITIVE = 'sigllm.primitives.forecasting.custom.rolling_window_sequences#1' @@ -35,8 +35,12 @@ class SigLLM(Orion): * A ``dict`` with an ``MLPipeline`` specification. interval (int): Number of time points between one sample and another. + strategy (str): + Discretization strategy: 'scaling' or 'binning'. Default to 'scaling'. decimal (int): - Number of decimal points to keep from the float representation. + Number of decimal points to keep (scaling strategy only). + n_clusters (int): + Number of clusters for binning (binning strategy only). window_size (int): Size of the input window. 
hyperparameters (dict): @@ -46,7 +50,7 @@ class SigLLM(Orion): DEFAULT_PIPELINE = 'mistral_detector' def _augment_hyperparameters(self, primitive, key, value): - if not value: + if value is None: return if self._hyperparameters is None: @@ -61,7 +65,9 @@ def __init__( self, pipeline: Union[str, dict, MLPipeline] = None, interval: int = None, + strategy: str = None, decimal: int = None, + n_clusters: int = None, window_size: int = None, hyperparameters: dict = None, ): @@ -71,11 +77,15 @@ def __init__( self._fitted = False self.interval = interval + self.strategy = strategy self.decimal = decimal + self.n_clusters = n_clusters self.window_size = window_size self._augment_hyperparameters(INTERVAL_PRIMITIVE, 'interval', interval) - self._augment_hyperparameters(DECIMAL_PRIMITIVE, 'decimal', decimal) + self._augment_hyperparameters(FLOAT2SCALAR_PRIMITIVE, 'strategy', strategy) + self._augment_hyperparameters(FLOAT2SCALAR_PRIMITIVE, 'decimal', decimal) + self._augment_hyperparameters(FLOAT2SCALAR_PRIMITIVE, 'n_clusters', n_clusters) self._augment_hyperparameters(WINDOW_SIZE_PRIMITIVE, 'window_size', window_size) def __repr__(self): diff --git a/sigllm/pipelines/detector/mistral_detector.json b/sigllm/pipelines/detector/mistral_detector.json index ecd7d3e..a76de94 100644 --- a/sigllm/pipelines/detector/mistral_detector.json +++ b/sigllm/pipelines/detector/mistral_detector.json @@ -20,8 +20,7 @@ "method": "mean" }, "sigllm.primitives.transformation.Float2Scalar#1": { - "decimal": 2, - "rescale": true + "n_clusters": 100 }, "sigllm.primitives.forecasting.custom.rolling_window_sequences#1": { "target_column": 0, @@ -33,7 +32,7 @@ }, "sigllm.primitives.forecasting.huggingface.HF#1": { "name": "mistralai/Mistral-7B-Instruct-v0.2", - "steps": 5 + "steps": 2 }, "sigllm.primitives.transformation.format_as_integer#1": { "trunc": 1, diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.Float2Scalar.json 
b/sigllm/primitives/jsons/sigllm.primitives.transformation.Float2Scalar.json index 3bdaae2..811ab35 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.transformation.Float2Scalar.json +++ b/sigllm/primitives/jsons/sigllm.primitives.transformation.Float2Scalar.json @@ -4,7 +4,7 @@ "Sarah Alnegheimish ", "Linh Nguyen " ], - "description": "Transform float values into scalar.", + "description": "Transform float values into scalar using either scaling (multiply by 10^decimal) or binning (K-means clustering).", "classifiers": { "type": "preprocessor", "subtype": "transformer" @@ -34,23 +34,31 @@ "type": "ndarray" }, { - "name": "minimum", - "type": "float" - }, - { - "name": "decimal", - "type": "int" + "name": "metadata", + "type": "dict" } ] }, "hyperparameters": { "fixed": { + "strategy": { + "type": "str", + "description": "Discretization strategy: 'scaling' or 'binning'", + "default": "scaling" + }, + "n_clusters": { + "type": "int", + "description": "Number of clusters (binning strategy only)", + "default": 100 + }, "decimal": { "type": "int", + "description": "Number of decimal places (scaling strategy only)", "default": 2 }, "rescale": { "type": "bool", + "description": "Rescale so minimum becomes 0 (scaling strategy only)", "default": true } } diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Float.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Float.json index 2aefbaf..ba21a7a 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Float.json +++ b/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Float.json @@ -4,7 +4,7 @@ "Sarah Alnegheimish ", "Linh Nguyen " ], - "description": "Transform scalar values to float.", + "description": "Transform scalar values to float using metadata from Float2Scalar.", "classifiers": { "type": "preprocessor", "subtype": "transformer" @@ -19,14 +19,8 @@ "type": "ndarray" }, { - "name": "minimum", - "type": "float", - "default": 0 - }, - { 
- "name": "decimal", - "type": "int", - "default": 2 + "name": "metadata", + "type": "dict" } ], "output": [ diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index 5965861..f3c1381 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -5,6 +5,7 @@ import numpy as np +from sklearn.cluster import KMeans def format_as_string(X, sep=',', space=False, single=False): """Format X to a list of string. @@ -141,46 +142,123 @@ class Float2Scalar: Whether to rescale the array such that the minimum value becomes 0. Default to `True`. """ - def __init__(self, decimal=2, rescale=True): + def __init__(self, strategy='scaling', n_clusters=100, decimal=2, rescale=True): + self.strategy = strategy + self.n_clusters = n_clusters self.decimal = decimal self.rescale = rescale + + # State variables self.minimum = None + self.centroids = None + self.labels = None def fit(self, X): - """Learn minimum value in fit data.""" - self.minimum = np.min(X) + """Learn parameters from data. + + For scaling: learns the minimum value. + For binning: learns K-means cluster centroids. + """ + if self.strategy == 'scaling': + self.minimum = np.min(X) + elif self.strategy == 'binning': + centroids_list = [] + labels = [] + for col in X.T: + if self.n_clusters >= len(np.unique(col)): + centroids = np.unique(col) + else: + kmeans = KMeans(n_clusters=self.n_clusters, random_state=0) + kmeans.fit(col.reshape(-1, 1)) + centroids = np.sort(kmeans.cluster_centers_.ravel()) + + col_labels = np.argmin(np.abs(col[:, None] - centroids[None, :]), axis=1) + + labels.append(col_labels) + centroids_list.append(centroids) + + self.labels = np.column_stack(labels) + self.centroids = centroids_list + else: + raise ValueError(f"Unknown strategy '{self.strategy}'. 
Use 'scaling' or 'binning'.") def transform(self, X): - """Transform data.""" - if self.rescale: - X = X - self.minimum - - sign = 1 * (X >= 0) - 1 * (X < 0) - values = np.abs(X) - - values = sign * (values * 10**self.decimal).astype(int) - - return values, self.minimum, self.decimal + """Transform data to integer representation. + + Returns: + tuple: (values, metadata) where metadata is a dict containing: + - For scaling: {'strategy': 'scaling', 'minimum': float, 'decimal': int} + - For binning: {'strategy': 'binning', 'centroids': list} + """ + print(f"[Float2Scalar] Using strategy: {self.strategy}") + if self.strategy == 'scaling': + if self.rescale: + X = X - self.minimum + + sign = 1 * (X >= 0) - 1 * (X < 0) + values = np.abs(X) + + values = sign * np.round(values * 10**self.decimal).astype(int) + + metadata = { + 'strategy': 'scaling', + 'minimum': self.minimum, + 'decimal': self.decimal + } + return values, metadata + + elif self.strategy == 'binning': + # Re-fit to get labels for this X (transform is same as fit for binning) + self.fit(X) + metadata = { + 'strategy': 'binning', + 'centroids': self.centroids + } + return self.labels, metadata + + else: + raise ValueError(f"Unknown strategy '{self.strategy}'. Use 'scaling' or 'binning'.") class Scalar2Float: """Convert an array of integer values to float. - Transforms an array of integers to an array floats. - Shift values by minimum and include a predetermined - number of decimal points. - - 105, 200, 310, 483, 500, 0 -> 1.05, 2., 3.1, 4.8342, 5, 0 - - Args: - minimum (float): - Bias to shift the data. Captured from Float2Scalar. - decimal (int): - Number of decimal points to keep from the float representation. Default to `2`. + Transforms an array of integers back to floats using the metadata from Float2Scalar. + + - 'scaling': Divide by 10^decimal and add minimum offset. + Example: 105, 200, 310, 483, 500, 0 -> 1.05, 2., 3.1, 4.83, 5, 0 + + - 'binning': Map cluster indices back to centroid values. 
""" - def transform(self, X, minimum=0, decimal=2): - """Convert data from integer to float.""" - values = X * 10 ** (-decimal) - - return values + minimum + def transform(self, X, metadata): + """Convert data from integer back to float. + + Args: + X (ndarray): Integer values to convert. + metadata (dict): Metadata from Float2Scalar containing strategy and parameters. + + Returns: + ndarray: Float values. + """ + strategy = metadata.get('strategy', 'binning') + print(f"[Scalar2Float] Using strategy: {strategy}") + print(f"[Scalar2Float] Full metadata: {metadata}") + + if strategy == 'scaling': + minimum = metadata.get('minimum', 0) + decimal = metadata.get('decimal', 2) + values = X * 10 ** (-decimal) + return values + minimum + + elif strategy == 'binning': + centroids = metadata.get('centroids') + if centroids is None: + raise ValueError("centroids must be provided in metadata for binning strategy") + base_centroids = np.asarray(centroids[0]) + idx = np.clip(X.astype(int), 0, len(base_centroids) - 1) + X_pred = np.take(base_centroids, idx) + return X_pred + + else: + raise ValueError(f"Unknown strategy '{strategy}'. 
Use 'scaling' or 'binning'.") From ff000bebf891ae6689cee84113437efcfbd48858 Mon Sep 17 00:00:00 2001 From: AllenBaranov Date: Mon, 8 Dec 2025 13:11:10 -0500 Subject: [PATCH 2/2] Simplify docstrings and use scaling as default for backwards compatibility --- .../pipelines/detector/mistral_detector.json | 5 +-- ...rimitives.transformation.Float2Scalar.json | 6 +--- ...rimitives.transformation.Scalar2Float.json | 2 +- sigllm/primitives/transformation.py | 35 ++----------------- 4 files changed, 8 insertions(+), 40 deletions(-) diff --git a/sigllm/pipelines/detector/mistral_detector.json b/sigllm/pipelines/detector/mistral_detector.json index a76de94..ecd7d3e 100644 --- a/sigllm/pipelines/detector/mistral_detector.json +++ b/sigllm/pipelines/detector/mistral_detector.json @@ -20,7 +20,8 @@ "method": "mean" }, "sigllm.primitives.transformation.Float2Scalar#1": { - "n_clusters": 100 + "decimal": 2, + "rescale": true }, "sigllm.primitives.forecasting.custom.rolling_window_sequences#1": { "target_column": 0, @@ -32,7 +33,7 @@ }, "sigllm.primitives.forecasting.huggingface.HF#1": { "name": "mistralai/Mistral-7B-Instruct-v0.2", - "steps": 2 + "steps": 5 }, "sigllm.primitives.transformation.format_as_integer#1": { "trunc": 1, diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.Float2Scalar.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.Float2Scalar.json index 811ab35..2f3bbc9 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.transformation.Float2Scalar.json +++ b/sigllm/primitives/jsons/sigllm.primitives.transformation.Float2Scalar.json @@ -4,7 +4,7 @@ "Sarah Alnegheimish ", "Linh Nguyen " ], - "description": "Transform float values into scalar using either scaling (multiply by 10^decimal) or binning (K-means clustering).", + "description": "Transform float values into scalar.", "classifiers": { "type": "preprocessor", "subtype": "transformer" @@ -43,22 +43,18 @@ "fixed": { "strategy": { "type": "str", - "description": "Discretization 
strategy: 'scaling' or 'binning'", "default": "scaling" }, "n_clusters": { "type": "int", - "description": "Number of clusters (binning strategy only)", "default": 100 }, "decimal": { "type": "int", - "description": "Number of decimal places (scaling strategy only)", "default": 2 }, "rescale": { "type": "bool", - "description": "Rescale so minimum becomes 0 (scaling strategy only)", "default": true } } diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Float.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Float.json index ba21a7a..7fe7bec 100644 --- a/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Float.json +++ b/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Float.json @@ -4,7 +4,7 @@ "Sarah Alnegheimish ", "Linh Nguyen " ], - "description": "Transform scalar values to float using metadata from Float2Scalar.", + "description": "Transform scalar values to float.", "classifiers": { "type": "preprocessor", "subtype": "transformer" diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py index f3c1381..64d0c25 100644 --- a/sigllm/primitives/transformation.py +++ b/sigllm/primitives/transformation.py @@ -127,28 +127,13 @@ def format_as_integer(X, sep=',', trunc=None, errors='ignore'): class Float2Scalar: - """Convert an array of float values to scalar. - - Transforms an array of floats to an array integers. With the - option to rescale such that the minimum value becomes zero - and you can keep certain decimal points. - - 1.05, 2., 3.1, 4.8342, 5, 0 -> 105, 200, 310, 483, 500, 0 - - Args: - decimal (int): - Number of decimal points to keep from the float representation. Default to `2`. - rescale (bool): - Whether to rescale the array such that the minimum value becomes 0. Default to `True`. 
- """ + """Convert an array of float values to scalar.""" def __init__(self, strategy='scaling', n_clusters=100, decimal=2, rescale=True): self.strategy = strategy self.n_clusters = n_clusters self.decimal = decimal self.rescale = rescale - - # State variables self.minimum = None self.centroids = None self.labels = None @@ -183,13 +168,7 @@ def fit(self, X): raise ValueError(f"Unknown strategy '{self.strategy}'. Use 'scaling' or 'binning'.") def transform(self, X): - """Transform data to integer representation. - - Returns: - tuple: (values, metadata) where metadata is a dict containing: - - For scaling: {'strategy': 'scaling', 'minimum': float, 'decimal': int} - - For binning: {'strategy': 'binning', 'centroids': list} - """ + """Transform data.""" print(f"[Float2Scalar] Using strategy: {self.strategy}") if self.strategy == 'scaling': if self.rescale: @@ -232,15 +211,7 @@ class Scalar2Float: """ def transform(self, X, metadata): - """Convert data from integer back to float. - - Args: - X (ndarray): Integer values to convert. - metadata (dict): Metadata from Float2Scalar containing strategy and parameters. - - Returns: - ndarray: Float values. - """ + """Transform data.""" strategy = metadata.get('strategy', 'binning') print(f"[Scalar2Float] Using strategy: {strategy}") print(f"[Scalar2Float] Full metadata: {metadata}")