From 2dc7e16f538fb71c3458cc80d1e8a1f6a586c2db Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 24 Nov 2025 10:13:55 +0100 Subject: [PATCH 1/6] =?UTF-8?q?=E2=9C=A8=20DropQuantile=20first=20working?= =?UTF-8?q?=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_processing.py | 88 +++++++++++++++++++ tide/processing.py | 179 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 267 insertions(+) diff --git a/tests/test_processing.py b/tests/test_processing.py index 518d340..d8f0870 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -38,6 +38,7 @@ KeepColumns, ReplaceTag, AddFourierPairs, + DropQuantile ) RESOURCES_PATH = Path(__file__).parent / "resources" @@ -1106,3 +1107,90 @@ def test_add_fourier_pairs(self): "1 day, 0:00:00_order_1_Sine__W__BLOCK__SUB_BLOCK", "1 day, 0:00:00_order_1_Cosine__W__BLOCK__SUB_BLOCK", ] + + def test_drop_quantile(self): + index = pd.date_range( + "2009-01-01", "2009-01-01 23:00:00", freq="15min", tz="UTC" + ) + cumsum_second = np.arange( + start=0, stop=(index[-1] - index[0]).total_seconds() + 1, step=15 * 60 + ) + + daily = 5 * np.sin( + 2 * np.pi / dt.timedelta(days=1).total_seconds() * cumsum_second + ) + + twice_daily = 5 * np.sin( + 2 * np.pi / dt.timedelta(hours=12).total_seconds() * cumsum_second + ) + + rng = np.random.default_rng(42) + + toy_df = pd.DataFrame( + { + "Temp_1": daily + rng.standard_normal(daily.shape[0]), + "Temp_2": twice_daily + 2 * rng.standard_normal(twice_daily.shape[0]), + }, + index=index, + ) + + dropper = DropQuantile( + upper_quantile=0.75, lower_quantile=0.25, n_iqr=1.5, detrend_method="Gaussian" + ) + + quant_filtered = dropper.fit_transform(toy_df) + + ref_temp_1 = pd.Series( + [np.nan], + pd.DatetimeIndex( + [pd.Timestamp("2009-01-01 07:30:00+00:00", tz="UTC")], freq="15min" + ), + name="Temp_1", + ) + + ref_temp_2 = pd.Series( + [np.nan], + pd.DatetimeIndex( + [pd.Timestamp("2009-01-01 11:30:00+0000", 
tz="UTC")], freq="15min" + ), + name="Temp_2", + ) + + pd.testing.assert_series_equal( + quant_filtered["Temp_1"][quant_filtered["Temp_1"].isna()], + ref_temp_1, + ) + + pd.testing.assert_series_equal( + quant_filtered["Temp_2"][quant_filtered["Temp_2"].isna()], + ref_temp_2, + ) + + # No filtering method + toy_df = pd.DataFrame( + { + "Noise_1": rng.standard_normal(daily.shape[0]), + "Noise_2": rng.standard_normal(twice_daily.shape[0]), + }, + index=index, + ) + + dropper = DropQuantile(upper_quantile=0.75, lower_quantile=0.25, n_iqr=1.5) + + filtered_noise = dropper.fit_transform(toy_df) + + pd.testing.assert_index_equal( + filtered_noise["Noise_1"][filtered_noise["Noise_1"].isna()].index, + pd.DatetimeIndex( + ["2009-01-01 10:00:00+00:00", "2009-01-01 15:30:00+00:00"], + dtype="datetime64[ns, UTC]", + freq=None, + ), + ) + + pd.testing.assert_index_equal( + filtered_noise["Noise_2"][filtered_noise["Noise_2"].isna()].index, + pd.DatetimeIndex( + ["2009-01-01 03:30:00+00:00"], dtype="datetime64[ns, UTC]", freq=None + ), + ) diff --git a/tide/processing.py b/tide/processing.py index 4aaa757..2d52213 100644 --- a/tide/processing.py +++ b/tide/processing.py @@ -1,6 +1,7 @@ import pandas as pd import numpy as np import datetime as dt +import warnings from functools import partial from collections.abc import Callable @@ -3018,3 +3019,181 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): ) return X + + +QUANTILE_METHODS = { + "Gaussian": (gaussian_filter1d, dict(sigma=5, truncate=5, mode="wrap")) +} + + +class DropQuantile(BaseProcessing): + """ + Filter outliers in time series data using quantile-based thresholds. + + This processor identifies and replaces outlier values with NaN based on + quantile boundaries. It can optionally apply detrending methods before + computing quantiles, making it robust against seasonal patterns or trends. + + Parameters + ---------- + upper_quantile : float, default=1.0 + Upper quantile threshold for outlier detection. 
Values above this + quantile are considered potential outliers. Must be in range [0, 1]. + lower_quantile : float, default=0.0 + Lower quantile threshold for outlier detection. Values below this + quantile are considered potential outliers. Must be in range [0, 1]. + n_iqr : float, optional + Number of interquartile ranges (IQR) to extend beyond the quantile + thresholds. When specified, the bounds are adjusted as: + lower_bound = q_low - n_iqr * IQR + upper_bound = q_up + n_iqr * IQR + Commonly used with quantiles 0.25 and 0.75 (the first and third quartiles). + detrend_method : str, optional + Name of the detrending method to apply before computing quantiles. + Available methods depend on QUANTILE_METHODS dictionary. + At the moment only 'Gaussian' is available. + If None, quantiles are computed directly on raw data. + method_args : dict, optional + Additional keyword arguments to pass to the detrending method. + For example: {'sigma': 3} to override the Gaussian filter's default sigma. + + Attributes + ---------- + upper_quantile : float + Stored upper quantile threshold. + lower_quantile : float + Stored lower quantile threshold. + n_iqr : float or None + Stored IQR multiplier. + detrend_method : str or None + Stored detrending method name. + method_args : dict or None + Stored detrending method arguments. + + Examples + -------- + Filter temperature data with outliers using IQR-based detection: + + >>> import pandas as pd + >>> import numpy as np + >>> from datetime import timedelta + >>> + >>> # Create toy temperature dataset with daily and semi-daily patterns + >>> index = pd.date_range("2009-01-01", "2009-01-01 23:00:00", + ... 
freq="15min", tz="UTC") + >>> t_seconds = np.arange(0, 24*3600 + 1, 15*60) + >>> + >>> # Daily sinusoidal pattern + >>> daily_pattern = 5 * np.sin(2 * np.pi * t_seconds / (24*3600)) + >>> + >>> # Semi-daily sinusoidal pattern + >>> semidaily_pattern = 5 * np.sin(2 * np.pi * t_seconds / (12*3600)) + >>> + >>> # Add random noise + >>> rng = np.random.default_rng(42) + >>> temp_data = pd.DataFrame({ + ... "Temp_1": daily_pattern + rng.standard_normal(len(daily_pattern)), + ... "Temp_2": semidaily_pattern + 2 * rng.standard_normal(len(semidaily_pattern)) + ... }, index=index) + >>> + >>> # Apply outlier detection with Gaussian detrending + >>> dropper = DropQuantile( + ... upper_quantile=0.75, + ... lower_quantile=0.25, + ... n_iqr=1.5, + ... detrend_method="Gaussian" + ... ) + >>> filtered_data = dropper.fit_transform(temp_data) + >>> + >>> # Check detected outliers + >>> print(f"Outliers in Temp_1: {filtered_data['Temp_1'].isna().sum()}") + >>> print(f"Outliers in Temp_2: {filtered_data['Temp_2'].isna().sum()}") + + Simple quantile-based filtering without detrending: + + >>> # Remove extreme values (top 5% and bottom 5%) + >>> simple_dropper = DropQuantile( + ... upper_quantile=0.95, + ... lower_quantile=0.05 + ... ) + >>> filtered_simple = simple_dropper.fit_transform(temp_data) + + Notes + ----- + - When using `n_iqr`, it is conventional to set `upper_quantile=0.75` and + `lower_quantile=0.25` (the first and third quartiles). Other quantile + values will trigger a warning. + + - The `detrend_method` parameter is critical for time series with trends + or seasonality. Without detrending, quantile thresholds may incorrectly + flag normal seasonal variations as outliers. Consider using: + + * 'Gaussian' for smooth trends + * 'MovingAverage' for local detrending + * 'Polynomial' for polynomial trends + + - If `detrend_method=None`, the method operates on raw values, which may + be appropriate only for stationary data without seasonal patterns. 
+ + - The transformation replaces outliers with `np.nan` rather than removing + rows, preserving the time series structure for subsequent processing. + + See Also + -------- + pandas.DataFrame.quantile : Compute quantiles of DataFrame columns. + scipy.signal.detrend : Remove linear trend from data. + + References + ---------- + .. [1] Tukey, J. W. (1977). Exploratory Data Analysis. Pearson. + """ + def __init__( + self, + upper_quantile: float = 1.0, + lower_quantile: float = 0.0, + n_iqr: float = None, + detrend_method: str = None, + method_args: dict = None, + ): + super().__init__() + self.upper_quantile = upper_quantile + self.lower_quantile = lower_quantile + self.n_iqr = n_iqr + self.detrend_method = detrend_method + self.method_args = method_args + + if self.n_iqr and (self.upper_quantile != 0.75 or self.lower_quantile != 0.25): + warnings.warn("n_iqr is tipicaly used with quantile of 0.25 et 0.75") + + def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None): + pass + + def _transform_implementation(self, X: pd.Series | pd.DataFrame): + x = X.copy() + if self.detrend_method and self.detrend_method in QUANTILE_METHODS.keys(): + try: + method, kwargs = QUANTILE_METHODS[self.detrend_method] + except KeyError: + raise NotImplementedError( + f"The method {self.detrend_method} is not yet implmented" + ) + + if self.method_args: + kwargs.update(self.method_args) + + residue = X - X.apply(partial(method, **kwargs)) + else: + residue = X.copy() + + for col in x: + q_low = np.quantile(residue[col], self.lower_quantile) + q_up = np.quantile(residue[col], self.upper_quantile) + if self.n_iqr: + iqr = q_up - q_low + q_low -= self.n_iqr * iqr + q_up += self.n_iqr * iqr + + mask = (residue[col] < q_low) | (residue[col] > q_up) + x.loc[mask, col] = np.nan + + return x From 5b67b848cc29844cfd121e03d3ab15f19e7ab610 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 24 Nov 2025 12:20:58 +0100 Subject: [PATCH 2/6] =?UTF-8?q?=F0=9F=9A=A8?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_processing.py | 7 +++++-- tide/processing.py | 42 ++++++++++++++++------------------------ 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/tests/test_processing.py b/tests/test_processing.py index d8f0870..a785618 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -38,7 +38,7 @@ KeepColumns, ReplaceTag, AddFourierPairs, - DropQuantile + DropQuantile, ) RESOURCES_PATH = Path(__file__).parent / "resources" @@ -1135,7 +1135,10 @@ def test_drop_quantile(self): ) dropper = DropQuantile( - upper_quantile=0.75, lower_quantile=0.25, n_iqr=1.5, detrend_method="Gaussian" + upper_quantile=0.75, + lower_quantile=0.25, + n_iqr=1.5, + detrend_method="Gaussian", ) quant_filtered = dropper.fit_transform(toy_df) diff --git a/tide/processing.py b/tide/processing.py index 2d52213..5681c41 100644 --- a/tide/processing.py +++ b/tide/processing.py @@ -3079,29 +3079,34 @@ class DropQuantile(BaseProcessing): >>> from datetime import timedelta >>> >>> # Create toy temperature dataset with daily and semi-daily patterns - >>> index = pd.date_range("2009-01-01", "2009-01-01 23:00:00", - ... freq="15min", tz="UTC") - >>> t_seconds = np.arange(0, 24*3600 + 1, 15*60) + >>> index = pd.date_range( + ... "2009-01-01", "2009-01-01 23:00:00", freq="15min", tz="UTC" + ... ) + >>> t_seconds = np.arange(0, 23 * 3600 + 1, 15 * 60) >>> >>> # Daily sinusoidal pattern - >>> daily_pattern = 5 * np.sin(2 * np.pi * t_seconds / (24*3600)) + >>> daily_pattern = 5 * np.sin(2 * np.pi * t_seconds / (24 * 3600)) >>> >>> # Semi-daily sinusoidal pattern - >>> semidaily_pattern = 5 * np.sin(2 * np.pi * t_seconds / (12*3600)) + >>> semidaily_pattern = 5 * np.sin(2 * np.pi * t_seconds / (12 * 3600)) >>> >>> # Add random noise >>> rng = np.random.default_rng(42) - >>> temp_data = pd.DataFrame({ - ... "Temp_1": daily_pattern + rng.standard_normal(len(daily_pattern)), - ... 
"Temp_2": semidaily_pattern + 2 * rng.standard_normal(len(semidaily_pattern)) - ... }, index=index) + >>> temp_data = pd.DataFrame( + ... { + ... "Temp_1": daily_pattern + rng.standard_normal(len(daily_pattern)), + ... "Temp_2": semidaily_pattern + ... + 2 * rng.standard_normal(len(semidaily_pattern)), + ... }, + ... index=index, + ... ) >>> >>> # Apply outlier detection with Gaussian detrending >>> dropper = DropQuantile( ... upper_quantile=0.75, ... lower_quantile=0.25, ... n_iqr=1.5, - ... detrend_method="Gaussian" + ... detrend_method="Gaussian", ... ) >>> filtered_data = dropper.fit_transform(temp_data) >>> @@ -3109,15 +3114,6 @@ class DropQuantile(BaseProcessing): >>> print(f"Outliers in Temp_1: {filtered_data['Temp_1'].isna().sum()}") >>> print(f"Outliers in Temp_2: {filtered_data['Temp_2'].isna().sum()}") - Simple quantile-based filtering without detrending: - - >>> # Remove extreme values (top 5% and bottom 5%) - >>> simple_dropper = DropQuantile( - ... upper_quantile=0.95, - ... lower_quantile=0.05 - ... ) - >>> filtered_simple = simple_dropper.fit_transform(temp_data) - Notes ----- - When using `n_iqr`, it is conventional to set `upper_quantile=0.75` and @@ -3141,12 +3137,8 @@ class DropQuantile(BaseProcessing): See Also -------- pandas.DataFrame.quantile : Compute quantiles of DataFrame columns. - scipy.signal.detrend : Remove linear trend from data. - - References - ---------- - .. [1] Tukey, J. W. (1977). Exploratory Data Analysis. Pearson. 
""" + def __init__( self, upper_quantile: float = 1.0, @@ -3175,7 +3167,7 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): method, kwargs = QUANTILE_METHODS[self.detrend_method] except KeyError: raise NotImplementedError( - f"The method {self.detrend_method} is not yet implmented" + f"The method {self.detrend_method} is not yet implemented" ) if self.method_args: From ef8b9062fe509d0724b89710b5643e47c64612ee Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 24 Nov 2025 12:59:20 +0100 Subject: [PATCH 3/6] =?UTF-8?q?=E2=9C=A8=20Linear=20detrending=20added=20t?= =?UTF-8?q?o=20DropQuantile?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_processing.py | 33 +++++++++++++++++++++++++++++++++ tide/processing.py | 19 ++++++++++++++----- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/tests/test_processing.py b/tests/test_processing.py index a785618..83b4fb5 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -1197,3 +1197,36 @@ def test_drop_quantile(self): ["2009-01-01 03:30:00+00:00"], dtype="datetime64[ns, UTC]", freq=None ), ) + + # Linear trend + + index = pd.date_range( + "2009-01-01", "2009-01-01 23:00:00", freq="15min", tz="UTC" + ) + + rng = np.random.default_rng(42) + + toy_df = pd.DataFrame( + { + "Temp_1": np.linspace(0, 10, len(index)) + + rng.standard_normal(len(index)), + "Temp_2": np.linspace(5, 10, len(index)) + + rng.standard_normal(len(index)), + }, + index=index, + ) + + filter_detrend = DropQuantile( + upper_quantile=0.75, + lower_quantile=0.25, + n_iqr=1.5, + detrend_method="Detrend", + ) + filtered = filter_detrend.fit_transform(toy_df) + + pd.testing.assert_index_equal( + filtered["Temp_2"][filtered["Temp_2"].isna()].index, + pd.DatetimeIndex( + ["2009-01-01 11:30:00+00:00"], dtype="datetime64[ns, UTC]", freq=None + ), + ) diff --git a/tide/processing.py b/tide/processing.py index 5681c41..3b6fab1 100644 --- a/tide/processing.py +++ 
b/tide/processing.py @@ -7,6 +7,7 @@ from sklearn.utils.validation import check_is_fitted from scipy.ndimage import gaussian_filter1d +from scipy.signal import detrend from tide.base import BaseProcessing, BaseFiller, BaseOikoMeteo from tide.math import time_gradient @@ -3022,7 +3023,8 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): QUANTILE_METHODS = { - "Gaussian": (gaussian_filter1d, dict(sigma=5, truncate=5, mode="wrap")) + "Gaussian": (gaussian_filter1d, dict(sigma=5, truncate=5, mode="wrap")), + "Detrend": (detrend, dict(type="linear")), } @@ -3051,7 +3053,8 @@ class DropQuantile(BaseProcessing): detrend_method : str, optional Name of the detrending method to apply before computing quantiles. Available methods depend on QUANTILE_METHODS dictionary. - At the moment only 'Gaussian' is available. + At the moment only 'Gaussian' and 'Detrend' are available. + 'Detrend' removes a linear or constant trend. See scipy.signal.detrend. If None, quantiles are computed directly on raw data. method_args : dict, optional Additional keyword arguments to pass to the detrending method. @@ -3125,8 +3128,7 @@ class DropQuantile(BaseProcessing): flag normal seasonal variations as outliers. Consider using: * 'Gaussian' for smooth trends - * 'MovingAverage' for local detrending - * 'Polynomial' for polynomial trends + * 'Detrend' for linear detrending - If `detrend_method=None`, the method operates on raw values, which may be appropriate only for stationary data without seasonal patterns. 
@@ -3173,10 +3175,17 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): if self.method_args: kwargs.update(self.method_args) - residue = X - X.apply(partial(method, **kwargs)) + # Separate methods that return the trend from the ones that return + # the detrended series + if self.detrend_method in ["Gaussian"]: + residue = X - X.apply(partial(method, **kwargs)) + else: + residue = X.apply(partial(method, **kwargs)) else: residue = X.copy() + assert True + for col in x: q_low = np.quantile(residue[col], self.lower_quantile) q_up = np.quantile(residue[col], self.upper_quantile) From 430a78fece5a625be30240f8aeec8d5f3f65ee95 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 24 Nov 2025 13:03:54 +0100 Subject: [PATCH 4/6] =?UTF-8?q?=F0=9F=93=9D=20DropQuantile?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/api_reference/processing.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/api_reference/processing.rst b/docs/api_reference/processing.rst index a4e5dc9..7460c21 100644 --- a/docs/api_reference/processing.rst +++ b/docs/api_reference/processing.rst @@ -114,3 +114,7 @@ The processing module provides transformers for data processing and manipulation .. autoclass:: tide.processing.AddFourierPairs :members: :show-inheritance: + +.. 
autoclass:: tide.processing.DropQuantile + :members: + :show-inheritance: From f879f3375e5abfdfbb63c4ef6958b7b72516dc86 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 24 Nov 2025 15:19:56 +0100 Subject: [PATCH 5/6] =?UTF-8?q?=F0=9F=92=9A=20min=20pystan=20for=20prophet?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements/install-min.txt | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/requirements/install-min.txt b/requirements/install-min.txt index dbaf6f6..76998be 100644 --- a/requirements/install-min.txt +++ b/requirements/install-min.txt @@ -1,10 +1,16 @@ -numpy==1.22.4 -pandas==2.0.0 -scipy==1.9.1 -bigtree==0.21.3 +# minimal working environment (stable with Prophet 1.1.5 + cmdstanpy) +numpy==1.25.2 +pandas==2.1.4 +scipy==1.10.1 scikit-learn==1.2.2 statsmodels==0.14.4 -matplotlib==3.5.1 -plotly==5.3.1 -requests==2.32.3 -prophet==1.1.6 \ No newline at end of file +matplotlib==3.7.1 +plotly==5.13.1 +requests==2.31.0 +bigtree==0.21.3 + +# prophet pinned to stable pre-1.1.6 behavior +prophet==1.1.5 + +# make sure cmdstan backend is available and chosen by Prophet +cmdstanpy==1.2.0 From a4b50dc71625dd5fc719f8ca5a34a2a874dee912 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 24 Nov 2025 15:51:36 +0100 Subject: [PATCH 6/6] =?UTF-8?q?=F0=9F=92=9A=20min=20pystan=20for=20prophet?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 30920c4..1761fb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ "plotly>=5.3.1", "requests>=2.32.3", "influxdb-client>=1.48.0", - "prophet>=1.1.6", + "prophet>=1.1.5", ] [project.optional-dependencies]