4 changes: 4 additions & 0 deletions docs/api_reference/processing.rst
@@ -114,3 +114,7 @@ The processing module provides transformers for data processing and manipulation
.. autoclass:: tide.processing.AddFourierPairs
:members:
:show-inheritance:

.. autoclass:: tide.processing.DropQuantile
:members:
:show-inheritance:
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -30,7 +30,7 @@ dependencies = [
"plotly>=5.3.1",
"requests>=2.32.3",
"influxdb-client>=1.48.0",
"prophet>=1.1.6",
"prophet>=1.1.5",
]

[project.optional-dependencies]
22 changes: 14 additions & 8 deletions requirements/install-min.txt
@@ -1,10 +1,16 @@
numpy==1.22.4
pandas==2.0.0
scipy==1.9.1
bigtree==0.21.3
# minimal working environment (stable with Prophet 1.1.5 + cmdstanpy)
numpy==1.25.2
pandas==2.1.4
scipy==1.10.1
scikit-learn==1.2.2
statsmodels==0.14.4
matplotlib==3.5.1
plotly==5.3.1
requests==2.32.3
prophet==1.1.6
matplotlib==3.7.1
plotly==5.13.1
requests==2.31.0
bigtree==0.21.3

# prophet pinned to stable pre-1.1.6 behavior
prophet==1.1.5

# make sure cmdstan backend is available and chosen by Prophet
cmdstanpy==1.2.0
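
A minimal sketch of how one might verify that the pinned environment above resolves correctly (assumes the packages listed in this file are installed; the expected versions come from the pins, and the backend check is an assumption, not part of the PR):

# Sanity check for the pinned Prophet/cmdstanpy environment.
from importlib.metadata import version

import cmdstanpy
from prophet import Prophet

print("prophet", version("prophet"))        # expected: 1.1.5
print("cmdstanpy", cmdstanpy.__version__)   # expected: 1.2.0

# Instantiating a model should be enough to make Prophet load its Stan
# backend; with these pins it should pick up cmdstanpy without error.
Prophet()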
124 changes: 124 additions & 0 deletions tests/test_processing.py
Expand Up @@ -38,6 +38,7 @@
KeepColumns,
ReplaceTag,
AddFourierPairs,
DropQuantile,
)

RESOURCES_PATH = Path(__file__).parent / "resources"
@@ -1106,3 +1107,126 @@ def test_add_fourier_pairs(self):
"1 day, 0:00:00_order_1_Sine__W__BLOCK__SUB_BLOCK",
"1 day, 0:00:00_order_1_Cosine__W__BLOCK__SUB_BLOCK",
]

def test_drop_quantile(self):
index = pd.date_range(
"2009-01-01", "2009-01-01 23:00:00", freq="15min", tz="UTC"
)
cumsum_second = np.arange(
start=0, stop=(index[-1] - index[0]).total_seconds() + 1, step=15 * 60
)

daily = 5 * np.sin(
2 * np.pi / dt.timedelta(days=1).total_seconds() * cumsum_second
)

twice_daily = 5 * np.sin(
2 * np.pi / dt.timedelta(hours=12).total_seconds() * cumsum_second
)

rng = np.random.default_rng(42)

toy_df = pd.DataFrame(
{
"Temp_1": daily + rng.standard_normal(daily.shape[0]),
"Temp_2": twice_daily + 2 * rng.standard_normal(twice_daily.shape[0]),
},
index=index,
)

dropper = DropQuantile(
upper_quantile=0.75,
lower_quantile=0.25,
n_iqr=1.5,
detrend_method="Gaussian",
)

quant_filtered = dropper.fit_transform(toy_df)

ref_temp_1 = pd.Series(
[np.nan],
pd.DatetimeIndex(
[pd.Timestamp("2009-01-01 07:30:00+00:00", tz="UTC")], freq="15min"
),
name="Temp_1",
)

ref_temp_2 = pd.Series(
[np.nan],
pd.DatetimeIndex(
[pd.Timestamp("2009-01-01 11:30:00+0000", tz="UTC")], freq="15min"
),
name="Temp_2",
)

pd.testing.assert_series_equal(
quant_filtered["Temp_1"][quant_filtered["Temp_1"].isna()],
ref_temp_1,
)

pd.testing.assert_series_equal(
quant_filtered["Temp_2"][quant_filtered["Temp_2"].isna()],
ref_temp_2,
)

# No filtering method
toy_df = pd.DataFrame(
{
"Noise_1": rng.standard_normal(daily.shape[0]),
"Noise_2": rng.standard_normal(twice_daily.shape[0]),
},
index=index,
)

dropper = DropQuantile(upper_quantile=0.75, lower_quantile=0.25, n_iqr=1.5)

filtered_noise = dropper.fit_transform(toy_df)

pd.testing.assert_index_equal(
filtered_noise["Noise_1"][filtered_noise["Noise_1"].isna()].index,
pd.DatetimeIndex(
["2009-01-01 10:00:00+00:00", "2009-01-01 15:30:00+00:00"],
dtype="datetime64[ns, UTC]",
freq=None,
),
)

pd.testing.assert_index_equal(
filtered_noise["Noise_2"][filtered_noise["Noise_2"].isna()].index,
pd.DatetimeIndex(
["2009-01-01 03:30:00+00:00"], dtype="datetime64[ns, UTC]", freq=None
),
)

# Linear trend

index = pd.date_range(
"2009-01-01", "2009-01-01 23:00:00", freq="15min", tz="UTC"
)

rng = np.random.default_rng(42)

toy_df = pd.DataFrame(
{
"Temp_1": np.linspace(0, 10, len(index))
+ rng.standard_normal(len(index)),
"Temp_2": np.linspace(5, 10, len(index))
+ rng.standard_normal(len(index)),
},
index=index,
)

filter_detrend = DropQuantile(
upper_quantile=0.75,
lower_quantile=0.25,
n_iqr=1.5,
detrend_method="Detrend",
)
filtered = filter_detrend.fit_transform(toy_df)

pd.testing.assert_index_equal(
filtered["Temp_2"][filtered["Temp_2"].isna()].index,
pd.DatetimeIndex(
["2009-01-01 11:30:00+00:00"], dtype="datetime64[ns, UTC]", freq=None
),
)
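
A minimal sketch for running only the new test, assuming pytest and a development install of tide, with the test path as it appears in this PR:

# Run only the DropQuantile test from the repository root.
import pytest

pytest.main(["tests/test_processing.py", "-k", "test_drop_quantile", "-q"])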
180 changes: 180 additions & 0 deletions tide/processing.py
@@ -1,11 +1,13 @@
import pandas as pd
import numpy as np
import datetime as dt
import warnings
from functools import partial
from collections.abc import Callable

from sklearn.utils.validation import check_is_fitted
from scipy.ndimage import gaussian_filter1d
from scipy.signal import detrend

from tide.base import BaseProcessing, BaseFiller, BaseOikoMeteo
from tide.math import time_gradient
@@ -3018,3 +3020,181 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):
)

return X


QUANTILE_METHODS = {
"Gaussian": (gaussian_filter1d, dict(sigma=5, truncate=5, mode="wrap")),
"Detrend": (detrend, dict(type="linear")),
}


class DropQuantile(BaseProcessing):
"""
Filter outliers in time series data using quantile-based thresholds.

This processor identifies and replaces outlier values with NaN based on
quantile boundaries. It can optionally apply detrending methods before
computing quantiles, making it robust against seasonal patterns or trends.

Parameters
----------
upper_quantile : float, default=1.0
Upper quantile threshold for outlier detection. Values above this
quantile are considered potential outliers. Must be in range [0, 1].
lower_quantile : float, default=0.0
Lower quantile threshold for outlier detection. Values below this
quantile are considered potential outliers. Must be in range [0, 1].
n_iqr : float, optional
Number of interquartile ranges (IQR) to extend beyond the quantile
thresholds. When specified, the bounds are adjusted as:
lower_bound = q_low - n_iqr * IQR
upper_bound = q_up + n_iqr * IQR
        Commonly used with quantiles of 0.25 and 0.75 (the first and third quartiles).
detrend_method : str, optional
Name of the detrending method to apply before computing quantiles.
        Available methods are defined in the QUANTILE_METHODS dictionary.
        At the moment only 'Gaussian' and 'Detrend' are available.
        'Detrend' performs linear or constant detrending; see
        scipy.signal.detrend. If None, quantiles are computed directly on
        the raw data.
method_args : dict, optional
Additional keyword arguments to pass to the detrending method.
        For example: {'sigma': 10} to widen the 'Gaussian' smoothing filter.

Attributes
----------
upper_quantile : float
Stored upper quantile threshold.
lower_quantile : float
Stored lower quantile threshold.
n_iqr : float or None
Stored IQR multiplier.
detrend_method : str or None
Stored detrending method name.
method_args : dict or None
Stored detrending method arguments.

Examples
--------
Filter temperature data with outliers using IQR-based detection:

>>> import pandas as pd
>>> import numpy as np
>>> from datetime import timedelta
>>>
>>> # Create toy temperature dataset with daily and semi-daily patterns
>>> index = pd.date_range(
... "2009-01-01", "2009-01-01 23:00:00", freq="15min", tz="UTC"
... )
    >>> t_seconds = np.arange(0, 23 * 3600 + 1, 15 * 60)
>>>
>>> # Daily sinusoidal pattern
>>> daily_pattern = 5 * np.sin(2 * np.pi * t_seconds / (24 * 3600))
>>>
>>> # Semi-daily sinusoidal pattern
>>> semidaily_pattern = 5 * np.sin(2 * np.pi * t_seconds / (12 * 3600))
>>>
>>> # Add random noise
>>> rng = np.random.default_rng(42)
>>> temp_data = pd.DataFrame(
... {
... "Temp_1": daily_pattern + rng.standard_normal(len(daily_pattern)),
... "Temp_2": semidaily_pattern
... + 2 * rng.standard_normal(len(semidaily_pattern)),
... },
... index=index,
... )
>>>
>>> # Apply outlier detection with Gaussian detrending
>>> dropper = DropQuantile(
... upper_quantile=0.75,
... lower_quantile=0.25,
... n_iqr=1.5,
... detrend_method="Gaussian",
... )
>>> filtered_data = dropper.fit_transform(temp_data)
>>>
>>> # Check detected outliers
>>> print(f"Outliers in Temp_1: {filtered_data['Temp_1'].isna().sum()}")
>>> print(f"Outliers in Temp_2: {filtered_data['Temp_2'].isna().sum()}")

Notes
-----
- When using `n_iqr`, it is conventional to set `upper_quantile=0.75` and
`lower_quantile=0.25` (the first and third quartiles). Other quantile
values will trigger a warning.

- The `detrend_method` parameter is critical for time series with trends
or seasonality. Without detrending, quantile thresholds may incorrectly
flag normal seasonal variations as outliers. Consider using:

* 'Gaussian' for smooth trends
      * 'Detrend' for linear detrending

- If `detrend_method=None`, the method operates on raw values, which may
be appropriate only for stationary data without seasonal patterns.

- The transformation replaces outliers with `np.nan` rather than removing
rows, preserving the time series structure for subsequent processing.

See Also
--------
pandas.DataFrame.quantile : Compute quantiles of DataFrame columns.
"""

def __init__(
self,
upper_quantile: float = 1.0,
lower_quantile: float = 0.0,
        n_iqr: float | None = None,
        detrend_method: str | None = None,
        method_args: dict | None = None,
):
super().__init__()
self.upper_quantile = upper_quantile
self.lower_quantile = lower_quantile
self.n_iqr = n_iqr
self.detrend_method = detrend_method
self.method_args = method_args

if self.n_iqr and (self.upper_quantile != 0.75 or self.lower_quantile != 0.25):
warnings.warn("n_iqr is tipicaly used with quantile of 0.25 et 0.75")

def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None):
pass

def _transform_implementation(self, X: pd.Series | pd.DataFrame):
x = X.copy()
        if self.detrend_method:
            try:
                method, kwargs = QUANTILE_METHODS[self.detrend_method]
            except KeyError:
                raise NotImplementedError(
                    f"The method {self.detrend_method} is not yet implemented"
                )

            # Copy the default kwargs before merging user arguments so the
            # shared QUANTILE_METHODS entries are not mutated in place.
            kwargs = dict(kwargs)
            if self.method_args:
                kwargs.update(self.method_args)

            # Separate methods that return the trend ('Gaussian') from the
            # ones that return the detrended series ('Detrend')
            if self.detrend_method in ["Gaussian"]:
                residue = X - X.apply(partial(method, **kwargs))
            else:
                residue = X.apply(partial(method, **kwargs))
        else:
            residue = X.copy()

for col in x:
q_low = np.quantile(residue[col], self.lower_quantile)
q_up = np.quantile(residue[col], self.upper_quantile)
if self.n_iqr:
iqr = q_up - q_low
q_low -= self.n_iqr * iqr
q_up += self.n_iqr * iqr

mask = (residue[col] < q_low) | (residue[col] > q_up)
x.loc[mask, col] = np.nan

return x
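
For reference, the per-column filtering above reduces to Tukey-style fences on the residual series. A minimal standalone sketch with hypothetical data, mirroring the bound computation in _transform_implementation (only numpy and pandas, no tide imports):

import numpy as np
import pandas as pd

# Residual series with one obvious outlier injected at position 10.
rng = np.random.default_rng(0)
residual = pd.Series(rng.standard_normal(96))
residual.iloc[10] = 8.0

lower_quantile, upper_quantile, n_iqr = 0.25, 0.75, 1.5
q_low = np.quantile(residual, lower_quantile)
q_up = np.quantile(residual, upper_quantile)
iqr = q_up - q_low
q_low -= n_iqr * iqr
q_up += n_iqr * iqr

# Values outside [q_low, q_up] are what DropQuantile would replace with NaN.
mask = (residual < q_low) | (residual > q_up)
print(residual[mask])  # should include the injected outlier at position 10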