diff --git a/.gitignore b/.gitignore index c66abc3dc..47f061709 100644 --- a/.gitignore +++ b/.gitignore @@ -125,3 +125,7 @@ certificates/ # Benchmark outputs benchmark_results*/ + +# Mlflow +mlflow +mlflow_artifacts_local \ No newline at end of file diff --git a/packages/openstef-beam/src/openstef_beam/backtesting/backtest_pipeline.py b/packages/openstef-beam/src/openstef_beam/backtesting/backtest_pipeline.py index 65a8393af..597f6ac5f 100644 --- a/packages/openstef-beam/src/openstef_beam/backtesting/backtest_pipeline.py +++ b/packages/openstef-beam/src/openstef_beam/backtesting/backtest_pipeline.py @@ -180,7 +180,6 @@ def run( return TimeSeriesDataset( data=pd.concat([pred.data for pred in prediction_list], axis=0), - sample_interval=self.config.prediction_sample_interval, ) def _process_train_event(self, event: BacktestEvent, dataset: VersionedTimeSeriesDataset) -> None: diff --git a/packages/openstef-beam/tests/unit/backtesting/test_backtest_pipeline.py b/packages/openstef-beam/tests/unit/backtesting/test_backtest_pipeline.py index 9e68fd5a1..cfb1b6778 100644 --- a/packages/openstef-beam/tests/unit/backtesting/test_backtest_pipeline.py +++ b/packages/openstef-beam/tests/unit/backtesting/test_backtest_pipeline.py @@ -139,8 +139,7 @@ def test_run_training_scenarios( # Assert assert isinstance(result, TimeSeriesDataset) - assert result.sample_interval == forecaster_config.predict_sample_interval - + assert result.sample_interval == timedelta(hours=6) # Validate call counts if expected_train_calls == ">0": assert mock_forecaster.train_call_count > 0 @@ -245,7 +244,7 @@ def create_prediction(data: RestrictedHorizonVersionedTimeSeries) -> TimeSeriesD ) # Assert - Basic structure - assert result.sample_interval == mock_forecaster.config.predict_sample_interval + assert result.sample_interval == timedelta(hours=6) assert mock_forecaster.predict_call_count >= 2 # Assert - Output validation @@ -352,18 +351,20 @@ def test_run_edge_cases( timestamps = 
pd.date_range("2025-01-01T12:00:00", "2025-01-01T15:00:00", freq="1h") start_time = "2025-01-01T12:00:00" end_time = "2025-01-01T15:00:00" + sample_interval = timedelta(hours=1) else: # sparse timestamps = pd.DatetimeIndex(["2025-01-01T12:00:00", "2025-01-01T18:00:00"]) start_time = "2025-01-01T18:00:00" end_time = "2025-01-01T20:00:00" + sample_interval = timedelta(hours=6) ground_truth = VersionedTimeSeriesDataset.from_dataframe( data=pd.DataFrame({"available_at": timestamps, "target": range(len(timestamps))}, index=timestamps), - sample_interval=timedelta(hours=1), + sample_interval=sample_interval, ) predictors = VersionedTimeSeriesDataset.from_dataframe( data=pd.DataFrame({"available_at": timestamps, "feature1": range(len(timestamps))}, index=timestamps), - sample_interval=timedelta(hours=1), + sample_interval=sample_interval, ) # Act diff --git a/packages/openstef-beam/tests/unit/backtesting/test_batch_prediction.py b/packages/openstef-beam/tests/unit/backtesting/test_batch_prediction.py index 09e6ac4f0..1863d8ec6 100644 --- a/packages/openstef-beam/tests/unit/backtesting/test_batch_prediction.py +++ b/packages/openstef-beam/tests/unit/backtesting/test_batch_prediction.py @@ -55,7 +55,6 @@ def predict(self, data: RestrictedHorizonVersionedTimeSeries) -> TimeSeriesDatas timestamps = pd.date_range(start=data.horizon, periods=2, freq="1h") return TimeSeriesDataset( data=pd.DataFrame({"quantile_P50": [0.5, 0.5]}, index=timestamps), - sample_interval=self.config.predict_sample_interval, ) def predict_batch(self, batch: list[RestrictedHorizonVersionedTimeSeries]) -> list[TimeSeriesDataset]: @@ -66,7 +65,6 @@ def predict_batch(self, batch: list[RestrictedHorizonVersionedTimeSeries]) -> li results.append( TimeSeriesDataset( data=pd.DataFrame({"quantile_P50": [0.5, 0.5]}, index=timestamps), - sample_interval=self.config.predict_sample_interval, ) ) return results diff --git a/packages/openstef-beam/tests/unit/benchmarking/test_target_provider.py 
b/packages/openstef-beam/tests/unit/benchmarking/test_target_provider.py index 1692f92b4..61b7f1b09 100644 --- a/packages/openstef-beam/tests/unit/benchmarking/test_target_provider.py +++ b/packages/openstef-beam/tests/unit/benchmarking/test_target_provider.py @@ -82,13 +82,13 @@ def test_get_predictors_for_target(tmp_path: Path, test_target: BenchmarkTarget) interval = timedelta(hours=1) weather = VersionedTimeSeriesDataset.from_dataframe( - pd.DataFrame({"temp": range(3), "available_at": index}, index=index), interval + pd.DataFrame({"temp": range(3), "available_at": index}, index=index), sample_interval=interval ) profiles = VersionedTimeSeriesDataset.from_dataframe( - pd.DataFrame({"prof": range(3), "available_at": index}, index=index), interval + pd.DataFrame({"prof": range(3), "available_at": index}, index=index), sample_interval=interval ) prices = VersionedTimeSeriesDataset.from_dataframe( - pd.DataFrame({"price": range(3), "available_at": index}, index=index), interval + pd.DataFrame({"price": range(3), "available_at": index}, index=index), sample_interval=interval ) class TestProvider(SimpleTargetProvider[BenchmarkTarget, None]): diff --git a/packages/openstef-core/src/openstef_core/datasets/timeseries_dataset.py b/packages/openstef-core/src/openstef_core/datasets/timeseries_dataset.py index d4dfb63d2..c316ba7c5 100644 --- a/packages/openstef-core/src/openstef_core/datasets/timeseries_dataset.py +++ b/packages/openstef-core/src/openstef_core/datasets/timeseries_dataset.py @@ -99,7 +99,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor def __init__( self, data: pd.DataFrame, - sample_interval: timedelta = timedelta(minutes=15), + sample_interval: timedelta | None = None, *, horizon_column: str = "horizon", available_at_column: str = "available_at", @@ -122,10 +122,27 @@ def __init__( Raises: TypeError: If data index is not a pandas DatetimeIndex or if versioning columns have incorrect types. 
+ ValueError: If data frequency does not match sample_interval. """ if not isinstance(data.index, pd.DatetimeIndex): raise TypeError("Data index must be a pandas DatetimeIndex.") + if sample_interval is None: + inferred_freq = pd.Timedelta( + self._infer_frequency(data.index) if data.index.freq is None else data.index.freq # type: ignore + ) + sample_interval = inferred_freq.to_pytimedelta() + + # Check input data frequency matches sample_interval, only if there are enough data points to infer frequency + minimum_required_length = 2 + if len(data) >= minimum_required_length: + input_sample_interval = self._infer_frequency(data.index) if data.index.freq is None else data.index.freq + if input_sample_interval != sample_interval: + msg = ( + f"Data frequency ({input_sample_interval}) does not match the sample_interval ({sample_interval})." + ) + raise ValueError(msg) + self.data = data self.horizon_column = horizon_column self.available_at_column = available_at_column @@ -443,6 +460,44 @@ def copy_with(self, data: pd.DataFrame, *, is_sorted: bool = False) -> "TimeSeri is_sorted=is_sorted, ) + @staticmethod + def _infer_frequency(index: pd.DatetimeIndex) -> pd.Timedelta: + """Infer the frequency of a pandas DatetimeIndex if the freq attribute is not set. + + This method calculates the most common time difference between consecutive timestamps, + which is more permissive of missing chunks of data than the pandas infer_freq method. + + Args: + index (pd.DatetimeIndex): The datetime index to infer the frequency from. + + Returns: + pd.Timedelta: The inferred frequency as a pandas Timedelta. + + Raises: + ValueError: If the index has fewer than 2 timestamps. 
+ """ + minimum_required_length = 2 + if len(index) < minimum_required_length: + raise ValueError("Cannot infer frequency from an index with fewer than 2 timestamps.") + + # Calculate the differences between consecutive timestamps + deltas = index.to_series().drop_duplicates().sort_values().diff().dropna() + + # Find the most common difference + return deltas.mode().iloc[0] + + def _frequency_matches(self, index: pd.DatetimeIndex) -> bool: + """Check if the frequency of the data matches the model frequency. + + Args: + index (pd.DatetimeIndex): The data to check. + + Returns: + bool: True if the frequencies match, False otherwise. + """ + input_sample_interval = self._infer_frequency(index) if index.freq is None else index.freq + return input_sample_interval == self.sample_interval + def validate_horizons_present(dataset: TimeSeriesDataset, horizons: list[LeadTime]) -> None: """Validate that the specified forecast horizons are present in the dataset. diff --git a/packages/openstef-models/src/openstef_models/models/forecasting/median_forecaster.py b/packages/openstef-models/src/openstef_models/models/forecasting/median_forecaster.py new file mode 100644 index 000000000..b6449c78f --- /dev/null +++ b/packages/openstef-models/src/openstef_models/models/forecasting/median_forecaster.py @@ -0,0 +1,325 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +"""Median regressor based forecasting models for energy forecasting. + +Provides median regression models for multi-quantile energy forecasting. + +Note that this is a autoregressive model, meaning that it uses the previous + predictions to predict the next value. + + This regressor is good for predicting two types of signals: + - Signals with very slow dynamics compared to the sampling rate, possibly + with a lot of noise. 
+ - Signals that switch between two or more states, which random in nature or + depend on unknown features, but tend to be stable in each state. An example of + this may be waste heat delivered from an industrial process. Using a median + over the last few timesteps adds some hysterisis to avoid triggering on noise. + + Tips for using this regressor: + - Set the lags to be evenly spaced and at a frequency matching the + frequency of the input data. For example, if the input data is at 15 + minute intervals, set the lags to be at 15 minute intervals as well. + - Use a small training dataset, since there are no actual parameters to train. + - Set the frequency of the input data index to avoid inferring it. Inference might be + a problem if we get very small chunks of data in training or validation sets. + - Use only one training horizon, since the regressor will use the same lags for all + training horizons. +""" + +from datetime import timedelta +from typing import override + +import numpy as np +import pandas as pd +from pydantic import Field + +from openstef_core.datasets.validated_datasets import ForecastDataset, ForecastInputDataset +from openstef_core.mixins.predictor import HyperParams +from openstef_core.types import LeadTime, Quantile +from openstef_core.utils.pydantic import timedelta_from_isoformat +from openstef_models.explainability.mixins import ExplainableForecaster +from openstef_models.models.forecasting.forecaster import Forecaster, ForecasterConfig + + +class MedianForecasterHyperParams(HyperParams): + """Hyperparameter configuration for base case forecaster.""" + + primary_lag: timedelta = Field( + default=timedelta(days=7), + description="Primary lag to use for predictions (default: 7 days for weekly patterns)", + ) + fallback_lag: timedelta = Field( + default=timedelta(days=14), + description="Fallback lag to use when primary lag data is unavailable (default: 14 days)", + ) + + +class MedianForecasterConfig(ForecasterConfig): + """Configuration 
for base case forecaster.""" + + quantiles: list[Quantile] = Field( + default=[Quantile(0.5)], + description=( + "Probability levels for uncertainty estimation. Each quantile represents a confidence level " + "(e.g., 0.1 = 10th percentile, 0.5 = median, 0.9 = 90th percentile). " + "Models must generate predictions for all specified quantiles." + ), + min_length=1, + max_length=1, + ) + horizons: list[LeadTime] = Field( + default=..., + description=( + "Lead times for predictions, accounting for data availability and versioning cutoffs. " + "Each horizon defines how far ahead the model should predict." + ), + min_length=1, + max_length=1, + ) + + hyperparams: MedianForecasterHyperParams = Field( + default_factory=MedianForecasterHyperParams, + ) + + +class MedianForecaster(Forecaster, ExplainableForecaster): + """Median forecaster using lag features for predictions. + + This forecaster predicts the median value based on specified lag features. + It is particularly useful for signals with slow dynamics or state-switching behavior. + + Hyperparameters: + lags: List of time deltas representing the lag features to use for prediction. + These should be aligned with the data sampling frequency. + context_window: Time delta representing the context window size for input data. + This defines how much historical data is considered for making predictions. + + _config: MedianForecasterConfig + """ + + Config = MedianForecasterConfig + HyperParams = MedianForecasterHyperParams + + def __init__( + self, + config: MedianForecasterConfig, + ) -> None: + """Initialize the base case forecaster. + + Args: + config: Configuration specifying quantiles, horizons, and lag hyperparameters. + If None, uses default configuration with 7-day primary and 14-day fallback lags. 
+ """ + self._config = config + self.is_fitted_ = False + self.feature_names_: list[str] = [] + + @property + @override + def config(self) -> MedianForecasterConfig: + return self._config + + @property + @override + def hyperparams(self) -> MedianForecasterHyperParams: + return self._config.hyperparams + + @property + @override + def is_fitted(self) -> bool: + return self.is_fitted_ + + @property + @override + def feature_importances(self) -> pd.DataFrame: + + return pd.DataFrame( + data=self.feature_importances_, columns=[self.config.quantiles[0].format()], index=self.feature_names_ + ) + + @property + def frequency(self) -> timedelta: + """Retrieve the model input frequency. + + Returns: + The frequency of the model input + + """ + return self.frequency_ + + @staticmethod + def _fill_diagonal_with_median(lag_array: np.ndarray, start: int, end: int, median: float) -> np.ndarray | None: + # Use the calculated median to fill in future lag values where this prediction would be used as input. + + # If the start index is beyond the array bounds, no future updates are needed from this step. + if start >= lag_array.shape[0]: + return lag_array + + # Ensure the end index does not exceed the array bounds. + end = min(end, lag_array.shape[0]) + + # Get a view of the sub-array where the diagonal needs to be filled. + # The slice represents future time steps (rows) and corresponding lag features (columns). + # Rows: from 'start' up to (but not including) 'end' + # Columns: from 0 up to (but not including) 'end - start' + # This selects the part of the array where lag_array[start + k, k] resides for k in range(end - start). 
+ view = lag_array[start:end, 0 : (end - start)] + + # Create a mask for NaNs on the diagonal + diagonal_nan_mask = np.isnan(np.diag(view)) + + # Only update if there are NaNs on the diagonal + if np.any(diagonal_nan_mask): + # Create a temporary array to hold the new diagonal + updated_diagonal = np.diag(view).copy() + updated_diagonal[diagonal_nan_mask] = median + np.fill_diagonal(view, updated_diagonal) + return None + + @override + def predict(self, data: ForecastInputDataset) -> ForecastDataset: + """Predict the median of the lag features for each time step in the context window. + + Args: + data (ForecastInputDataset): The input data for prediction, + this should be a pandas dataframe with lag features. + + Returns: + np.array: The predicted median for each time step in the context window. + If any lag feature is NaN, this will be ignored. + If all lag features are NaN, the regressor will return NaN. + + Raises: + ValueError: If the input data is missing any of the required lag features. + AttributeError: If the model is not fitted yet. + """ + if not self.is_fitted: + msg = "This MedianForecaster instance is not fitted yet" + raise AttributeError(msg) + + input_data: pd.DataFrame = data.input_data(start=data.forecast_start) + + # Check that the input data contains the required lag features + missing_features = set(self.feature_names_) - set(data.feature_names) + if missing_features: + msg = f"The input data is missing the following lag features: {missing_features}" + raise ValueError(msg) + + # Reindex the input data to ensure there are no gaps in the time series. + # This is important for the autoregressive logic that follows. + # Create a new date range with the expected frequency. + new_index = pd.date_range(input_data.index[0], input_data.index[-1], freq=self.frequency_) + # Reindex the input DataFrame, filling any new timestamps with NaN. + # Select only the lag feature columns in the specified order. 
+ lag_df = input_data.reindex(new_index, fill_value=np.nan)[self.feature_names_] + + # Convert the lag DataFrame and its index to NumPy arrays for faster processing. + lag_array = lag_df.to_numpy() + # Initialize the prediction array with NaNs. + prediction = np.full(lag_array.shape[0], np.nan) + + # Calculate the time step size based on the model frequency. + step_size = self.frequency_ + # Determine the number of steps corresponding to the smallest and largest lags. + smallest_lag_steps = int(self.lags_to_time_deltas_[self.feature_names_[0]] / step_size) + largest_lag_steps = int(self.lags_to_time_deltas_[self.feature_names_[-1]] / step_size) + + # Iterate through each time step in the reindexed data. + for time_step in range(lag_array.shape[0]): + # Get the lag features for the current time step. + current_lags = lag_array[time_step] + # Calculate the median of the available lag features, ignoring NaNs. + median = np.nanmedian(current_lags) # type: ignore + # If the median calculation resulted in NaN (e.g., all lags were NaN), skip the autoregression step. + if not np.isnan(median): # type: ignore + median = float(median) # type: ignore + else: + continue + + # Store the calculated median in the prediction array. + prediction[time_step] = median + + # Auto-regressive step: update the lag array for future time steps. + # Calculate the start and end indices in the future time steps that will be affected. + start, end = ( + time_step + smallest_lag_steps, + time_step + largest_lag_steps + 1, + ) + self._fill_diagonal_with_median(lag_array, start, end, median) + + # Convert the prediction array back to a pandas DataFrame using the reindexed time index. + prediction_df = pd.DataFrame(prediction, index=lag_df.index.to_numpy(), columns=["median"]) + + # Reindex the prediction DataFrame back to the original input data index. 
+ prediction_df = prediction_df.reindex(input_data.index) + + return ForecastDataset( + data=prediction_df.dropna().rename(columns={"median": self.config.quantiles[0].format()}), # type: ignore + sample_interval=data.sample_interval, + forecast_start=data.forecast_start, + ) + + @override + def fit(self, data: ForecastInputDataset, data_val: ForecastInputDataset | None = None) -> None: + """Take care of fitting the median forecaster. + + This regressor does not need any fitting, + but it does need to know the feature names of the lag features and the order of these. + + Lag features are expected to be evently spaced and match the frequency of the input data. + The lag features are expected to be named in the format T- or T-d. + For example, T-1min, T-2min, T-3min or T-1d, T-2d. + + Which lag features are used is determined by the feature engineering step. + + Args: + data (ForecastInputDataset): The training data containing lag features. + data_val (ForecastInputDataset | None): Optional validation data, not used in this regressor. + + Raises: + ValueError: If the input data frequency does not match the model frequency. + ValueError: If no lag features are found in the input data. + + """ + self.frequency_ = data.sample_interval + + lag_prefix = f"{data.target_column}_lag_" + self.feature_names_ = [ + feature_name for feature_name in data.feature_names if feature_name.startswith(lag_prefix) + ] + + if not self.feature_names_: + msg = f"No lag features found in the input data with prefix '{lag_prefix}'." 
+ raise ValueError(msg) + + self.lags_to_time_deltas_ = { + feature_name: timedelta_from_isoformat(feature_name.replace(lag_prefix, "")) + for feature_name in self.feature_names_ + } + + # Check if lags are evenly spaced + lag_deltas = sorted(self.lags_to_time_deltas_.values()) + lag_intervals = [(lag_deltas[i] - lag_deltas[i - 1]).total_seconds() for i in range(1, len(lag_deltas))] + if not all(interval == lag_intervals[0] for interval in lag_intervals): + msg = ( + "Lag features are not evenly spaced. " + "Please ensure lag features are evenly spaced and match the data frequency." + ) + raise ValueError(msg) + + # Check that lag frequency matches data frequency + expected_lag_interval = lag_intervals[0] + if expected_lag_interval != self.frequency_.total_seconds(): + msg = ( + f"Lag feature interval ({timedelta(seconds=expected_lag_interval)}) does not match " + f"data frequency ({self.frequency_}). " + "Please ensure lag features match the data frequency." + ) + raise ValueError(msg) + + self.feature_names_ = sorted(self.feature_names_, key=lambda f: self.lags_to_time_deltas_[f]) + + self.feature_importances_ = np.ones(len(self.feature_names_)) / (len(self.feature_names_) or 1.0) + self.is_fitted_ = True diff --git a/packages/openstef-models/src/openstef_models/presets/forecasting_workflow.py b/packages/openstef-models/src/openstef_models/presets/forecasting_workflow.py index f74aefb14..bb0221b54 100644 --- a/packages/openstef-models/src/openstef_models/presets/forecasting_workflow.py +++ b/packages/openstef-models/src/openstef_models/presets/forecasting_workflow.py @@ -30,6 +30,7 @@ from openstef_models.models import ForecastingModel from openstef_models.models.forecasting.flatliner_forecaster import FlatlinerForecaster from openstef_models.models.forecasting.gblinear_forecaster import GBLinearForecaster +from openstef_models.models.forecasting.median_forecaster import MedianForecaster from openstef_models.models.forecasting.xgboost_forecaster import 
XGBoostForecaster from openstef_models.transforms.energy_domain import WindPowerFeatureAdder from openstef_models.transforms.general import ( @@ -101,9 +102,9 @@ class ForecastingWorkflowConfig(BaseConfig): # PredictionJob ) # Model configuration - model: Literal["xgboost", "gblinear", "flatliner"] = Field( + model: Literal["xgboost", "gblinear", "flatliner", "median"] = Field( description="Type of forecasting model to use." - ) # TODO(#652): Implement median forecaster + ) quantiles: list[Quantile] = Field( default=[Q(0.5)], description="List of quantiles to predict for probabilistic forecasting." ) @@ -375,6 +376,22 @@ def create_forecasting_workflow(config: ForecastingWorkflowConfig) -> CustomFore add_quantiles_from_std=False, ), ] + elif config.model == "median": + preprocessing = [ + LagsAdder( + history_available=config.predict_history, + horizons=config.horizons, + add_trivial_lags=True, + target_column=config.target_column, + ) + ] + forecaster = MedianForecaster( + config=MedianForecaster.Config( + quantiles=config.quantiles, + horizons=config.horizons, + ), + ) + postprocessing = [] elif config.model == "flatliner": preprocessing = [] forecaster = FlatlinerForecaster( diff --git a/packages/openstef-models/src/openstef_models/transforms/general/nan_dropper.py b/packages/openstef-models/src/openstef_models/transforms/general/nan_dropper.py index e9144bba9..cf85856f5 100644 --- a/packages/openstef-models/src/openstef_models/transforms/general/nan_dropper.py +++ b/packages/openstef-models/src/openstef_models/transforms/general/nan_dropper.py @@ -38,7 +38,7 @@ class NaNDropper(BaseConfig, TimeSeriesTransform): ... 'temperature': [20.0, 22.0, np.nan, 23.0], ... 'humidity': [60.0, 65.0, 70.0, 75.0] ... 
}, index=pd.date_range('2025-01-01', periods=4, freq='1h')) - >>> dataset = TimeSeriesDataset(data, timedelta(hours=1)) + >>> dataset = TimeSeriesDataset(data) >>> >>> # Drop rows with NaN in load or temperature >>> dropper = NaNDropper(selection=FeatureSelection(include=['load', 'temperature'])) diff --git a/packages/openstef-models/tests/unit/models/forecasting/test_median_forecaster.py b/packages/openstef-models/tests/unit/models/forecasting/test_median_forecaster.py new file mode 100644 index 000000000..07ca2dfe5 --- /dev/null +++ b/packages/openstef-models/tests/unit/models/forecasting/test_median_forecaster.py @@ -0,0 +1,459 @@ +# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 +import re +from datetime import timedelta + +import numpy as np +import pandas as pd +import pytest + +from openstef_core.datasets import ForecastInputDataset +from openstef_core.testing import create_timeseries_dataset +from openstef_core.types import LeadTime, Q +from openstef_models.models.forecasting.forecaster import ForecastDataset +from openstef_models.models.forecasting.median_forecaster import MedianForecaster, MedianForecasterConfig + + +def test_median_returns_median(): + # Arrange + index = pd.date_range("2020-01-01T00:00", periods=3, freq="h") + training_data = create_timeseries_dataset( + index=index, + load=[1.0, 4.0, 7.0], + load_lag_PT1H=[1.0, np.nan, np.nan], + load_lag_PT2H=[4.0, 1.0, np.nan], + load_lag_PT3H=[7.0, 4.0, 1.0], + ) + + training_input_data = ForecastInputDataset.from_timeseries( + dataset=training_data, + target_column="load", + forecast_start=index[0], + ) + + expected_result = ForecastDataset( + data=pd.DataFrame( + { + "quantile_P50": [4.0, 4.0, 4.0], + }, + index=index, + ), + sample_interval=training_input_data.sample_interval, + ) + expected_result.index.freq = None + + config = MedianForecasterConfig(quantiles=[Q(0.5)], horizons=[LeadTime.from_string("PT36H")]) + model = 
MedianForecaster(config=config) + + # Act + model.fit(training_input_data) + result = model.predict(training_input_data) + + # Assert + assert result.sample_interval == expected_result.sample_interval + pd.testing.assert_frame_equal(result.data, expected_result.data) + + +def test_median_handles_some_missing_data(): + # Arrange + index = pd.date_range("2023-01-01", periods=3, freq="h") + training_data = create_timeseries_dataset( + index=index, + load=[1.0, 2.0, 3.0], + load_lag_PT1H=[1.0, np.nan, np.nan], + load_lag_PT2H=[np.nan, 1.0, np.nan], + load_lag_PT3H=[3.0, np.nan, 1.0], + ) + + training_input_data = ForecastInputDataset.from_timeseries( + dataset=training_data, + target_column="load", + forecast_start=index[0], + ) + + expected_result = ForecastDataset( + data=pd.DataFrame( + { + "quantile_P50": [2.0, 1.5, 1.5], + }, + index=index, + ), + sample_interval=training_input_data.sample_interval, + ) + expected_result.index.freq = None + + config = MedianForecasterConfig(quantiles=[Q(0.5)], horizons=[LeadTime.from_string("PT3M")]) + model = MedianForecaster(config=config) + + # Act + model.fit(training_input_data) + result = model.predict(training_input_data) + + # Assert + assert result.sample_interval == expected_result.sample_interval + pd.testing.assert_frame_equal(result.data, expected_result.data) + + +def test_median_handles_missing_data_for_some_horizons(): + # Arrange + index = pd.date_range("2023-01-01", periods=3, freq="h") + training_data = create_timeseries_dataset( + index=index, + load=[5.0, 5.0, 5.0], + load_lag_PT1H=[5.0, np.nan, np.nan], + load_lag_PT2H=[np.nan, 5.0, np.nan], + load_lag_PT3H=[np.nan, np.nan, 5.0], + ) + + training_input_data = ForecastInputDataset.from_timeseries( + dataset=training_data, + target_column="load", + forecast_start=index[0], + ) + + expected_result = ForecastDataset( + data=pd.DataFrame( + { + "quantile_P50": [5.0, 5.0, 5.0], + }, + index=index, + ), + sample_interval=training_input_data.sample_interval, + ) + 
expected_result.index.freq = None + + config = MedianForecasterConfig(quantiles=[Q(0.5)], horizons=[LeadTime.from_string("PT3M")]) + model = MedianForecaster(config=config) + + # Act + model.fit(training_input_data) + result = model.predict(training_input_data) + + # Assert + assert result.sample_interval == expected_result.sample_interval + pd.testing.assert_frame_equal(result.data, expected_result.data) + + +def test_median_handles_all_missing_data(): + # Arrange + index = pd.date_range("2023-01-01", periods=3, freq="h") + training_data = create_timeseries_dataset( + index=index, + load=[np.nan, np.nan, np.nan], + load_lag_PT1H=[np.nan, np.nan, np.nan], + load_lag_PT2H=[np.nan, np.nan, np.nan], + load_lag_PT3H=[np.nan, np.nan, np.nan], + ) + + training_input_data = ForecastInputDataset.from_timeseries( + dataset=training_data, + target_column="load", + forecast_start=index[0], + ) + + expected_result = ForecastDataset( + data=pd.DataFrame( + { + "quantile_P50": [], + }, + index=pd.DatetimeIndex([], freq="h"), + ), + sample_interval=training_input_data.sample_interval, + forecast_start=training_input_data.forecast_start, + ) + + config = MedianForecasterConfig(quantiles=[Q(0.5)], horizons=[LeadTime.from_string("PT3M")]) + model = MedianForecaster(config=config) + + # Act + model.fit(training_input_data) + result = model.predict(training_input_data) + + # Assert + assert result.sample_interval == expected_result.sample_interval + pd.testing.assert_frame_equal(result.data, expected_result.data) + + +def test_median_uses_lag_features_if_available(): + # Arrange + index = pd.date_range("2023-01-01T00:00", periods=3, freq="h") + training_data = create_timeseries_dataset( + index=index, + load=[4.0, 5.0, 6.0], + load_lag_PT1H=[1.0, 2.0, 3.0], + load_lag_PT2H=[4.0, 5.0, 6.0], + load_lag_PT3H=[7.0, 8.0, 9.0], + ) + + training_input_data = ForecastInputDataset.from_timeseries( + dataset=training_data, + target_column="load", + forecast_start=index[0], + ) + + 
def test_median_handles_small_gap():
    """A single missing timestamp in the input is tolerated: predictions are
    produced for the remaining timestamps only, with the overall median filled in."""
    # Arrange
    index = pd.date_range("2023-01-01T00:00", periods=5, freq="h")
    training_data = create_timeseries_dataset(
        index=index,
        load=[
            5.0,
            4.0,
            3.0,
            2.0,
            1.0,
        ],
        # Triangular NaN pattern: lag PT{k}H is only available from row k onward.
        load_lag_PT1H=[1.0, np.nan, np.nan, np.nan, np.nan],
        load_lag_PT2H=[2.0, 1.0, np.nan, np.nan, np.nan],
        load_lag_PT3H=[3.0, 2.0, 1.0, np.nan, np.nan],
        load_lag_PT4H=[4.0, 3.0, 2.0, 1.0, np.nan],
        load_lag_PT5H=[5.0, 4.0, 3.0, 2.0, 1.0],
    )

    training_input_data = ForecastInputDataset.from_timeseries(
        dataset=training_data,
        target_column="load",
        forecast_start=index[0],
    )

    # Remove the second row to create a small gap
    training_input_data.data = training_input_data.data[training_input_data.data.index != "2023-01-01T01:00"]

    # Expected output skips the removed timestamp (index position 1).
    expected_result = ForecastDataset(
        data=pd.DataFrame(
            {
                "quantile_P50": [3.0, 3.0, 3.0, 3.0],
            },
            index=pd.date_range("2023-01-01T00:00", periods=5, freq="h").delete(1),
        ),
        sample_interval=training_input_data.sample_interval,
    )
    # Drop the inferred freq so the index compares equal to the forecaster's output.
    expected_result.index.freq = None

    # NOTE(review): horizon PT3M with hourly data — sibling tests use PT3H; confirm intended.
    config = MedianForecasterConfig(quantiles=[Q(0.5)], horizons=[LeadTime.from_string("PT3M")])
    model = MedianForecaster(config=config)

    # Act
    model.fit(training_input_data)
    result = model.predict(training_input_data)

    # Assert
    assert result.sample_interval == expected_result.sample_interval
    pd.testing.assert_frame_equal(result.data, expected_result.data)


def test_median_handles_large_gap():
    """Two disjoint hourly segments (a multi-hour gap between them) are both
    predicted; each row's P50 is the median of that row's lag features."""
    # Arrange
    index_1 = pd.date_range("2023-01-01T00:00", periods=3, freq="h")
    training_data_1 = create_timeseries_dataset(
        index=index_1,
        load=[1.0, 2.0, 3.0],
        load_lag_PT1H=[4.0, 5.0, 6.0],
        load_lag_PT2H=[7.0, 8.0, 9.0],
    )

    index_2 = pd.date_range("2023-01-02T01:00", periods=3, freq="h")
    training_data_2 = create_timeseries_dataset(
        index=index_2,
        load=[10.0, 11.0, 12.0],
        load_lag_PT1H=[13.0, 14.0, 15.0],
        load_lag_PT2H=[16.0, 17.0, 18.0],
    )

    training_data = training_data_1

    # Splice the second segment onto the first to form one dataset with a large gap.
    training_data.data = pd.concat([training_data_1.data, training_data_2.data])

    training_input_data = ForecastInputDataset.from_timeseries(
        dataset=training_data,
        target_column="load",
        forecast_start=index_1[0],
    )

    # Per-row medians of the two lag columns, e.g. median(4.0, 7.0) == 5.5.
    expected_result = ForecastDataset(
        data=pd.DataFrame(
            {
                "quantile_P50": [5.5, 6.5, 7.5, 14.5, 15.5, 16.5],
            },
            index=pd.DatetimeIndex(pd.concat([index_1.to_series(), index_2.to_series()])),
        ),
        sample_interval=training_input_data.sample_interval,
    )
    # Drop the inferred freq so the index compares equal to the forecaster's output.
    expected_result.index.freq = None

    # NOTE(review): horizon PT3M with hourly data — sibling tests use PT3H; confirm intended.
    config = MedianForecasterConfig(quantiles=[Q(0.5)], horizons=[LeadTime.from_string("PT3M")])
    model = MedianForecaster(config=config)

    # Act
    model.fit(training_input_data)
    result = model.predict(training_input_data)

    # Assert
    assert result.sample_interval == expected_result.sample_interval
    pd.testing.assert_frame_equal(result.data, expected_result.data)


def test_median_fit_with_missing_features_raises():
    """predict() raises ValueError when the prediction data lacks a lag feature
    that was present at fit time."""
    # Arrange
    index = pd.date_range("2023-01-01", periods=3, freq="h")
    training_data = create_timeseries_dataset(
        index=index,
        load=[1.0, 2.0, 3.0],
        load_lag_PT1H=[1.0, 2.0, 3.0],
        load_lag_PT2H=[4.0, 5.0, 6.0],
        load_lag_PT3H=[7.0, 8.0, 9.0],
    )

    training_input_data = ForecastInputDataset.from_timeseries(
        dataset=training_data,
        target_column="load",
        forecast_start=index[0],
    )

    config = MedianForecasterConfig(quantiles=[Q(0.5)], horizons=[LeadTime.from_string("PT3H")])
    model = MedianForecaster(config=config)
    model.fit(training_input_data)

    # Create prediction data missing one lag feature
    prediction_data = training_data.data.copy()
    prediction_data = prediction_data.drop(columns=["load_lag_PT3H"])
    prediction_input_data = ForecastInputDataset(
        data=prediction_data,
        target_column="load",
        forecast_start=index[0],
        sample_interval=training_input_data.sample_interval,
    )

    # Act & Assert
    with pytest.raises(ValueError, match="The input data is missing the following lag features"):
        model.predict(prediction_input_data)


def test_median_fit_with_no_lag_features_raises():
    """fit() raises ValueError when the input contains no load_lag_* columns."""
    # Arrange
    index = pd.date_range("2023-01-01", periods=3, freq="h")
    training_data = create_timeseries_dataset(
        index=index,
        load=[1.0, 2.0, 3.0],
        unrelated_feature=[1.0, 2.0, 3.0],
    )

    training_input_data = ForecastInputDataset.from_timeseries(
        dataset=training_data,
        target_column="load",
        forecast_start=index[0],
    )

    config = MedianForecasterConfig(quantiles=[Q(0.5)], horizons=[LeadTime.from_string("PT3H")])
    model = MedianForecaster(config=config)

    # Act & Assert
    with pytest.raises(ValueError, match=r"No lag features found in the input data."):
        model.fit(training_input_data)


def test_median_fit_with_inconsistent_lag_features_raises():
    """fit() raises ValueError when lag features do not form an evenly spaced grid
    (here PT1H, PT5H, PT60H, P4D)."""
    # Arrange
    index = pd.date_range("2023-01-01", periods=3, freq="h")
    training_data = create_timeseries_dataset(
        index=index,
        load=[1.0, 2.0, 3.0],
        load_lag_PT1H=[1.0, 2.0, 3.0],
        load_lag_PT5H=[4.0, 5.0, 6.0],
        load_lag_PT60H=[7.0, 8.0, 9.0],
        load_lag_P4D=[10.0, 11.0, 12.0],
    )

    training_input_data = ForecastInputDataset.from_timeseries(
        dataset=training_data,
        target_column="load",
        forecast_start=index[0],
    )

    config = MedianForecasterConfig(quantiles=[Q(0.5)], horizons=[LeadTime.from_string("PT3H")])
    model = MedianForecaster(config=config)

    # Act & Assert
    with pytest.raises(ValueError, match="Lag features are not evenly spaced"):
        model.fit(training_input_data)


def test_median_fit_with_inconsistent_frequency_raises():
    """fit() raises ValueError when the lag-feature spacing (hourly) does not
    match the data's sample interval (one minute)."""
    # Arrange
    index = pd.date_range("2023-01-01", periods=3, freq="min")
    training_data = create_timeseries_dataset(
        index=index,
        load=[1.0, 2.0, 3.0],
        load_lag_PT1H=[1.0, 2.0, 3.0],
        load_lag_PT2H=[4.0, 5.0, 6.0],
        load_lag_PT3H=[7.0, 8.0, 9.0],
        sample_interval=timedelta(minutes=1),
    )

    training_input_data = ForecastInputDataset.from_timeseries(
        dataset=training_data,
        target_column="load",
        forecast_start=index[0],
    )

    config = MedianForecasterConfig(quantiles=[Q(0.5)], horizons=[LeadTime.from_string("PT3H")])
    model = MedianForecaster(config=config)

    # Act & Assert
    # re.escape because the expected message contains regex metacharacters (parentheses).
    with pytest.raises(
        ValueError,
        match=re.escape(
            r"Lag feature interval (1:00:00) does not match data frequency (0:01:00). Please ensure lag features match the data frequency."
        ),
    ):
        model.fit(training_input_data)


def test_predicting_without_fitting_raises():
    """predict() before fit() raises AttributeError (not-fitted guard)."""
    # Arrange
    index = pd.date_range("2023-01-01", periods=3, freq="min")
    training_data = create_timeseries_dataset(
        index=index,
        load=[1.0, 2.0, 3.0],
        load_lag_PT1M=[1.0, 2.0, 3.0],
        load_lag_PT2M=[4.0, 5.0, 6.0],
        load_lag_PT3M=[7.0, 8.0, 9.0],
        sample_interval=timedelta(minutes=1),
    )

    training_input_data = ForecastInputDataset.from_timeseries(
        dataset=training_data,
        target_column="load",
        forecast_start=index[0],
    )

    config = MedianForecasterConfig(quantiles=[Q(0.5)], horizons=[LeadTime.from_string("PT3M")])
    model = MedianForecaster(config=config)

    # Act & Assert
    with pytest.raises(AttributeError, match="This MedianForecaster instance is not fitted yet"):
        model.predict(training_input_data)
b/packages/openstef-models/tests/unit/transforms/postprocessing/test_confidence_interval_applicator.py index c07399345..79a16dd17 100644 --- a/packages/openstef-models/tests/unit/transforms/postprocessing/test_confidence_interval_applicator.py +++ b/packages/openstef-models/tests/unit/transforms/postprocessing/test_confidence_interval_applicator.py @@ -32,7 +32,7 @@ def validation_predictions() -> ForecastDataset: }, index=index, ), - sample_interval=timedelta(minutes=15), + sample_interval=timedelta(hours=1), forecast_start=datetime.fromisoformat("2018-01-01T00:00:00"), ) @@ -51,7 +51,7 @@ def predictions() -> ForecastDataset: }, index=forecast_index, ), - sample_interval=timedelta(minutes=15), + sample_interval=timedelta(hours=1), forecast_start=forecast_start, ) diff --git a/packages/openstef-models/tests/unit/transforms/time_domain/test_datetime_features_adder.py b/packages/openstef-models/tests/unit/transforms/time_domain/test_datetime_features_adder.py index 79b52f2e4..d4bad586a 100644 --- a/packages/openstef-models/tests/unit/transforms/time_domain/test_datetime_features_adder.py +++ b/packages/openstef-models/tests/unit/transforms/time_domain/test_datetime_features_adder.py @@ -48,7 +48,7 @@ def test_datetime_features_weekday_weekend(): {"value": [1.0, 2.0]}, index=pd.DatetimeIndex(["2025-01-06", "2025-01-11"]), # Monday, Saturday ) - input_data = TimeSeriesDataset(data, timedelta(days=1)) + input_data = TimeSeriesDataset(data, timedelta(days=5)) transform = DatetimeFeaturesAdder() result = transform.transform(input_data) @@ -82,7 +82,7 @@ def test_datetime_features_month_quarter(): {"value": [1.0, 2.0, 3.0]}, index=pd.DatetimeIndex(["2025-01-15", "2025-04-15", "2025-10-15"]), # Jan Q1, Apr Q2, Oct Q4 ) - input_data = TimeSeriesDataset(data, timedelta(days=1)) + input_data = TimeSeriesDataset(data, timedelta(days=90)) transform = DatetimeFeaturesAdder() result = transform.transform(input_data) @@ -102,7 +102,7 @@ def 
test_datetime_features_onehot_encoding(): {"value": [1.0, 2.0]}, index=pd.DatetimeIndex(["2025-01-15", "2025-04-15"]), # Jan, Apr ) - input_data = TimeSeriesDataset(data, timedelta(days=1)) + input_data = TimeSeriesDataset(data, timedelta(days=90)) transform = DatetimeFeaturesAdder(onehot_encode=True) result = transform.transform(input_data) diff --git a/packages/openstef-models/tests/unit/transforms/time_domain/test_rolling_aggregates_adder.py b/packages/openstef-models/tests/unit/transforms/time_domain/test_rolling_aggregates_adder.py index b74a42317..fe7e0bd2c 100644 --- a/packages/openstef-models/tests/unit/transforms/time_domain/test_rolling_aggregates_adder.py +++ b/packages/openstef-models/tests/unit/transforms/time_domain/test_rolling_aggregates_adder.py @@ -92,7 +92,7 @@ def test_rolling_aggregate_features_missing_column_raises_error(): {"not_load": [1.0, 2.0, 3.0]}, index=pd.date_range("2023-01-01 00:00:00", periods=3, freq="1h"), ) - dataset = TimeSeriesDataset(data, sample_interval=timedelta(minutes=15)) + dataset = TimeSeriesDataset(data, sample_interval=timedelta(hours=1)) transform = RollingAggregatesAdder( feature="load", horizons=[LeadTime.from_string("PT36H")], diff --git a/packages/openstef-models/tests/unit/transforms/weather_domain/test_daylight_feature_adder.py b/packages/openstef-models/tests/unit/transforms/weather_domain/test_daylight_feature_adder.py index 84b31ca38..a95de7aed 100644 --- a/packages/openstef-models/tests/unit/transforms/weather_domain/test_daylight_feature_adder.py +++ b/packages/openstef-models/tests/unit/transforms/weather_domain/test_daylight_feature_adder.py @@ -79,10 +79,10 @@ def test_daylight_features_nighttime_values(): def test_daylight_features_different_coordinates(): """Test daylight calculation with different geographical coordinates.""" # Same time, different locations should give different results - time_index = pd.DatetimeIndex(["2025-06-21 12:00:00"], tz="UTC") + time_index = pd.DatetimeIndex(["2025-06-21 
12:00:00", "2025-06-21 13:00:00"], tz="UTC") # Amsterdam vs Cape Town (southern hemisphere, winter) - data = pd.DataFrame({"value": [1.0]}, index=time_index) + data = pd.DataFrame({"value": [1.0, 2.0]}, index=time_index) input_data = TimeSeriesDataset(data, timedelta(hours=1)) # Amsterdam (northern hemisphere, summer)