diff --git a/examples/benchmarks/liander_2024_benchmark_xgboost_gblinear.py b/examples/benchmarks/liander_2024_benchmark_xgboost_gblinear.py
index b18773206..00c07bdb7 100644
--- a/examples/benchmarks/liander_2024_benchmark_xgboost_gblinear.py
+++ b/examples/benchmarks/liander_2024_benchmark_xgboost_gblinear.py
@@ -40,6 +40,7 @@
 BENCHMARK_RESULTS_PATH_GBLINEAR = OUTPUT_PATH / "GBLinear"
 
 N_PROCESSES = multiprocessing.cpu_count()  # Amount of parallel processes to use for the benchmark
+
 # Model configuration
 FORECAST_HORIZONS = [LeadTime.from_string("P3D")]  # Forecast horizon(s)
 PREDICTION_QUANTILES = [
diff --git a/examples/benchmarks/liander_2024_ensemble.py b/examples/benchmarks/liander_2024_ensemble.py
new file mode 100644
index 000000000..29ecb9b25
--- /dev/null
+++ b/examples/benchmarks/liander_2024_ensemble.py
@@ -0,0 +1,128 @@
+"""Liander 2024 Ensemble Benchmark Example.
+
+====================================
+
+This example demonstrates how to set up and run the Liander 2024 STEF benchmark using OpenSTEF BEAM.
+The benchmark evaluates an ensemble of base forecasters, merged by a forecast combiner, on the dataset from HuggingFace.
+"""
+
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project
+#
+# SPDX-License-Identifier: MPL-2.0
+
+import os
+import time
+
+os.environ["OMP_NUM_THREADS"] = "1"  # Set OMP_NUM_THREADS to 1 to avoid issues with parallel execution and xgboost
+os.environ["OPENBLAS_NUM_THREADS"] = "1"
+os.environ["MKL_NUM_THREADS"] = "1"
+
+import logging
+import multiprocessing
+from datetime import timedelta
+from pathlib import Path
+
+from openstef_beam.backtesting.backtest_forecaster import BacktestForecasterConfig
+from openstef_beam.benchmarking.baselines import (
+    create_openstef4_preset_backtest_forecaster,
+)
+from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category, create_liander2024_benchmark_runner
+from openstef_beam.benchmarking.callbacks.strict_execution_callback import StrictExecutionCallback
+from openstef_beam.benchmarking.storage.local_storage import LocalBenchmarkStorage
+from openstef_core.types import LeadTime, Q
+from openstef_meta.presets import (
+    EnsembleWorkflowConfig,
+)
+from openstef_models.integrations.mlflow.mlflow_storage import MLFlowStorage
+
+logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s")
+
+OUTPUT_PATH = Path("./benchmark_results")
+
+N_PROCESSES = 1  # Amount of parallel processes; set to multiprocessing.cpu_count() to use all cores
+
+ensemble_type = "learned_weights"  # "stacking", "learned_weights" or "rules"
+base_models = ["lgbm", "gblinear"]  # combination of "lgbm", "gblinear", "xgboost" and "lgbm_linear"
+combiner_model = (
+    "lgbm"  # "lgbm", "xgboost", "rf" or "logistic" for learned weights combiner, gblinear for stacking combiner
+)
+
+model = "Ensemble_" + "_".join(base_models) + "_" + ensemble_type + "_" + combiner_model
+
+# Model configuration
+FORECAST_HORIZONS = [LeadTime.from_string("PT36H")]  # Forecast horizon(s)
+PREDICTION_QUANTILES = [
+    Q(0.05),
+    Q(0.1),
+    Q(0.3),
+    Q(0.5),
+    Q(0.7),
+    Q(0.9),
+    Q(0.95),
+]  # Quantiles for probabilistic forecasts
+
+BENCHMARK_FILTER: list[Liander2024Category] | None = None
+
+USE_MLFLOW_STORAGE = False
+
+if USE_MLFLOW_STORAGE:
+    storage = MLFlowStorage(
+        tracking_uri=str(OUTPUT_PATH / "mlflow_artifacts"),
+        local_artifacts_path=OUTPUT_PATH / "mlflow_tracking_artifacts",
+    )
+else:
+    storage = None
+
+workflow_config = EnsembleWorkflowConfig(
+    model_id="common_model_",
+    ensemble_type=ensemble_type,
+    base_models=base_models,  # type: ignore
+    combiner_model=combiner_model,
+    horizons=FORECAST_HORIZONS,
+    quantiles=PREDICTION_QUANTILES,
+    model_reuse_enable=False,
+    mlflow_storage=storage,
+    radiation_column="shortwave_radiation",
+    rolling_aggregate_features=["mean", "median", "max", "min"],
+    wind_speed_column="wind_speed_80m",
+    pressure_column="surface_pressure",
+    temperature_column="temperature_2m",
+    relative_humidity_column="relative_humidity_2m",
+    energy_price_column="EPEX_NL",
+    forecast_combiner_sample_weight_exponent=0,
+    forecaster_sample_weight_exponent={"gblinear": 1, "lgbm": 0, "xgboost": 0, "lgbm_linear": 0},
+)
+
+
+# Create the backtest configuration
+backtest_config = BacktestForecasterConfig(
+    requires_training=True,
+    predict_length=timedelta(days=7),
+    predict_min_length=timedelta(minutes=15),
+    predict_context_length=timedelta(days=14),  # Context needed for lag features
+    predict_context_min_coverage=0.5,
+    training_context_length=timedelta(days=90),  # Three months of training data
+    training_context_min_coverage=0.5,
+    predict_sample_interval=timedelta(minutes=15),
+)
+
+
+if __name__ == "__main__":
+    start_time = time.time()
+    create_liander2024_benchmark_runner(
+        storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / model),
+        data_dir=Path("../data/liander2024-energy-forecasting-benchmark"),
+        callbacks=[StrictExecutionCallback()],
+    ).run(
+        forecaster_factory=create_openstef4_preset_backtest_forecaster(
+            workflow_config=workflow_config,
+            backtest_config=backtest_config,
+            cache_dir=OUTPUT_PATH / "cache",
+        ),
+        run_name=model,
+        n_processes=N_PROCESSES,
+        filter_args=BENCHMARK_FILTER,
+    )
+
+    end_time = time.time()
+    print(f"Benchmark completed in {end_time - start_time:.2f} seconds.")
diff --git a/examples/benchmarks/liander_2024_residual.py b/examples/benchmarks/liander_2024_residual.py
new file mode 100644
index 000000000..aecb3de9e
--- /dev/null
+++ b/examples/benchmarks/liander_2024_residual.py
@@ -0,0 +1,122 @@
+"""Liander 2024 Residual Benchmark Example.
+
+====================================
+
+This example demonstrates how to set up and run the Liander 2024 STEF benchmark using OpenSTEF BEAM.
+The benchmark evaluates the "residual" forecasting preset on the dataset from HuggingFace.
+""" + +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +import os +import time + +os.environ["OMP_NUM_THREADS"] = "1" # Set OMP_NUM_THREADS to 1 to avoid issues with parallel execution and xgboost +os.environ["OPENBLAS_NUM_THREADS"] = "1" +os.environ["MKL_NUM_THREADS"] = "1" + +import logging +import multiprocessing +from datetime import timedelta +from pathlib import Path + +from openstef_beam.backtesting.backtest_forecaster import BacktestForecasterConfig +from openstef_beam.benchmarking.baselines import ( + create_openstef4_preset_backtest_forecaster, +) +from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category, create_liander2024_benchmark_runner +from openstef_beam.benchmarking.callbacks.strict_execution_callback import StrictExecutionCallback +from openstef_beam.benchmarking.storage.local_storage import LocalBenchmarkStorage +from openstef_core.types import LeadTime, Q +from openstef_models.integrations.mlflow.mlflow_storage import MLFlowStorage +from openstef_models.presets import ( + ForecastingWorkflowConfig, +) + +logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s") + +logger = logging.getLogger(__name__) + +OUTPUT_PATH = Path("./benchmark_results") + +N_PROCESSES = multiprocessing.cpu_count() # Amount of parallel processes to use for the benchmark + +model = "residual" # Can be "stacking", "learned_weights" or "residual" + +# Model configuration +FORECAST_HORIZONS = [LeadTime.from_string("PT36H")] # Forecast horizon(s) +PREDICTION_QUANTILES = [ + Q(0.05), + Q(0.1), + Q(0.3), + Q(0.5), + Q(0.7), + Q(0.9), + Q(0.95), +] # Quantiles for probabilistic forecasts + +BENCHMARK_FILTER: list[Liander2024Category] | None = None + +USE_MLFLOW_STORAGE = False + +if USE_MLFLOW_STORAGE: + storage = MLFlowStorage( + tracking_uri=str(OUTPUT_PATH / "mlflow_artifacts"), + local_artifacts_path=OUTPUT_PATH / "mlflow_tracking_artifacts", + ) +else: + storage = None + +common_config = ForecastingWorkflowConfig( + model_id="common_model_", + model=model, + horizons=FORECAST_HORIZONS, + quantiles=PREDICTION_QUANTILES, + model_reuse_enable=False, + mlflow_storage=None, + radiation_column="shortwave_radiation", + rolling_aggregate_features=["mean", "median", "max", "min"], + wind_speed_column="wind_speed_80m", + pressure_column="surface_pressure", + temperature_column="temperature_2m", + relative_humidity_column="relative_humidity_2m", + energy_price_column="EPEX_NL", +) + + +# Create the backtest configuration +backtest_config = BacktestForecasterConfig( + requires_training=True, + predict_length=timedelta(days=7), + predict_min_length=timedelta(minutes=15), + predict_context_length=timedelta(days=14), # Context needed for lag features + predict_context_min_coverage=0.5, + training_context_length=timedelta(days=90), # Three months of training data + training_context_min_coverage=0.5, + predict_sample_interval=timedelta(minutes=15), +) + + +if __name__ == "__main__": + start_time = time.time() + + # Run for XGBoost model + create_liander2024_benchmark_runner( + storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / model), + callbacks=[StrictExecutionCallback()], + ).run( + forecaster_factory=create_openstef4_preset_backtest_forecaster( + workflow_config=common_config, + cache_dir=OUTPUT_PATH / "cache", + ), + run_name=model, + n_processes=N_PROCESSES, + filter_args=BENCHMARK_FILTER, + ) + + end_time = time.time() + msg = f"Benchmark completed in {end_time - start_time:.2f} seconds." 
+ logger.info(msg) diff --git a/packages/openstef-beam/src/openstef_beam/benchmarking/baselines/openstef4.py b/packages/openstef-beam/src/openstef_beam/benchmarking/baselines/openstef4.py index d4e7cc355..9019a2156 100644 --- a/packages/openstef-beam/src/openstef_beam/benchmarking/baselines/openstef4.py +++ b/packages/openstef-beam/src/openstef_beam/benchmarking/baselines/openstef4.py @@ -11,18 +11,32 @@ from pathlib import Path from typing import Any, cast, override +import pandas as pd from pydantic import Field, PrivateAttr from pydantic_extra_types.coordinate import Coordinate -from openstef_beam.backtesting.backtest_forecaster.mixins import BacktestForecasterConfig, BacktestForecasterMixin -from openstef_beam.backtesting.restricted_horizon_timeseries import RestrictedHorizonVersionedTimeSeries -from openstef_beam.benchmarking.benchmark_pipeline import BenchmarkContext, BenchmarkTarget, ForecasterFactory +from openstef_beam.backtesting.backtest_forecaster.mixins import ( + BacktestForecasterConfig, + BacktestForecasterMixin, +) +from openstef_beam.backtesting.restricted_horizon_timeseries import ( + RestrictedHorizonVersionedTimeSeries, +) +from openstef_beam.benchmarking.benchmark_pipeline import ( + BenchmarkContext, + BenchmarkTarget, + ForecasterFactory, +) from openstef_core.base_model import BaseConfig, BaseModel from openstef_core.datasets import TimeSeriesDataset from openstef_core.exceptions import FlatlinerDetectedError, NotFittedError from openstef_core.types import Q +from openstef_meta.models.ensemble_forecasting_model import EnsembleForecastingModel +from openstef_meta.presets import EnsembleWorkflowConfig, create_ensemble_workflow from openstef_models.presets import ForecastingWorkflowConfig -from openstef_models.workflows.custom_forecasting_workflow import CustomForecastingWorkflow +from openstef_models.workflows.custom_forecasting_workflow import ( + CustomForecastingWorkflow, +) class WorkflowCreationContext(BaseConfig): @@ -54,6 +68,10 @@ class OpenSTEF4BacktestForecaster(BaseModel, BacktestForecasterMixin): default=False, description="When True, saves intermediate input data for debugging", ) + contributions: bool = Field( + default=False, + description="When True, saves base Forecaster prediction contributions for ensemble models in cache_dir", + ) _workflow: CustomForecastingWorkflow | None = PrivateAttr(default=None) _is_flatliner_detected: bool = PrivateAttr(default=False) @@ -62,7 +80,7 @@ class OpenSTEF4BacktestForecaster(BaseModel, BacktestForecasterMixin): @override def model_post_init(self, context: Any) -> None: - if self.debug: + if self.debug or self.contributions: self.cache_dir.mkdir(parents=True, exist_ok=True) @property @@ -72,6 +90,10 @@ def quantiles(self) -> list[Q]: if self._workflow is None: self._workflow = self.workflow_factory(WorkflowCreationContext()) # Extract quantiles from the workflow's model + + if isinstance(self._workflow.model, EnsembleForecastingModel): + name = self._workflow.model.forecaster_names[0] + return self._workflow.model.forecasters[name].config.quantiles return self._workflow.model.forecaster.config.quantiles @override @@ -82,7 +104,9 @@ def fit(self, data: RestrictedHorizonVersionedTimeSeries) -> None: # Extract the dataset for training training_data = data.get_window( - start=data.horizon - self.config.training_context_length, end=data.horizon, available_before=data.horizon + start=data.horizon - self.config.training_context_length, + end=data.horizon, + available_before=data.horizon, ) if self.debug: @@ -136,6 
+160,12 @@ def predict(self, data: RestrictedHorizonVersionedTimeSeries) -> TimeSeriesDatas predict_data.to_parquet(path=self.cache_dir / f"debug_{id_str}_predict.parquet") forecast.to_parquet(path=self.cache_dir / f"debug_{id_str}_forecast.parquet") + if self.contributions and isinstance(self._workflow.model, EnsembleForecastingModel): + contr_str = data.horizon.strftime("%Y%m%d%H%M%S") + contributions = self._workflow.model.predict_contributions(predict_data, forecast_start=data.horizon) + df = pd.concat([contributions, forecast.data.drop(columns=["load"])], axis=1) + + df.to_parquet(path=self.cache_dir / f"contrib_{contr_str}_predict.parquet") return forecast @@ -144,7 +174,7 @@ class OpenSTEF4PresetBacktestForecaster(OpenSTEF4BacktestForecaster): def _preset_target_forecaster_factory( - base_config: ForecastingWorkflowConfig, + base_config: ForecastingWorkflowConfig | EnsembleWorkflowConfig, backtest_config: BacktestForecasterConfig, cache_dir: Path, context: BenchmarkContext, @@ -158,6 +188,23 @@ def _preset_target_forecaster_factory( def _create_workflow(context: WorkflowCreationContext) -> CustomForecastingWorkflow: # Create a new workflow instance with fresh model. + if isinstance(base_config, EnsembleWorkflowConfig): + return create_ensemble_workflow( + config=base_config.model_copy( + update={ + "model_id": f"{prefix}_{target.name}", + "location": LocationConfig( + name=target.name, + description=target.description, + coordinate=Coordinate( + latitude=target.latitude, + longitude=target.longitude, + ), + ), + } + ) + ) + return create_forecasting_workflow( config=base_config.model_copy( update={ @@ -184,7 +231,7 @@ def _create_workflow(context: WorkflowCreationContext) -> CustomForecastingWorkf def create_openstef4_preset_backtest_forecaster( - workflow_config: ForecastingWorkflowConfig, + workflow_config: ForecastingWorkflowConfig | EnsembleWorkflowConfig, backtest_config: BacktestForecasterConfig | None = None, cache_dir: Path = Path("cache"), ) -> ForecasterFactory[BenchmarkTarget]: @@ -225,4 +272,8 @@ def create_openstef4_preset_backtest_forecaster( ) -__all__ = ["OpenSTEF4BacktestForecaster", "WorkflowCreationContext", "create_openstef4_preset_backtest_forecaster"] +__all__ = [ + "OpenSTEF4BacktestForecaster", + "WorkflowCreationContext", + "create_openstef4_preset_backtest_forecaster", +] diff --git a/packages/openstef-meta/README.md b/packages/openstef-meta/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/packages/openstef-meta/pyproject.toml b/packages/openstef-meta/pyproject.toml new file mode 100644 index 000000000..0f620e63b --- /dev/null +++ b/packages/openstef-meta/pyproject.toml @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +[project] +name = "openstef-meta" +version = "0.0.0" +description = "Meta models for OpenSTEF" +readme = "README.md" +keywords = [ "energy", "forecasting", "machinelearning" ] +license = "MPL-2.0" +authors = [ + { name = "Alliander N.V", email = "short.term.energy.forecasts@alliander.com" }, +] +requires-python = ">=3.12,<4.0" +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] + +dependencies = [ + "openstef-beam>=4.0.0.dev0,<5", + "openstef-core>=4.0.0.dev0,<5", + "openstef-models>=4.0.0.dev0,<5", +] + 
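+
+# The openstef-* dependencies above are the sibling packages maintained in this repository.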
+urls.Documentation = "https://openstef.github.io/openstef/index.html" +urls.Homepage = "https://lfenergy.org/projects/openstef/" +urls.Issues = "https://github.com/OpenSTEF/openstef/issues" +urls.Repository = "https://github.com/OpenSTEF/openstef" + +[tool.hatch.build.targets.wheel] +packages = [ "src/openstef_meta" ] diff --git a/packages/openstef-meta/src/openstef_meta/__init__.py b/packages/openstef-meta/src/openstef_meta/__init__.py new file mode 100644 index 000000000..ff5902981 --- /dev/null +++ b/packages/openstef-meta/src/openstef_meta/__init__.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 +"""Meta models for OpenSTEF.""" + +import logging + +# Set up logging configuration +root_logger = logging.getLogger(name=__name__) +if not root_logger.handlers: + root_logger.addHandler(logging.NullHandler()) + +__all__ = [] diff --git a/packages/openstef-meta/src/openstef_meta/examples/__init__.py b/packages/openstef-meta/src/openstef_meta/examples/__init__.py new file mode 100644 index 000000000..765b7c107 --- /dev/null +++ b/packages/openstef-meta/src/openstef_meta/examples/__init__.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +"""Examples for OpenSTEF Meta.""" diff --git a/packages/openstef-meta/src/openstef_meta/models/__init__.py b/packages/openstef-meta/src/openstef_meta/models/__init__.py new file mode 100644 index 000000000..13175057c --- /dev/null +++ b/packages/openstef-meta/src/openstef_meta/models/__init__.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +"""Meta Forecasting models.""" diff --git a/packages/openstef-meta/src/openstef_meta/models/ensemble_forecasting_model.py b/packages/openstef-meta/src/openstef_meta/models/ensemble_forecasting_model.py new file mode 100644 index 000000000..5394ab476 --- /dev/null +++ b/packages/openstef-meta/src/openstef_meta/models/ensemble_forecasting_model.py @@ -0,0 +1,740 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +"""High-level forecasting model that orchestrates the complete prediction pipeline. + +Combines feature engineering, forecasting, and postprocessing into a unified interface. +Handles both single-horizon and multi-horizon forecasters while providing consistent +data transformation and validation. 
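+
+The ensemble is trained in two stages: each base Forecaster is fitted on the commonly and
+model-specifically preprocessed data, after which the ForecastCombiner is fitted on the
+collected base predictions, optionally enriched with combiner-specific features.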
+""" + +import logging +from datetime import datetime, timedelta +from functools import partial +from typing import cast, override + +import pandas as pd +from pydantic import Field, PrivateAttr + +from openstef_beam.evaluation import EvaluationConfig, EvaluationPipeline, SubsetMetric +from openstef_beam.evaluation.metric_providers import MetricProvider, ObservedProbabilityProvider, R2Provider +from openstef_core.base_model import BaseModel +from openstef_core.datasets import ( + ForecastDataset, + ForecastInputDataset, + TimeSeriesDataset, +) +from openstef_core.datasets.timeseries_dataset import validate_horizons_present +from openstef_core.exceptions import NotFittedError +from openstef_core.mixins import Predictor, TransformPipeline +from openstef_meta.models.forecast_combiners.forecast_combiner import ForecastCombiner +from openstef_meta.utils.datasets import EnsembleForecastDataset +from openstef_models.models.forecasting import Forecaster +from openstef_models.models.forecasting.forecaster import ForecasterConfig +from openstef_models.models.forecasting_model import ModelFitResult +from openstef_models.utils.data_split import DataSplitter + +logger = logging.getLogger(__name__) + + +class EnsembleModelFitResult(BaseModel): + forecaster_fit_results: dict[str, ModelFitResult] = Field(description="ModelFitResult for each base Forecaster") + + combiner_fit_result: ModelFitResult = Field(description="ModelFitResult for the ForecastCombiner") + + # Make compatible with ModelFitResult interface + @property + def input_dataset(self) -> EnsembleForecastDataset: + """Returns the input dataset used for fitting the combiner.""" + return cast( + "EnsembleForecastDataset", + self.combiner_fit_result.input_dataset, + ) + + @property + def input_data_train(self) -> ForecastInputDataset: + """Returns the training input data used for fitting the combiner.""" + return self.combiner_fit_result.input_data_train + + @property + def input_data_val(self) -> ForecastInputDataset | None: + """Returns the validation input data used for fitting the combiner.""" + return self.combiner_fit_result.input_data_val + + @property + def input_data_test(self) -> ForecastInputDataset | None: + """Returns the test input data used for fitting the combiner.""" + return self.combiner_fit_result.input_data_test + + @property + def metrics_train(self) -> SubsetMetric: + """Returns the full metrics calculated during combiner fitting.""" + return self.combiner_fit_result.metrics_train + + @property + def metrics_val(self) -> SubsetMetric | None: + """Returns the full metrics calculated during combiner fitting.""" + return self.combiner_fit_result.metrics_val + + @property + def metrics_test(self) -> SubsetMetric | None: + """Returns the full metrics calculated during combiner fitting.""" + return self.combiner_fit_result.metrics_test + + @property + def metrics_full(self) -> SubsetMetric: + """Returns the full metrics calculated during combiner fitting.""" + return self.combiner_fit_result.metrics_full + + +class EnsembleForecastingModel(BaseModel, Predictor[TimeSeriesDataset, ForecastDataset]): + """Complete forecasting pipeline combining preprocessing, prediction, and postprocessing. + + Orchestrates the full forecasting workflow by managing feature engineering, + model training/prediction, and result postprocessing. Automatically handles + the differences between single-horizon and multi-horizon forecasters while + ensuring data consistency and validation throughout the pipeline. 
+ + Invariants: + - fit() must be called before predict() + - Forecaster and preprocessing horizons must match during initialization + + Important: + The `cutoff_history` parameter is crucial when using lag-based features in + preprocessing. For example, a lag-14 transformation creates NaN values for + the first 14 days of data. Set `cutoff_history` to exclude these incomplete + rows from training. You must configure this manually based on your preprocessing + pipeline since lags cannot be automatically inferred from the transforms. + + Example: + Basic forecasting workflow: + + >>> from openstef_models.models.forecasting.constant_median_forecaster import ( + ... ConstantMedianForecaster, ConstantMedianForecasterConfig + ... ) + >>> from openstef_meta.models.forecast_combiners.learned_weights_combiner import WeightsCombiner + >>> from openstef_core.types import LeadTime + >>> + >>> # Note: This is a conceptual example showing the API structure + >>> # Real usage requires implemented forecaster classes + >>> forecaster_1 = ConstantMedianForecaster( + ... config=ConstantMedianForecasterConfig(horizons=[LeadTime.from_string("PT36H")]) + ... ) + >>> forecaster_2 = ConstantMedianForecaster( + ... config=ConstantMedianForecasterConfig(horizons=[LeadTime.from_string("PT36H")]) + ... ) + >>> combiner_config = WeightsCombiner.Config( + ... horizons=[LeadTime.from_string("PT36H")], + ... ) + >>> # Create and train model + >>> model = EnsembleForecastingModel( + ... forecasters={"constant_median": forecaster_1, "constant_median_2": forecaster_2}, + ... combiner=WeightsCombiner(config=combiner_config), + ... cutoff_history=timedelta(days=14), # Match your maximum lag in preprocessing + ... ) + >>> model.fit(training_data) # doctest: +SKIP + >>> + >>> # Generate forecasts + >>> forecasts = model.predict(new_data) # doctest: +SKIP + """ + + # Forecasting components + common_preprocessing: TransformPipeline[TimeSeriesDataset] = Field( + default_factory=TransformPipeline[TimeSeriesDataset], + description="Feature engineering pipeline for transforming raw input data into model-ready features.", + exclude=True, + ) + + model_specific_preprocessing: dict[str, TransformPipeline[TimeSeriesDataset]] = Field( + default_factory=dict, + description="Feature engineering pipeline for transforming raw input data into model-ready features.", + exclude=True, + ) + + forecasters: dict[str, Forecaster] = Field( + default=..., + description="Underlying forecasting algorithm, either single-horizon or multi-horizon.", + exclude=True, + ) + + combiner: ForecastCombiner = Field( + default=..., + description="Combiner to aggregate forecasts from multiple forecasters if applicable.", + exclude=True, + ) + + combiner_preprocessing: TransformPipeline[TimeSeriesDataset] = Field( + default_factory=TransformPipeline[TimeSeriesDataset], + description="Feature engineering for the forecast combiner.", + exclude=True, + ) + + postprocessing: TransformPipeline[ForecastDataset] = Field( + default_factory=TransformPipeline[ForecastDataset], + description="Postprocessing pipeline for transforming model outputs into final forecasts.", + exclude=True, + ) + target_column: str = Field( + default="load", + description="Name of the target variable column in datasets.", + ) + data_splitter: DataSplitter = Field( + default_factory=DataSplitter, + description="Data splitting strategy for train/validation/test sets.", + ) + cutoff_history: timedelta = Field( + default=timedelta(days=0), + description="Amount of historical data to exclude from 
training and prediction due to incomplete features " + "from lag-based preprocessing. When using lag transforms (e.g., lag-14), the first N days contain NaN values. " + "Set this to match your maximum lag duration (e.g., timedelta(days=14)). " + "Default of 0 assumes no invalid rows are created by preprocessing.", + ) + # Evaluation + evaluation_metrics: list[MetricProvider] = Field( + default_factory=lambda: [R2Provider(), ObservedProbabilityProvider()], + description="List of metric providers for evaluating model score.", + ) + # Metadata + tags: dict[str, str] = Field( + default_factory=dict, + description="Optional metadata tags for the model.", + ) + + _logger: logging.Logger = PrivateAttr(default=logging.getLogger(__name__)) + + @property + def config(self) -> list[ForecasterConfig]: + """Returns the configuration of the underlying forecaster.""" + return [x.config for x in self.forecasters.values()] + + @property + @override + def is_fitted(self) -> bool: + return all(f.is_fitted for f in self.forecasters.values()) and self.combiner.is_fitted + + @property + def forecaster_names(self) -> list[str]: + """Returns the names of the underlying forecasters.""" + return list(self.forecasters.keys()) + + @override + def fit( + self, + data: TimeSeriesDataset, + data_val: TimeSeriesDataset | None = None, + data_test: TimeSeriesDataset | None = None, + ) -> EnsembleModelFitResult: + """Train the forecasting model on the provided dataset. + + Fits the preprocessing pipeline and underlying forecaster. Handles both + single-horizon and multi-horizon forecasters appropriately. + + The data splitting follows this sequence: + 1. Split test set from full data (using test_splitter) + 2. Split validation from remaining train+val data (using val_splitter) + 3. Train on the final training set + + Args: + data: Historical time series data with features and target values. + data_val: Optional validation data. If provided, splitters are ignored for validation. + data_test: Optional test data. If provided, splitters are ignored for test. + + Returns: + FitResult containing training details and metrics. + """ + # Fit forecasters + train_ensemble, val_ensemble, test_ensemble, forecaster_fit_results = self._fit_forecasters( + data=data, + data_val=data_val, + data_test=data_test, + ) + + combiner_fit_result = self._fit_combiner( + train_ensemble_dataset=train_ensemble, + val_ensemble_dataset=val_ensemble, + test_ensemble_dataset=test_ensemble, + data=data, + data_val=data_val, + data_test=data_test, + ) + + return EnsembleModelFitResult( + forecaster_fit_results=forecaster_fit_results, + combiner_fit_result=combiner_fit_result, + ) + + @staticmethod + def _combine_datasets( + data: ForecastInputDataset, additional_features: ForecastInputDataset + ) -> ForecastInputDataset: + """Combine Forecaster learner predictions with additional features for ForecastCombiner input. + + Args: + data: ForecastInputDataset containing base Forecaster predictions. + additional_features: ForecastInputDataset containing additional features. + + Returns: + ForecastInputDataset with combined features. 
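+
+        Note:
+            Columns already present in `data` take precedence: overlapping columns are
+            dropped from `additional_features` before the index join.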
+ """ + additional_df = additional_features.data.loc[ + :, [col for col in additional_features.data.columns if col not in data.data.columns] + ] + # Merge on index to combine datasets + combined_df = data.data.join(additional_df) + + return ForecastInputDataset( + data=combined_df, + sample_interval=data.sample_interval, + forecast_start=data.forecast_start, + ) + + def _transform_combiner_data(self, data: TimeSeriesDataset) -> ForecastInputDataset | None: + if len(self.combiner_preprocessing.transforms) == 0: + return None + combiner_data = self.combiner_preprocessing.transform(data) + return ForecastInputDataset.from_timeseries(combiner_data, target_column=self.target_column) + + def _fit_prepare_combiner_data( + self, + data: TimeSeriesDataset, + data_val: TimeSeriesDataset | None = None, + data_test: TimeSeriesDataset | None = None, + ) -> tuple[ForecastInputDataset | None, ForecastInputDataset | None, ForecastInputDataset | None]: + + if len(self.combiner_preprocessing.transforms) == 0: + return None, None, None + self.combiner_preprocessing.fit(data=data) + + input_data_train = self.combiner_preprocessing.transform(data) + input_data_val = self.combiner_preprocessing.transform(data_val) if data_val else None + input_data_test = self.combiner_preprocessing.transform(data_test) if data_test else None + + input_data_train, input_data_val, input_data_test = self.data_splitter.split_dataset( + data=input_data_train, data_val=input_data_val, data_test=input_data_test, target_column=self.target_column + ) + combiner_data = ForecastInputDataset.from_timeseries(input_data_train, target_column=self.target_column) + + combiner_data_val = ( + ForecastInputDataset.from_timeseries(input_data_val, target_column=self.target_column) + if input_data_val + else None + ) + + combiner_data_test = ( + ForecastInputDataset.from_timeseries(input_data_test, target_column=self.target_column) + if input_data_test + else None + ) + + return combiner_data, combiner_data_val, combiner_data_test + + def _fit_forecasters( + self, + data: TimeSeriesDataset, + data_val: TimeSeriesDataset | None = None, + data_test: TimeSeriesDataset | None = None, + ) -> tuple[ + EnsembleForecastDataset, + EnsembleForecastDataset | None, + EnsembleForecastDataset | None, + dict[str, ModelFitResult], + ]: + + predictions_train: dict[str, ForecastDataset] = {} + predictions_val: dict[str, ForecastDataset | None] = {} + predictions_test: dict[str, ForecastDataset | None] = {} + results: dict[str, ModelFitResult] = {} + + # Fit the feature engineering transforms + self.common_preprocessing.fit(data=data) + data_transformed = self.common_preprocessing.transform(data=data) + [ + self.model_specific_preprocessing[name].fit(data=data_transformed) + for name in self.model_specific_preprocessing + ] + logger.debug("Completed fitting preprocessing pipelines.") + + # Fit the forecasters + for name in self.forecasters: + logger.debug("Fitting Forecaster '%s'.", name) + predictions_train[name], predictions_val[name], predictions_test[name], results[name] = ( + self._fit_forecaster( + data=data, + data_val=data_val, + data_test=data_test, + forecaster_name=name, + ) + ) + + train_ensemble = EnsembleForecastDataset.from_forecast_datasets( + predictions_train, target_series=data.data[self.target_column] + ) + + if all(isinstance(v, ForecastDataset) for v in predictions_val.values()): + val_ensemble = EnsembleForecastDataset.from_forecast_datasets( + {k: v for k, v in predictions_val.items() if v is not None}, + 
target_series=data.data[self.target_column], + ) + else: + val_ensemble = None + + if all(isinstance(v, ForecastDataset) for v in predictions_test.values()): + test_ensemble = EnsembleForecastDataset.from_forecast_datasets( + {k: v for k, v in predictions_test.items() if v is not None}, + target_series=data.data[self.target_column], + ) + else: + test_ensemble = None + + return train_ensemble, val_ensemble, test_ensemble, results + + def _fit_forecaster( + self, + data: TimeSeriesDataset, + data_val: TimeSeriesDataset | None = None, + data_test: TimeSeriesDataset | None = None, + forecaster_name: str = "", + ) -> tuple[ + ForecastDataset, + ForecastDataset | None, + ForecastDataset | None, + ModelFitResult, + ]: + """Train the forecaster on the provided dataset. + + Args: + data: Historical time series data with features and target values. + data_val: Optional validation data. + data_test: Optional test data. + forecaster_name: Name of the forecaster to train. + + Returns: + ForecastDataset containing the trained forecaster's predictions. + """ + forecaster = self.forecasters[forecaster_name] + validate_horizons_present(data, forecaster.config.horizons) + + # Transform and split input data + input_data_train = self.prepare_input(data=data, forecaster_name=forecaster_name) + input_data_val = self.prepare_input(data=data_val, forecaster_name=forecaster_name) if data_val else None + input_data_test = self.prepare_input(data=data_test, forecaster_name=forecaster_name) if data_test else None + + # Drop target column nan's from training data. One can not train on missing targets. + target_dropna = partial(pd.DataFrame.dropna, subset=[self.target_column]) # pyright: ignore[reportUnknownMemberType] + input_data_train = input_data_train.pipe_pandas(target_dropna) + input_data_val = input_data_val.pipe_pandas(target_dropna) if input_data_val else None + input_data_test = input_data_test.pipe_pandas(target_dropna) if input_data_test else None + + # Transform the input data to a valid forecast input and split into train/val/test + input_data_train, input_data_val, input_data_test = self.data_splitter.split_dataset( + data=input_data_train, data_val=input_data_val, data_test=input_data_test, target_column=self.target_column + ) + + # Fit the model + logger.debug("Started fitting forecaster '%s'.", forecaster_name) + forecaster.fit(data=input_data_train, data_val=input_data_val) + logger.debug("Completed fitting forecaster '%s'.", forecaster_name) + + prediction_train = self._predict_forecaster(input_data=input_data_train, forecaster_name=forecaster_name) + metrics_train = self._calculate_score(prediction=prediction_train) + + if input_data_val is not None: + prediction_val = self._predict_forecaster(input_data=input_data_val, forecaster_name=forecaster_name) + metrics_val = self._calculate_score(prediction=prediction_val) + else: + prediction_val = None + metrics_val = None + + if input_data_test is not None: + prediction_test = self._predict_forecaster(input_data=input_data_test, forecaster_name=forecaster_name) + metrics_test = self._calculate_score(prediction=prediction_test) + else: + prediction_test = None + metrics_test = None + + result = ModelFitResult( + input_dataset=input_data_train, + input_data_train=input_data_train, + input_data_val=input_data_val, + input_data_test=input_data_test, + metrics_train=metrics_train, + metrics_val=metrics_val, + metrics_test=metrics_test, + metrics_full=metrics_train, + ) + + return prediction_train, prediction_val, prediction_test, result + + def 
_predict_forecaster(self, input_data: ForecastInputDataset, forecaster_name: str) -> ForecastDataset: + # Predict and restore target column + logger.debug("Predicting forecaster '%s'.", forecaster_name) + prediction_raw = self.forecasters[forecaster_name].predict(data=input_data) + prediction = self.postprocessing.transform(prediction_raw) + return restore_target(dataset=prediction, original_dataset=input_data, target_column=self.target_column) + + def _predict_forecasters( + self, + data: TimeSeriesDataset, + forecast_start: datetime | None = None, + ) -> EnsembleForecastDataset: + predictions: dict[str, ForecastDataset] = {} + for name in self.forecasters: + logger.debug("Generating predictions for forecaster '%s'.", name) + input_data = self.prepare_input(data=data, forecast_start=forecast_start, forecaster_name=name) + predictions[name] = self._predict_forecaster( + input_data=input_data, + forecaster_name=name, + ) + + return EnsembleForecastDataset.from_forecast_datasets(predictions, target_series=data.data[self.target_column]) + + def prepare_input( + self, + data: TimeSeriesDataset, + forecaster_name: str = "", + forecast_start: datetime | None = None, + ) -> ForecastInputDataset: + """Prepare input data for forecastingfiltering. + + Args: + data: Raw time series dataset to prepare for forecasting. + forecast_start: Optional start time for forecasts. If provided and earlier + than the cutoff time, overrides the cutoff for data filtering. + forecaster_name: Name of the forecaster for which to prepare input data. + + Returns: + Processed forecast input dataset ready for model prediction. + """ + logger.debug("Preparing input data for forecaster '%s'.", forecaster_name) + # Transform the data + input_data = self.common_preprocessing.transform(data=data) + if forecaster_name in self.model_specific_preprocessing: + logger.debug("Applying model-specific preprocessing for forecaster '%s'.", forecaster_name) + input_data = self.model_specific_preprocessing[forecaster_name].transform(data=input_data) + input_data = restore_target(dataset=input_data, original_dataset=data, target_column=self.target_column) + + # Cut away input history to avoid training on incomplete data + input_data_start = cast("pd.Series[pd.Timestamp]", input_data.index).min().to_pydatetime() + input_data_cutoff = input_data_start + self.cutoff_history + if forecast_start is not None and forecast_start < input_data_cutoff: + input_data_cutoff = forecast_start + self._logger.warning( + "Forecast start %s is after input data start + cutoff history %s. 
Using forecast start as cutoff.", + forecast_start, + input_data_cutoff, + ) + input_data = input_data.filter_by_range(start=input_data_cutoff) + + return ForecastInputDataset.from_timeseries( + dataset=input_data, + target_column=self.target_column, + forecast_start=forecast_start, + ) + + def _predict_transform_combiner( + self, ensemble_dataset: EnsembleForecastDataset, original_data: TimeSeriesDataset + ) -> ForecastDataset: + logger.debug("Predicting combiner.") + features = self._transform_combiner_data(data=original_data) + + return self._predict_combiner(ensemble_dataset, features) + + def _predict_combiner( + self, ensemble_dataset: EnsembleForecastDataset, features: ForecastInputDataset | None + ) -> ForecastDataset: + logger.debug("Predicting combiner.") + prediction_raw = self.combiner.predict(ensemble_dataset, additional_features=features) + prediction = self.postprocessing.transform(prediction_raw) + + return restore_target(dataset=prediction, original_dataset=ensemble_dataset, target_column=self.target_column) + + def _fit_combiner( + self, + data: TimeSeriesDataset, + train_ensemble_dataset: EnsembleForecastDataset, + data_val: TimeSeriesDataset | None = None, + data_test: TimeSeriesDataset | None = None, + val_ensemble_dataset: EnsembleForecastDataset | None = None, + test_ensemble_dataset: EnsembleForecastDataset | None = None, + ) -> ModelFitResult: + + features_train, features_val, features_test = self._fit_prepare_combiner_data( + data=data, data_val=data_val, data_test=data_test + ) + + logger.debug("Fitting combiner.") + self.combiner.fit( + data=train_ensemble_dataset, data_val=val_ensemble_dataset, additional_features=features_train + ) + + prediction_train = self._predict_combiner(train_ensemble_dataset, features=features_train) + metrics_train = self._calculate_score(prediction=prediction_train) + + if val_ensemble_dataset is not None: + prediction_val = self._predict_combiner(val_ensemble_dataset, features=features_val) + metrics_val = self._calculate_score(prediction=prediction_val) + else: + prediction_val = None + metrics_val = None + + if test_ensemble_dataset is not None: + prediction_test = self._predict_combiner(test_ensemble_dataset, features=features_test) + metrics_test = self._calculate_score(prediction=prediction_test) + else: + prediction_test = None + metrics_test = None + + return ModelFitResult( + input_dataset=train_ensemble_dataset, + input_data_train=train_ensemble_dataset.select_quantile(quantile=self.config[0].quantiles[0]), + input_data_val=val_ensemble_dataset.select_quantile(quantile=self.config[0].quantiles[0]) + if val_ensemble_dataset + else None, + input_data_test=test_ensemble_dataset.select_quantile(quantile=self.config[0].quantiles[0]) + if test_ensemble_dataset + else None, + metrics_train=metrics_train, + metrics_val=metrics_val, + metrics_test=metrics_test, + metrics_full=metrics_train, + ) + + def _predict_contributions_combiner( + self, ensemble_dataset: EnsembleForecastDataset, original_data: TimeSeriesDataset + ) -> pd.DataFrame: + + features = self._transform_combiner_data(data=original_data) + predictions = self.combiner.predict_contributions(ensemble_dataset, additional_features=features) + predictions[ensemble_dataset.target_column] = ensemble_dataset.target_series + return predictions + + def predict(self, data: TimeSeriesDataset, forecast_start: datetime | None = None) -> ForecastDataset: + """Generate forecasts for the provided dataset. + + Args: + data: Input time series dataset for prediction. 
+ forecast_start: Optional start time for forecasts. + + Returns: + ForecastDataset containing the generated forecasts. + + Raises: + NotFittedError: If the model has not been fitted yet. + """ + if not self.is_fitted: + raise NotFittedError(self.__class__.__name__) + logger.debug("Generating predictions.") + + ensemble_predictions = self._predict_forecasters(data=data, forecast_start=forecast_start) + + # Predict and restore target column + return self._predict_transform_combiner( + ensemble_dataset=ensemble_predictions, + original_data=data, + ) + + def predict_contributions(self, data: TimeSeriesDataset, forecast_start: datetime | None = None) -> pd.DataFrame: + """Generate forecasts for the provided dataset. + + Args: + data: Input time series dataset for prediction. + forecast_start: Optional start time for forecasts. + + Returns: + ForecastDataset containing the generated forecasts. + + Raises: + NotFittedError: If the model has not been fitted yet. + """ + if not self.is_fitted: + raise NotFittedError(self.__class__.__name__) + + ensemble_predictions = self._predict_forecasters(data=data, forecast_start=forecast_start) + + return self._predict_contributions_combiner( + ensemble_dataset=ensemble_predictions, + original_data=data, + ) + + def score( + self, + data: TimeSeriesDataset, + ) -> SubsetMetric: + """Evaluate model performance on the provided dataset. + + Generates predictions for the dataset and calculates evaluation metrics + by comparing against ground truth values. Uses the configured evaluation + metrics to assess forecast quality at the maximum forecast horizon. + + Args: + data: Time series dataset containing both features and target values + for evaluation. + + Returns: + Evaluation metrics including configured providers (e.g., R2, observed + probability) computed at the maximum forecast horizon. + """ + prediction = self.predict(data=data) + + return self._calculate_score(prediction=prediction) + + def _calculate_score(self, prediction: ForecastDataset) -> SubsetMetric: + if prediction.target_series is None: + raise ValueError("Prediction dataset must contain target series for scoring.") + + # We need to make sure there are no NaNs in the target label for metric calculation + prediction = prediction.pipe_pandas(pd.DataFrame.dropna, subset=[self.target_column]) # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType] + + pipeline = EvaluationPipeline( + # Needs only one horizon since we are using only a single prediction step + # If a more comprehensive test is needed, a backtest should be run. + config=EvaluationConfig(available_ats=[], lead_times=[self.config[0].max_horizon]), + quantiles=self.config[0].quantiles, + # Similarly windowed metrics are not relevant for single predictions. + window_metric_providers=[], + global_metric_providers=self.evaluation_metrics, + ) + + evaluation_result = pipeline.run_for_subset( + filtering=self.config[0].max_horizon, + predictions=prediction, + ) + global_metric = evaluation_result.get_global_metric() + if not global_metric: + return SubsetMetric( + window="global", + timestamp=prediction.forecast_start, + metrics={}, + ) + + return global_metric + + +def restore_target[T: TimeSeriesDataset]( + dataset: T, + original_dataset: TimeSeriesDataset, + target_column: str, +) -> T: + """Restore the target column from the original dataset to the given dataset. + + Maps target values from the original dataset to the dataset using index alignment. 
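+    Timestamps in `dataset` that do not occur in the original dataset receive NaN.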
+
+    Ensures the target column is present in the dataset for downstream processing.
+
+    Args:
+        dataset: Dataset to modify by adding the target column.
+        original_dataset: Source dataset containing the target values.
+        target_column: Name of the target column to restore.
+
+    Returns:
+        Dataset with the target column restored from the original dataset.
+    """
+    target_series = original_dataset.select_features([target_column]).select_version().data[target_column]
+
+    def _transform_restore_target(df: pd.DataFrame) -> pd.DataFrame:
+        return df.assign(**{str(target_series.name): df.index.map(target_series)})  # type: ignore
+
+    return dataset.pipe_pandas(_transform_restore_target)
+
+
+__all__ = ["EnsembleForecastingModel", "ModelFitResult", "restore_target"]
diff --git a/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/__init__.py b/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/__init__.py
new file mode 100644
index 000000000..56a4cadff
--- /dev/null
+++ b/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/__init__.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project
+#
+# SPDX-License-Identifier: MPL-2.0
+
+"""Forecast Combiners."""
+
+from .forecast_combiner import ForecastCombiner, ForecastCombinerConfig
+from .learned_weights_combiner import (
+    LGBMCombinerHyperParams,
+    LogisticCombinerHyperParams,
+    RFCombinerHyperParams,
+    WeightsCombiner,
+    WeightsCombinerConfig,
+    XGBCombinerHyperParams,
+)
+from .rules_combiner import RulesCombiner, RulesCombinerConfig
+from .stacking_combiner import StackingCombiner, StackingCombinerConfig
+
+__all__ = [
+    "ForecastCombiner",
+    "ForecastCombinerConfig",
+    "LGBMCombinerHyperParams",
+    "LogisticCombinerHyperParams",
+    "RFCombinerHyperParams",
+    "RulesCombiner",
+    "RulesCombinerConfig",
+    "StackingCombiner",
+    "StackingCombinerConfig",
+    "WeightsCombiner",
+    "WeightsCombinerConfig",
+    "XGBCombinerHyperParams",
+]
diff --git a/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/forecast_combiner.py b/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/forecast_combiner.py
new file mode 100644
index 000000000..a8cd4864f
--- /dev/null
+++ b/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/forecast_combiner.py
@@ -0,0 +1,145 @@
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project
+#
+# SPDX-License-Identifier: MPL-2.0
+"""Core meta model interfaces and configurations.
+
+Provides the fundamental building blocks for implementing meta models in OpenSTEF.
+These mixins establish contracts that ensure consistent behavior across different meta model
+types while remaining fully compatible with regular Forecasters.
+"""
+
+from abc import abstractmethod
+from typing import Self
+
+import pandas as pd
+from pydantic import ConfigDict, Field
+
+from openstef_core.base_model import BaseConfig
+from openstef_core.datasets import ForecastDataset, ForecastInputDataset
+from openstef_core.mixins import HyperParams, Predictor
+from openstef_core.types import LeadTime, Quantile
+from openstef_meta.utils.datasets import EnsembleForecastDataset
+from openstef_models.transforms.general.selector import Selector
+from openstef_models.utils.feature_selection import FeatureSelection
+
+SELECTOR = Selector(
+    selection=FeatureSelection(include=None),
+)
+
+
+class ForecastCombinerConfig(BaseConfig):
+    """Configuration for a ForecastCombiner."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    hyperparams: HyperParams = Field(
+        description="Hyperparameters for the final learner.",
+    )
+
+    quantiles: list[Quantile] = Field(
+        default=[Quantile(0.5)],
+        description=(
+            "Probability levels for uncertainty estimation. Each quantile represents a confidence level "
+            "(e.g., 0.1 = 10th percentile, 0.5 = median, 0.9 = 90th percentile). "
+            "Models must generate predictions for all specified quantiles."
+        ),
+        min_length=1,
+    )
+
+    horizons: list[LeadTime] = Field(
+        default=...,
+        description=(
+            "Lead times for predictions, accounting for data availability and versioning cutoffs. "
+            "Each horizon defines how far ahead the model should predict."
+        ),
+        min_length=1,
+    )
+
+    @property
+    def max_horizon(self) -> LeadTime:
+        """Returns the maximum lead time (horizon) from the configured horizons.
+
+        Useful for determining the furthest prediction distance required by the model.
+        This is commonly used for data preparation and validation logic.
+
+        Returns:
+            The maximum lead time.
+        """
+        return max(self.horizons)
+
+    def with_horizon(self, horizon: LeadTime) -> Self:
+        """Create a new configuration with a different horizon.
+
+        Useful for creating multiple forecaster instances for different prediction
+        horizons from a single base configuration.
+
+        Args:
+            horizon: The new lead time to use for predictions.
+
+        Returns:
+            New configuration instance with the specified horizon.
+        """
+        return self.model_copy(update={"horizons": [horizon]})
+
+
+class ForecastCombiner(Predictor[EnsembleForecastDataset, ForecastDataset]):
+    """Combines base Forecaster predictions for each quantile into final predictions."""
+
+    config: ForecastCombinerConfig
+
+    @abstractmethod
+    def fit(
+        self,
+        data: EnsembleForecastDataset,
+        data_val: EnsembleForecastDataset | None = None,
+        additional_features: ForecastInputDataset | None = None,
+    ) -> None:
+        """Fit the final learner using base Forecaster predictions.
+
+        Args:
+            data: EnsembleForecastDataset containing base Forecaster predictions.
+            data_val: Optional EnsembleForecastDataset for validation during fitting; implementations may ignore it.
+            additional_features: Optional ForecastInputDataset containing additional features for the final learner.
+        """
+        raise NotImplementedError("Subclasses must implement the fit method.")
+
+    @abstractmethod
+    def predict(
+        self,
+        data: EnsembleForecastDataset,
+        additional_features: ForecastInputDataset | None = None,
+    ) -> ForecastDataset:
+        """Generate final predictions based on base Forecaster predictions.
+
+        Args:
+            data: EnsembleForecastDataset containing base Forecaster predictions.
+            additional_features: Optional ForecastInputDataset containing additional features for the final learner.
+
+        Returns:
+            ForecastDataset containing the final predictions.
+        """
+        raise NotImplementedError("Subclasses must implement the predict method.")
+
+    @property
+    @abstractmethod
+    def is_fitted(self) -> bool:
+        """Indicates whether the final learner has been fitted."""
+        raise NotImplementedError("Subclasses must implement the is_fitted property.")
+
+    @abstractmethod
+    def predict_contributions(
+        self,
+        data: EnsembleForecastDataset,
+        additional_features: ForecastInputDataset | None = None,
+    ) -> pd.DataFrame:
+        """Estimate each base Forecaster's contribution to the final prediction.
+
+        Args:
+            data: EnsembleForecastDataset containing base Forecaster predictions.
+            additional_features: Optional ForecastInputDataset containing additional features for the final learner.
+
+        Returns:
+            DataFrame containing the contribution of each base Forecaster.
+        """
+        raise NotImplementedError("Subclasses must implement the predict_contributions method.")
diff --git a/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/learned_weights_combiner.py b/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/learned_weights_combiner.py
new file mode 100644
index 000000000..d2b0fac48
--- /dev/null
+++ b/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/learned_weights_combiner.py
@@ -0,0 +1,431 @@
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project
+#
+# SPDX-License-Identifier: MPL-2.0
+"""Learned Weights Combiner.
+
+Forecast combiner that uses a classification approach to learn weights for base forecasters.
+It is designed to efficiently combine predictions from multiple base forecasters by learning which
+forecaster is likely to perform best under different conditions. The combiner can operate in two modes:
+- Hard Selection: Selects the base forecaster with the highest predicted probability for each instance.
+- Soft Selection: Uses the predicted probabilities as weights to combine base forecaster predictions.
+"""
+
+import logging
+from abc import abstractmethod
+from typing import Literal, override
+
+import pandas as pd
+from lightgbm import LGBMClassifier
+from pydantic import Field
+from sklearn.dummy import DummyClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import LabelEncoder
+from sklearn.utils.class_weight import compute_sample_weight  # type: ignore
+from xgboost import XGBClassifier
+
+from openstef_core.datasets import ForecastDataset, ForecastInputDataset
+from openstef_core.exceptions import (
+    NotFittedError,
+)
+from openstef_core.mixins.predictor import HyperParams
+from openstef_core.types import LeadTime, Quantile
+from openstef_meta.models.forecast_combiners.forecast_combiner import (
+    ForecastCombiner,
+    ForecastCombinerConfig,
+)
+from openstef_meta.utils.datasets import EnsembleForecastDataset, combine_forecast_input_datasets
+
+logger = logging.getLogger(__name__)
+
+
+# Base classes for Learned Weights Final Learner
+
+Classifier = LGBMClassifier | XGBClassifier | LogisticRegression | DummyClassifier
+ClassifierNames = Literal["lgbm", "xgb", "logistic_regression", "dummy"]
+
+
+class ClassifierParamsMixin:
+    """Mixin for hyperparameter classes that can construct their own classifier."""
+
+    @abstractmethod
+    def get_classifier(self) -> Classifier:
+        """Returns the classifier instance."""
+        msg = "Subclasses must implement get_classifier method."
+ raise NotImplementedError(msg) + + +class LGBMCombinerHyperParams(HyperParams, ClassifierParamsMixin): + """Hyperparameters for Learned Weights Final Learner with LGBM Classifier.""" + + n_estimators: int = Field( + default=20, + description="Number of estimators for the LGBM Classifier. Defaults to 20.", + ) + + n_leaves: int = Field( + default=31, + description="Number of leaves for the LGBM Classifier. Defaults to 31.", + ) + + reg_alpha: float = Field( + default=0.0, + description="L1 regularization term on weights. Defaults to 0.0.", + ) + + reg_lambda: float = Field( + default=0.0, + description="L2 regularization term on weights. Defaults to 0.0.", + ) + + @override + def get_classifier(self) -> LGBMClassifier: + """Returns the LGBM Classifier.""" + return LGBMClassifier( + class_weight="balanced", + n_estimators=self.n_estimators, + num_leaves=self.n_leaves, + reg_alpha=self.reg_alpha, + reg_lambda=self.reg_lambda, + n_jobs=1, + ) + + +class RFCombinerHyperParams(HyperParams, ClassifierParamsMixin): + """Hyperparameters for Learned Weights Final Learner with LGBM Random Forest Classifier.""" + + n_estimators: int = Field( + default=20, + description="Number of estimators for the LGBM Classifier. Defaults to 20.", + ) + + n_leaves: int = Field( + default=31, + description="Number of leaves for the LGBM Classifier. Defaults to 31.", + ) + + bagging_freq: int = Field( + default=1, + description="Frequency for bagging in the Random Forest. Defaults to 1.", + ) + + bagging_fraction: float = Field( + default=0.8, + description="Fraction of data to be used for each iteration of the Random Forest. Defaults to 0.8.", + ) + + feature_fraction: float = Field( + default=1, + description="Fraction of features to be used for each iteration of the Random Forest. Defaults to 1.", + ) + + @override + def get_classifier(self) -> LGBMClassifier: + """Returns the Random Forest LGBMClassifier.""" + return LGBMClassifier( + boosting_type="rf", + class_weight="balanced", + n_estimators=self.n_estimators, + bagging_freq=self.bagging_freq, + bagging_fraction=self.bagging_fraction, + feature_fraction=self.feature_fraction, + num_leaves=self.n_leaves, + ) + + +# 3 XGB Classifier +class XGBCombinerHyperParams(HyperParams, ClassifierParamsMixin): + """Hyperparameters for Learned Weights Final Learner with LGBM Random Forest Classifier.""" + + n_estimators: int = Field( + default=20, + description="Number of estimators for the LGBM Classifier. Defaults to 20.", + ) + + @override + def get_classifier(self) -> XGBClassifier: + """Returns the XGBClassifier.""" + return XGBClassifier(n_estimators=self.n_estimators) + + +class LogisticCombinerHyperParams(HyperParams, ClassifierParamsMixin): + """Hyperparameters for Learned Weights Final Learner with LGBM Random Forest Classifier.""" + + fit_intercept: bool = Field( + default=True, + description="Whether to calculate the intercept for this model. Defaults to True.", + ) + + penalty: Literal["l1", "l2", "elasticnet"] = Field( + default="l2", + description="Specify the norm used in the penalization. Defaults to 'l2'.", + ) + + c: float = Field( + default=1.0, + description="Inverse of regularization strength; must be a positive float. 
Defaults to 1.0.", + ) + + @override + def get_classifier(self) -> LogisticRegression: + """Returns the LogisticRegression.""" + return LogisticRegression( + class_weight="balanced", + fit_intercept=self.fit_intercept, + penalty=self.penalty, + C=self.c, + ) + + +class WeightsCombinerConfig(ForecastCombinerConfig): + """Configuration for WeightsCombiner.""" + + hyperparams: HyperParams = Field( + default=LGBMCombinerHyperParams(), + description="Hyperparameters for the Weights Combiner.", + ) + + quantiles: list[Quantile] = Field( + default=[Quantile(0.5)], + description=( + "Probability levels for uncertainty estimation. Each quantile represents a confidence level " + "(e.g., 0.1 = 10th percentile, 0.5 = median, 0.9 = 90th percentile). " + "Models must generate predictions for all specified quantiles." + ), + min_length=1, + ) + + horizons: list[LeadTime] = Field( + default=..., + description=( + "Lead times for predictions, accounting for data availability and versioning cutoffs. " + "Each horizon defines how far ahead the model should predict." + ), + min_length=1, + ) + + hard_selection: bool = Field( + default=False, + description=( + "If True, the combiner will select the base model with the highest predicted probability " + "for each instance (hard selection). If False, it will use the predicted probabilities as " + "weights to combine base model predictions (soft selection)." + ), + ) + + @property + def get_classifier(self) -> Classifier: + """Returns the classifier instance from hyperparameters. + + Returns: + Classifier instance. + + Raises: + TypeError: If hyperparams do not implement ClassifierParamsMixin. + """ + if not isinstance(self.hyperparams, ClassifierParamsMixin): + msg = "hyperparams must implement ClassifierParamsMixin to get classifier." + raise TypeError(msg) + return self.hyperparams.get_classifier() + + +class WeightsCombiner(ForecastCombiner): + """Combines base Forecaster predictions with a classification approach. + + The classifier is used to predict model weights for each base forecaster. + Depending on the `hard_selection` parameter in the configuration, the combiner can either + select the base forecaster with the highest predicted probability (hard selection) or use + the predicted probabilities as weights to combine base forecaster predictions (soft selection). 
+    """
+
+    Config = WeightsCombinerConfig
+    LGBMHyperParams = LGBMCombinerHyperParams
+    RFHyperParams = RFCombinerHyperParams
+    XGBHyperParams = XGBCombinerHyperParams
+    LogisticHyperParams = LogisticCombinerHyperParams
+
+    def __init__(self, config: WeightsCombinerConfig) -> None:
+        """Initialize the Weights Combiner."""
+        self.quantiles = config.quantiles
+        self.config = config
+        self.hyperparams = config.hyperparams
+        self._is_fitted: bool = False
+        self._label_encoder = LabelEncoder()
+        self.hard_selection = config.hard_selection
+
+        # Initialize a classifier per quantile
+        self.models: list[Classifier] = [config.get_classifier for _ in self.quantiles]
+
+    @override
+    def fit(
+        self,
+        data: EnsembleForecastDataset,
+        data_val: EnsembleForecastDataset | None = None,
+        additional_features: ForecastInputDataset | None = None,
+    ) -> None:
+        self._label_encoder.fit(data.forecaster_names)
+
+        for i, q in enumerate(self.quantiles):
+            # Data preparation
+            dataset = data.select_quantile_classification(quantile=q)
+            combined_data = combine_forecast_input_datasets(
+                dataset=dataset,
+                other=additional_features,
+            )
+            input_data = combined_data.input_data()
+            labels = combined_data.target_series
+            self._validate_labels(labels=labels, model_index=i)
+            labels = self._label_encoder.transform(labels)
+
+            # Balance classes, adjust with sample weights
+            weights = compute_sample_weight("balanced", labels) * combined_data.sample_weight_series
+
+            self.models[i].fit(X=input_data, y=labels, sample_weight=weights)  # type: ignore
+        self._is_fitted = True
+
+    @staticmethod
+    def _prepare_input_data(
+        dataset: ForecastInputDataset, additional_features: ForecastInputDataset | None
+    ) -> pd.DataFrame:
+        """Prepare input data by combining base predictions with additional features if provided.
+
+        Args:
+            dataset: ForecastInputDataset containing base predictions.
+            additional_features: Optional ForecastInputDataset containing additional features.
+
+        Returns:
+            pd.DataFrame: Combined DataFrame of base predictions and additional features if provided.
+        """
+        df = dataset.input_data(start=dataset.index[0])
+        if additional_features is not None:
+            df_a = additional_features.input_data(start=dataset.index[0])
+            df = pd.concat(
+                [df, df_a],
+                axis=1,
+                join="inner",
+            )
+        return df
+
+    def _validate_labels(self, labels: pd.Series, model_index: int) -> None:
+        if len(labels.unique()) == 1:
+            msg = (
+                f"Final learner for quantile {self.quantiles[model_index].format()} has "
+                "fewer than 2 classes in the target. Switching to dummy classifier."
+            )
+            logger.warning(msg=msg)
+            self.models[model_index] = DummyClassifier(strategy="most_frequent")
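To make the class-balancing step in fit concrete: sklearn's compute_sample_weight("balanced", labels) up-weights timestamps where the rarely-best forecaster wins, and the result is multiplied by the dataset's own sample weights. A small sketch with invented labels:

import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

labels = np.array([0, 0, 0, 1])        # forecaster 0 was best three times, forecaster 1 once
user_weights = np.array([1.0, 1.0, 1.0, 2.0])

balanced = compute_sample_weight("balanced", labels)  # n / (n_classes * count) -> [0.67, 0.67, 0.67, 2.0]
combined = balanced * user_weights                    # [0.67, 0.67, 0.67, 4.0]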
+
+    def _predict_model_weights_quantile(self, base_predictions: pd.DataFrame, model_index: int) -> pd.DataFrame:
+        model = self.models[model_index]
+        if isinstance(model, DummyClassifier):
+            weights_array = pd.DataFrame(0, index=base_predictions.index, columns=self._label_encoder.classes_)
+            weights_array[self._label_encoder.classes_[0]] = 1.0
+        else:
+            weights_array = model.predict_proba(base_predictions)  # type: ignore
+
+        return pd.DataFrame(weights_array, index=base_predictions.index, columns=self._label_encoder.classes_)  # type: ignore
+
+    def _generate_predictions_quantile(
+        self,
+        dataset: ForecastInputDataset,
+        additional_features: ForecastInputDataset | None,
+        model_index: int,
+    ) -> pd.Series:
+        input_data = self._prepare_input_data(
+            dataset=dataset,
+            additional_features=additional_features,
+        )
+
+        weights = self._predict_model_weights_quantile(base_predictions=input_data, model_index=model_index)
+
+        if self.hard_selection:
+            # If selection mode is hard, set the max weight to 1 and others to 0.
+            # Edge case: if max weights are tied, distribute equally over the tied forecasters.
+            indicator = weights == weights.max(axis=1).to_frame().to_numpy()
+            weights = indicator / indicator.sum(axis=1).to_frame().to_numpy()
+
+        return dataset.input_data().mul(weights).sum(axis=1)
+
+    @override
+    def predict(
+        self,
+        data: EnsembleForecastDataset,
+        additional_features: ForecastInputDataset | None = None,
+    ) -> ForecastDataset:
+        if not self.is_fitted:
+            raise NotFittedError(self.__class__.__name__)
+
+        # Generate predictions
+        predictions = pd.DataFrame({
+            q.format(): self._generate_predictions_quantile(
+                dataset=data.select_quantile(quantile=q),
+                additional_features=additional_features,
+                model_index=i,
+            )
+            for i, q in enumerate(self.quantiles)
+        })
+        target_series = data.target_series
+        if target_series is not None:
+            predictions[data.target_column] = target_series
+
+        return ForecastDataset(
+            data=predictions,
+            sample_interval=data.sample_interval,
+            target_column=data.target_column,
+            forecast_start=data.forecast_start,
+        )
+
+    @override
+    def predict_contributions(
+        self,
+        data: EnsembleForecastDataset,
+        additional_features: ForecastInputDataset | None = None,
+    ) -> pd.DataFrame:
+        if not self.is_fitted:
+            raise NotFittedError(self.__class__.__name__)
+
+        # Generate contributions per quantile
+        contribution_list = [
+            self._generate_contributions_quantile(
+                dataset=data.select_quantile(quantile=q),
+                additional_features=additional_features,
+                model_index=i,
+            )
+            for i, q in enumerate(self.quantiles)
+        ]
+
+        contributions = pd.concat(contribution_list, axis=1)
+
+        target_series = data.target_series
+        if target_series is not None:
+            contributions[data.target_column] = target_series
+
+        return contributions
+
+    def _generate_contributions_quantile(
+        self,
+        dataset: ForecastInputDataset,
+        additional_features: ForecastInputDataset | None,
+        model_index: int,
+    ) -> pd.DataFrame:
+        input_data = self._prepare_input_data(
+            dataset=dataset,
+            additional_features=additional_features,
+        )
+        weights = self._predict_model_weights_quantile(base_predictions=input_data, model_index=model_index)
+        weights.columns = [f"{col}_{self.quantiles[model_index].format()}" for col in weights.columns]
+        return weights
+
+    @property
+    @override
+    def is_fitted(self) -> bool:
+        """Indicates whether the combiner has been fitted."""
+        return self._is_fitted
+
+
+__all__ = [
+    "LGBMCombinerHyperParams",
+    "LogisticCombinerHyperParams",
+    "RFCombinerHyperParams",
+    "WeightsCombiner",
+    "XGBCombinerHyperParams",
+]
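A hypothetical end-to-end sketch of the combiner above; ensemble_dataset stands in for an EnsembleForecastDataset built from the base forecasters' predictions and is not constructed here:

from openstef_core.types import LeadTime, Quantile
from openstef_meta.models.forecast_combiners.learned_weights_combiner import WeightsCombiner

config = WeightsCombiner.Config(
    hyperparams=WeightsCombiner.LGBMHyperParams(n_estimators=50),
    quantiles=[Quantile(0.5)],
    horizons=[LeadTime.from_string("PT36H")],
    hard_selection=False,  # soft selection: probabilities become combination weights
)
combiner = WeightsCombiner(config=config)
# combiner.fit(data=ensemble_dataset)
# forecast = combiner.predict(data=ensemble_dataset)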
"RFCombinerHyperParams", + "WeightsCombiner", + "XGBCombinerHyperParams", +] diff --git a/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/rules_combiner.py b/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/rules_combiner.py new file mode 100644 index 000000000..93a12744f --- /dev/null +++ b/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/rules_combiner.py @@ -0,0 +1,174 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 +"""Rules-based Meta Forecaster Module.""" + +import logging +from typing import cast, override + +import pandas as pd +from pydantic import Field, field_validator + +from openstef_core.datasets import ForecastDataset, ForecastInputDataset +from openstef_core.mixins import HyperParams +from openstef_core.types import LeadTime, Quantile +from openstef_meta.models.forecast_combiners.forecast_combiner import ForecastCombiner, ForecastCombinerConfig +from openstef_meta.utils.datasets import EnsembleForecastDataset +from openstef_meta.utils.decision_tree import Decision, DecisionTree + +logger = logging.getLogger(__name__) + + +class RulesLearnerHyperParams(HyperParams): + """HyperParams for Stacking Final Learner.""" + + decision_tree: DecisionTree = Field( + description="Decision tree defining the rules for the final learner.", + default=DecisionTree( + nodes=[Decision(idx=0, decision="LGBMForecaster")], + outcomes={"LGBMForecaster"}, + ), + ) + + +class RulesCombinerConfig(ForecastCombinerConfig): + """Configuration for Rules-based Forecast Combiner.""" + + hyperparams: HyperParams = Field( + description="Hyperparameters for the Rules-based final learner.", + default=RulesLearnerHyperParams(), + ) + + quantiles: list[Quantile] = Field( + default=[Quantile(0.5)], + description=( + "Probability levels for uncertainty estimation. Each quantile represents a confidence level " + "(e.g., 0.1 = 10th percentile, 0.5 = median, 0.9 = 90th percentile). " + "Models must generate predictions for all specified quantiles." + ), + min_length=1, + ) + + horizons: list[LeadTime] = Field( + default=..., + description=( + "Lead times for predictions, accounting for data availability and versioning cutoffs. " + "Each horizon defines how far ahead the model should predict." + ), + min_length=1, + ) + + @field_validator("hyperparams", mode="after") + @staticmethod + def _validate_hyperparams(v: HyperParams) -> HyperParams: + if not isinstance(v, RulesLearnerHyperParams): + raise TypeError("hyperparams must be an instance of RulesLearnerHyperParams.") + return v + + +class RulesCombiner(ForecastCombiner): + """Combines base Forecaster predictions per quantile into final predictions using hard-coded rules.""" + + Config = RulesCombinerConfig + + def __init__(self, config: RulesCombinerConfig) -> None: + """Initialize the Rules Learner. + + Args: + config: Configuration for the Rules Combiner. 
+ """ + hyperparams = cast(RulesLearnerHyperParams, config.hyperparams) + self.tree = hyperparams.decision_tree + self.quantiles = config.quantiles + self.config = config + + @override + def fit( + self, + data: EnsembleForecastDataset, + data_val: EnsembleForecastDataset | None = None, + additional_features: ForecastInputDataset | None = None, + ) -> None: + # No fitting needed for rule-based final learner + # Check that additional features are provided + if additional_features is None: + raise ValueError("Additional features must be provided for RulesForecastCombiner prediction.") + + def _predict_tree(self, data: pd.DataFrame, columns: pd.Index) -> pd.DataFrame: + """Predict using the decision tree rules. + + Args: + data: DataFrame containing the additional features. + columns: Expected columns for the output DataFrame. + + Returns: + DataFrame with predictions for each quantile. + """ + predictions = data.apply(self.tree.get_decision, axis=1) + + return pd.get_dummies(predictions).reindex(columns=columns) + + @override + def predict( + self, + data: EnsembleForecastDataset, + additional_features: ForecastInputDataset | None = None, + ) -> ForecastDataset: + if additional_features is None: + raise ValueError("Additional features must be provided for RulesForecastCombiner prediction.") + + decisions = self._predict_tree( + additional_features.data, columns=data.select_quantile(quantile=self.quantiles[0]).data.columns + ) + + # Generate predictions + predictions: list[pd.DataFrame] = [] + for q in self.quantiles: + dataset = data.select_quantile(quantile=q) + preds = dataset.input_data().multiply(decisions).sum(axis=1) + + predictions.append(preds.to_frame(name=Quantile(q).format())) + + # Concatenate predictions along columns to form a DataFrame with quantile columns + df = pd.concat(predictions, axis=1) + + return ForecastDataset( + data=df, + sample_interval=data.sample_interval, + ) + + @override + def predict_contributions( + self, + data: EnsembleForecastDataset, + additional_features: ForecastInputDataset | None = None, + ) -> pd.DataFrame: + if additional_features is None: + raise ValueError("Additional features must be provided for RulesForecastCombiner prediction.") + + decisions = self._predict_tree( + additional_features.data, columns=data.select_quantile(quantile=self.quantiles[0]).data.columns + ) + + # Generate predictions + predictions: list[pd.DataFrame] = [] + for q in self.quantiles: + dataset = data.select_quantile(quantile=q) + preds = dataset.input_data().multiply(decisions).sum(axis=1) + + predictions.append(preds.to_frame(name=Quantile(q).format())) + + # Concatenate predictions along columns to form a DataFrame with quantile columns + return pd.concat(predictions, axis=1) + + @property + def is_fitted(self) -> bool: + """Check the Rules Final Learner is fitted.""" + return True + + +__all__ = [ + "RulesCombiner", + "RulesCombinerConfig", + "RulesLearnerHyperParams", +] diff --git a/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/stacking_combiner.py b/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/stacking_combiner.py new file mode 100644 index 000000000..d59811453 --- /dev/null +++ b/packages/openstef-meta/src/openstef_meta/models/forecast_combiners/stacking_combiner.py @@ -0,0 +1,245 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 +"""Stacking Forecast Combiner. 
+ +This module implements a Stacking Combiner that integrates predictions from multiple base Forecasters. +It uses a regression approach to combine the predictions for each quantile into final forecasts. +""" + +import logging +from functools import partial +from typing import TYPE_CHECKING, cast, override + +import pandas as pd +from pydantic import Field, field_validator + +from openstef_core.datasets import ForecastDataset, ForecastInputDataset +from openstef_core.exceptions import ( + NotFittedError, +) +from openstef_core.mixins import HyperParams +from openstef_core.types import LeadTime, Quantile +from openstef_meta.models.forecast_combiners.forecast_combiner import ForecastCombiner, ForecastCombinerConfig +from openstef_meta.utils.datasets import EnsembleForecastDataset +from openstef_models.explainability.mixins import ExplainableForecaster +from openstef_models.models.forecasting.gblinear_forecaster import ( + GBLinearForecaster, + GBLinearHyperParams, +) +from openstef_models.models.forecasting.lgbm_forecaster import LGBMForecaster, LGBMHyperParams + +if TYPE_CHECKING: + from openstef_models.models.forecasting.forecaster import Forecaster + +logger = logging.getLogger(__name__) + +ForecasterHyperParams = GBLinearHyperParams | LGBMHyperParams +ForecasterType = GBLinearForecaster | LGBMForecaster + + +class StackingCombinerConfig(ForecastCombinerConfig): + """Configuration for the Stacking final learner.""" + + hyperparams: HyperParams = Field( + description="Hyperparameters for the Stacking Combiner.", + ) + + quantiles: list[Quantile] = Field( + default=[Quantile(0.5)], + description=( + "Probability levels for uncertainty estimation. Each quantile represents a confidence level " + "(e.g., 0.1 = 10th percentile, 0.5 = median, 0.9 = 90th percentile). " + "Models must generate predictions for all specified quantiles." + ), + min_length=1, + ) + + horizons: list[LeadTime] = Field( + default=..., + description=( + "Lead times for predictions, accounting for data availability and versioning cutoffs. " + "Each horizon defines how far ahead the model should predict." + ), + min_length=1, + ) + + @field_validator("hyperparams", mode="after") + @staticmethod + def validate_forecaster( + v: HyperParams, + ) -> HyperParams: + """Validate that the forecaster class is set in the hyperparameters. + + Args: + v: Hyperparameters to validate. + + Returns: + Validated hyperparameters. + + Raises: + ValueError: If the forecaster class is not set. + """ + if not hasattr(v, "forecaster_class"): + raise ValueError("forecaster_class must be set in hyperparameters for StackingCombinerConfig.") + return v + + +class StackingCombiner(ForecastCombiner): + """Combines base Forecaster predictions per quantile into final predictions using a regression approach.""" + + Config = StackingCombinerConfig + LGBMHyperParams = LGBMHyperParams + GBLinearHyperParams = GBLinearHyperParams + + def __init__( + self, + config: StackingCombinerConfig, + ) -> None: + """Initialize the Stacking final learner. + + Args: + config: Configuration for the Stacking combiner. 
+ """ + forecaster_hyperparams = cast(ForecasterHyperParams, config.hyperparams) + self.quantiles = config.quantiles + self.config = config + self.hyperparams = forecaster_hyperparams + self._is_fitted: bool = False + + # Split forecaster per quantile + models: list[Forecaster] = [] + for q in self.quantiles: + forecaster_cls = forecaster_hyperparams.forecaster_class() + forecaster_config = forecaster_cls.Config( + horizons=[config.max_horizon], + quantiles=[q], + ) + if "hyperparams" in forecaster_cls.Config.model_fields: + forecaster_config = forecaster_config.model_copy(update={"hyperparams": forecaster_hyperparams}) + + model = forecaster_config.forecaster_from_config() + models.append(model) + self.models = models + + @staticmethod + def _combine_datasets( + data: ForecastInputDataset, additional_features: ForecastInputDataset + ) -> ForecastInputDataset: + """Combine base Forecaster predictions with additional features for final learner input. + + Args: + data: ForecastInputDataset containing base Forecaster predictions. + additional_features: ForecastInputDataset containing additional features. + + Returns: + ForecastInputDataset with combined features. + """ + additional_df = additional_features.data.loc[ + :, [col for col in additional_features.data.columns if col not in data.data.columns] + ] + # Merge on index to combine datasets + combined_df = data.data.join(additional_df) + + return ForecastInputDataset( + data=combined_df, + sample_interval=data.sample_interval, + forecast_start=data.forecast_start, + ) + + @override + def fit( + self, + data: EnsembleForecastDataset, + data_val: EnsembleForecastDataset | None = None, + additional_features: ForecastInputDataset | None = None, + ) -> None: + + for i, q in enumerate(self.quantiles): + if additional_features is not None: + dataset = data.select_quantile(quantile=q) + input_data = self._combine_datasets( + data=dataset, + additional_features=additional_features, + ) + else: + input_data = data.select_quantile(quantile=q) + + # Prepare input data by dropping rows with NaN target values + target_dropna = partial(pd.DataFrame.dropna, subset=[input_data.target_column]) # pyright: ignore[reportUnknownMemberType] + input_data = input_data.pipe_pandas(target_dropna) + + self.models[i].fit(data=input_data, data_val=None) + + @override + def predict( + self, + data: EnsembleForecastDataset, + additional_features: ForecastInputDataset | None = None, + ) -> ForecastDataset: + if not self.is_fitted: + raise NotFittedError(self.__class__.__name__) + + # Generate predictions + predictions: list[pd.DataFrame] = [] + for i, q in enumerate(self.quantiles): + if additional_features is not None: + input_data = self._combine_datasets( + data=data.select_quantile(quantile=q), + additional_features=additional_features, + ) + else: + input_data = data.select_quantile(quantile=q) + + if isinstance(self.models[i], GBLinearForecaster): + feature_cols = [x for x in input_data.data.columns if x != data.target_column] + feature_dropna = partial(pd.DataFrame.dropna, subset=feature_cols) # pyright: ignore[reportUnknownMemberType] + input_data = input_data.pipe_pandas(feature_dropna) + + p = self.models[i].predict(data=input_data).data + predictions.append(p) + + # Concatenate predictions along columns to form a DataFrame with quantile columns + df = pd.concat(predictions, axis=1) + + return ForecastDataset( + data=df, + sample_interval=data.sample_interval, + ) + + @override + def predict_contributions( + self, + data: EnsembleForecastDataset, + 
additional_features: ForecastInputDataset | None = None,
+    ) -> pd.DataFrame:
+
+        predictions: list[pd.DataFrame] = []
+        for i, q in enumerate(self.quantiles):
+            if additional_features is not None:
+                input_data = self._combine_datasets(
+                    data=data.select_quantile(quantile=q),
+                    additional_features=additional_features,
+                )
+            else:
+                input_data = data.select_quantile(quantile=q)
+            model = self.models[i]
+            if not isinstance(model, ExplainableForecaster):
+                raise NotImplementedError(
+                    "Predicting contributions is only supported for ExplainableForecaster models."
+                )
+            p = model.predict_contributions(data=input_data, scale=True)
+            predictions.append(p)
+
+        contributions = pd.concat(predictions, axis=1)
+
+        target_series = data.target_series
+        if target_series is not None:
+            contributions[data.target_column] = target_series
+
+        return contributions
+
+    @property
+    def is_fitted(self) -> bool:
+        """Check whether the StackingCombiner is fitted."""
+        return all(x.is_fitted for x in self.models)
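A hypothetical construction sketch for the stacking variant, with hyperparameter arguments borrowed from their use elsewhere in this PR; note that __init__ above builds one meta-regressor per quantile:

from openstef_core.types import LeadTime, Quantile
from openstef_meta.models.forecast_combiners.stacking_combiner import StackingCombiner

config = StackingCombiner.Config(
    hyperparams=StackingCombiner.GBLinearHyperParams(reg_alpha=0.0, reg_lambda=0.0),
    quantiles=[Quantile(0.1), Quantile(0.5), Quantile(0.9)],
    horizons=[LeadTime.from_string("PT36H")],
)
combiner = StackingCombiner(config=config)  # one GBLinear regressor per quantile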
+""" + +import logging +from typing import override + +import pandas as pd +from pydantic import Field, model_validator + +from openstef_core.datasets import ForecastDataset, ForecastInputDataset +from openstef_core.exceptions import ( + NotFittedError, +) +from openstef_core.mixins import HyperParams +from openstef_core.types import Quantile +from openstef_models.models.forecasting.forecaster import ( + Forecaster, + ForecasterConfig, +) +from openstef_models.models.forecasting.gblinear_forecaster import ( + GBLinearForecaster, + GBLinearHyperParams, +) +from openstef_models.models.forecasting.lgbm_forecaster import LGBMForecaster, LGBMHyperParams +from openstef_models.models.forecasting.lgbmlinear_forecaster import LGBMLinearForecaster, LGBMLinearHyperParams +from openstef_models.models.forecasting.xgboost_forecaster import XGBoostForecaster, XGBoostHyperParams + +logger = logging.getLogger(__name__) + +ResidualBaseForecaster = LGBMForecaster | LGBMLinearForecaster | XGBoostForecaster | GBLinearForecaster +ResidualBaseForecasterHyperParams = LGBMHyperParams | LGBMLinearHyperParams | XGBoostHyperParams | GBLinearHyperParams + + +class ResidualHyperParams(HyperParams): + """Hyperparameters for Stacked LGBM GBLinear Regressor.""" + + primary_hyperparams: ResidualBaseForecasterHyperParams = Field( + default=GBLinearHyperParams(), + description="Primary model hyperparams. Defaults to GBLinearHyperParams.", + ) + + secondary_hyperparams: ResidualBaseForecasterHyperParams = Field( + default=LGBMHyperParams(), + description="Hyperparameters for the final learner. Defaults to LGBMHyperparams.", + ) + + primary_name: str = Field( + default="primary_model", + description="Name identifier for the primary model.", + ) + + secondary_name: str = Field( + default="secondary_model", + description="Name identifier for the secondary model.", + ) + + @model_validator(mode="after") + def validate_names(self) -> "ResidualHyperParams": + """Validate that primary and secondary names are not the same. + + Raises: + ValueError: If primary and secondary names are the same. + + Returns: + ResidualHyperParams: The validated hyperparameters. + """ + if self.primary_name == self.secondary_name: + raise ValueError("Primary and secondary model names must be different.") + return self + + +class ResidualForecasterConfig(ForecasterConfig): + """Configuration for Hybrid-based forecasting models.""" + + hyperparams: ResidualHyperParams = ResidualHyperParams() + + verbosity: bool = Field( + default=True, + description="Enable verbose output from the Hybrid model (True/False).", + ) + + +class ResidualForecaster(Forecaster): + """MetaForecaster that implements residual modeling. + + It takes in a primary forecaster and a residual forecaster. The primary forecaster makes initial predictions, + and the residual forecaster models the residuals (errors) of the primary forecaster to improve overall accuracy. 
+
+
+class ResidualForecaster(Forecaster):
+    """MetaForecaster that implements residual modeling.
+
+    It takes in a primary forecaster and a residual forecaster. The primary forecaster makes initial predictions,
+    and the residual forecaster models the residuals (errors) of the primary forecaster to improve overall accuracy.
+    """
+
+    Config = ResidualForecasterConfig
+    HyperParams = ResidualHyperParams
+
+    def __init__(self, config: ResidualForecasterConfig) -> None:
+        """Initialize the Residual forecaster."""
+        self._config = config
+
+        self._primary_model: ResidualBaseForecaster = self._init_base_learners(
+            config=config, base_hyperparams=[config.hyperparams.primary_hyperparams]
+        )[0]
+
+        self._secondary_model: list[ResidualBaseForecaster] = self._init_secondary_model(
+            hyperparams=config.hyperparams.secondary_hyperparams
+        )
+        self.primary_name = config.hyperparams.primary_name
+        self.secondary_name = config.hyperparams.secondary_name
+        self._is_fitted = False
+
+    def _init_secondary_model(self, hyperparams: ResidualBaseForecasterHyperParams) -> list[ResidualBaseForecaster]:
+        """Initialize secondary models for residual forecasting.
+
+        Returns:
+            list[Forecaster]: List containing the initialized secondary model forecasters.
+        """
+        models: list[ResidualBaseForecaster] = []
+        # Different datasets per quantile, so we need a model per quantile
+        for q in self.config.quantiles:
+            config = self._config.model_copy(update={"quantiles": [q]})
+            secondary_model = self._init_base_learners(config=config, base_hyperparams=[hyperparams])[0]
+            models.append(secondary_model)
+
+        return models
+
+    @staticmethod
+    def _init_base_learners(
+        config: ForecasterConfig, base_hyperparams: list[ResidualBaseForecasterHyperParams]
+    ) -> list[ResidualBaseForecaster]:
+        """Initialize base forecasters based on the provided hyperparameters.
+
+        Returns:
+            list[Forecaster]: List of initialized base forecasters.
+        """
+        base_learners: list[ResidualBaseForecaster] = []
+        horizons = config.horizons
+        quantiles = config.quantiles
+
+        for hyperparams in base_hyperparams:
+            forecaster_cls = hyperparams.forecaster_class()
+            config = forecaster_cls.Config(horizons=horizons, quantiles=quantiles)
+            if "hyperparams" in forecaster_cls.Config.model_fields:
+                config = config.model_copy(update={"hyperparams": hyperparams})
+
+            base_learners.append(config.forecaster_from_config())
+
+        return base_learners
+
+    @override
+    def fit(self, data: ForecastInputDataset, data_val: ForecastInputDataset | None = None) -> None:
+        """Fit the Residual model to the training data.
+
+        Args:
+            data: Training data in the expected ForecastInputDataset format.
+            data_val: Optional validation data. It is forwarded to the primary model and used to build
+                validation residuals for fitting the secondary models.
+        """
+        # Fit primary model
+        self._primary_model.fit(data=data, data_val=data_val)
+
+        # Reset forecast start date to ensure we fit on the full training set
+        full_dataset = ForecastInputDataset(
+            data=data.data,
+            sample_interval=data.sample_interval,
+            target_column=data.target_column,
+            forecast_start=data.index[0],
+        )
+
+        secondary_input = self._prepare_secondary_input(
+            quantiles=self.config.quantiles,
+            base_predictions=self._primary_model.predict(data=full_dataset),
+            data=data,
+        )
+        # Predict primary model on validation data if provided
+        if data_val is not None:
+            full_val_dataset = ForecastInputDataset(
+                data=data_val.data,
+                sample_interval=data_val.sample_interval,
+                target_column=data_val.target_column,
+                forecast_start=data_val.index[0],
+            )
+
+            secondary_val_input = self._prepare_secondary_input(
+                quantiles=self.config.quantiles,
+                base_predictions=self._primary_model.predict(data=full_val_dataset),
+                data=data_val,
+            )
+            # Fit secondary models on residuals
+            for i, q in enumerate(secondary_input):
+                self._secondary_model[i].fit(data=secondary_input[q], data_val=secondary_val_input[q])
+        else:
+            # Fit secondary models on residuals
+            for i, q in enumerate(secondary_input):
+                self._secondary_model[i].fit(data=secondary_input[q], data_val=None)
+
+        self._is_fitted = True
+
+    @property
+    @override
+    def is_fitted(self) -> bool:
+        """Check whether the ResidualForecaster is fitted."""
+        return self._is_fitted
+
+    @staticmethod
+    def _prepare_secondary_input(
+        quantiles: list[Quantile],
+        base_predictions: ForecastDataset,
+        data: ForecastInputDataset,
+    ) -> dict[Quantile, ForecastInputDataset]:
+        """Adjust target series to be residuals for secondary model training.
+
+        Args:
+            quantiles: List of quantiles to prepare data for.
+            base_predictions: Predictions from the primary model.
+            data: Original input data.
+
+        Returns:
+            dict[Quantile, ForecastInputDataset]: Prepared datasets for each quantile.
+        """
+        predictions_quantiles: dict[Quantile, ForecastInputDataset] = {}
+        sample_interval = data.sample_interval
+        for q in quantiles:
+            predictions = base_predictions.data[q.format()]
+            df = data.data.copy()
+            df[data.target_column] = data.target_series - predictions
+            predictions_quantiles[q] = ForecastInputDataset(
+                data=df,
+                sample_interval=sample_interval,
+                target_column=data.target_column,
+                forecast_start=df.index[0],
+            )
+
+        return predictions_quantiles
+
+    def _predict_secondary_model(self, data: ForecastInputDataset) -> ForecastDataset:
+        predictions: dict[str, pd.Series] = {}
+        for model in self._secondary_model:
+            pred = model.predict(data=data)
+            q = model.config.quantiles[0].format()
+            predictions[q] = pred.data[q]
+
+        return ForecastDataset(
+            data=pd.DataFrame(predictions),
+            sample_interval=data.sample_interval,
+        )
+
+    def predict(self, data: ForecastInputDataset) -> ForecastDataset:
+        """Generate predictions using the ResidualForecaster model.
+
+        Args:
+            data: Input data for prediction.
+
+        Returns:
+            ForecastDataset containing the predictions.
+
+        Raises:
+            NotFittedError: If the ResidualForecaster instance is not fitted yet.
+        """
+        if not self.is_fitted:
+            raise NotFittedError("The ResidualForecaster instance is not fitted yet. Call 'fit' first.")
+
+        primary_predictions = self._primary_model.predict(data=data).data
+
+        secondary_predictions = self._predict_secondary_model(data=data).data
+
+        final_predictions = primary_predictions + secondary_predictions
+
+        return ForecastDataset(
+            data=final_predictions,
+            sample_interval=data.sample_interval,
+        )
+
+    def predict_contributions(self, data: ForecastInputDataset, *, scale: bool = True) -> pd.DataFrame:
+        """Generate prediction contributions using the ResidualForecaster model.
+
+        Args:
+            data: Input data for prediction contributions.
+            scale: Whether to scale contributions to sum to 1. Defaults to True.
+
+        Returns:
+            pd.DataFrame containing the prediction contributions.
+        """
+        primary_predictions = self._primary_model.predict(data=data).data
+
+        secondary_predictions = self._predict_secondary_model(data=data).data
+
+        if not scale:
+            primary_contributions = primary_predictions
+            primary_name = self._primary_model.__class__.__name__
+            primary_contributions.columns = [f"{primary_name}_{q}" for q in primary_contributions.columns]
+
+            secondary_contributions = secondary_predictions
+            secondary_name = self._secondary_model[0].__class__.__name__
+            secondary_contributions.columns = [f"{secondary_name}_{q}" for q in secondary_contributions.columns]
+
+            return pd.concat([primary_contributions, secondary_contributions], axis=1)
+
+        primary_contributions = primary_predictions.abs() / (primary_predictions.abs() + secondary_predictions.abs())
+        primary_contributions.columns = [f"{self.primary_name}_{q}" for q in primary_contributions.columns]
+
+        secondary_contributions = secondary_predictions.abs() / (
+            primary_predictions.abs() + secondary_predictions.abs()
+        )
+        secondary_contributions.columns = [f"{self.secondary_name}_{q}" for q in secondary_contributions.columns]
+
+        return pd.concat([primary_contributions, secondary_contributions], axis=1)
+
+    @property
+    def config(self) -> ResidualForecasterConfig:
+        """Get the configuration of the ResidualForecaster.
+
+        Returns:
+            ResidualForecasterConfig: The configuration of the forecaster.
+        """
+        return self._config
+
+
+__all__ = ["ResidualForecaster", "ResidualForecasterConfig", "ResidualHyperParams"]
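A hypothetical usage sketch (the dataset objects are elided); the default hyperparameters pair a GBLinear primary model with an LGBM secondary model, and `horizons`/`quantiles` are assumed to be the required config fields, as in the other forecaster configs in this PR:

from openstef_core.types import LeadTime, Quantile
from openstef_meta.models.forecasting.residual_forecaster import (
    ResidualForecaster,
    ResidualForecasterConfig,
)

config = ResidualForecasterConfig(
    horizons=[LeadTime.from_string("PT36H")],
    quantiles=[Quantile(0.5)],
)
forecaster = ResidualForecaster(config=config)
# forecaster.fit(data=train_dataset, data_val=val_dataset)
# forecast = forecaster.predict(data=test_dataset)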
diff --git a/packages/openstef-meta/src/openstef_meta/presets/__init__.py b/packages/openstef-meta/src/openstef_meta/presets/__init__.py
new file mode 100644
index 000000000..ad62320c2
--- /dev/null
+++ b/packages/openstef-meta/src/openstef_meta/presets/__init__.py
@@ -0,0 +1,9 @@
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project
+#
+# SPDX-License-Identifier: MPL-2.0
+
+"""Package for preset forecasting workflows."""
+
+from .forecasting_workflow import EnsembleForecastingModel, EnsembleWorkflowConfig, create_ensemble_workflow
+
+__all__ = ["EnsembleForecastingModel", "EnsembleWorkflowConfig", "create_ensemble_workflow"]
diff --git a/packages/openstef-meta/src/openstef_meta/presets/forecasting_workflow.py b/packages/openstef-meta/src/openstef_meta/presets/forecasting_workflow.py
new file mode 100644
index 000000000..52568b3a1
--- /dev/null
+++ b/packages/openstef-meta/src/openstef_meta/presets/forecasting_workflow.py
@@ -0,0 +1,515 @@
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project
+#
+# SPDX-License-Identifier: MPL-2.0
+
+"""Ensemble forecasting workflow preset.
+
+Mimics OpenSTEF-models forecasting workflow with ensemble capabilities.
+""" + +from collections.abc import Sequence +from datetime import timedelta +from typing import TYPE_CHECKING, Literal, cast + +from pydantic import Field + +from openstef_beam.evaluation.metric_providers import ( + MetricDirection, + MetricProvider, + ObservedProbabilityProvider, + R2Provider, +) +from openstef_core.base_model import BaseConfig +from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset +from openstef_core.mixins.transform import Transform, TransformPipeline +from openstef_core.types import LeadTime, Q, Quantile, QuantileOrGlobal +from openstef_meta.models.ensemble_forecasting_model import EnsembleForecastingModel +from openstef_meta.models.forecast_combiners.learned_weights_combiner import WeightsCombiner +from openstef_meta.models.forecast_combiners.rules_combiner import RulesCombiner +from openstef_meta.models.forecast_combiners.stacking_combiner import StackingCombiner +from openstef_meta.models.forecasting.residual_forecaster import ResidualForecaster +from openstef_models.integrations.mlflow import MLFlowStorage +from openstef_models.mixins.model_serializer import ModelIdentifier +from openstef_models.models.forecasting.gblinear_forecaster import GBLinearForecaster +from openstef_models.models.forecasting.lgbm_forecaster import LGBMForecaster +from openstef_models.models.forecasting.lgbmlinear_forecaster import LGBMLinearForecaster +from openstef_models.models.forecasting.xgboost_forecaster import XGBoostForecaster +from openstef_models.presets.forecasting_workflow import LocationConfig +from openstef_models.transforms.energy_domain import WindPowerFeatureAdder +from openstef_models.transforms.general import Clipper, EmptyFeatureRemover, SampleWeighter, Scaler +from openstef_models.transforms.general.imputer import Imputer +from openstef_models.transforms.general.nan_dropper import NaNDropper +from openstef_models.transforms.general.selector import Selector +from openstef_models.transforms.postprocessing import QuantileSorter +from openstef_models.transforms.time_domain import ( + CyclicFeaturesAdder, + DatetimeFeaturesAdder, + HolidayFeatureAdder, + RollingAggregatesAdder, +) +from openstef_models.transforms.time_domain.lags_adder import LagsAdder +from openstef_models.transforms.time_domain.rolling_aggregates_adder import AggregationFunction +from openstef_models.transforms.validation import CompletenessChecker, FlatlineChecker, InputConsistencyChecker +from openstef_models.transforms.weather_domain import ( + AtmosphereDerivedFeaturesAdder, + DaylightFeatureAdder, + RadiationDerivedFeaturesAdder, +) +from openstef_models.utils.data_split import DataSplitter +from openstef_models.utils.feature_selection import Exclude, FeatureSelection, Include +from openstef_models.workflows.custom_forecasting_workflow import CustomForecastingWorkflow, ForecastingCallback + +if TYPE_CHECKING: + from openstef_models.models.forecasting.forecaster import Forecaster + + +class EnsembleWorkflowConfig(BaseConfig): + """Configuration for ensemble forecasting workflows.""" + + model_id: ModelIdentifier + + # Ensemble configuration + ensemble_type: Literal["learned_weights", "stacking", "rules"] = Field(default="learned_weights") + base_models: Sequence[Literal["lgbm", "gblinear", "xgboost", "lgbm_linear"]] = Field(default=["lgbm", "gblinear"]) + combiner_model: Literal["lgbm", "rf", "xgboost", "logistic", "gblinear"] = Field(default="lgbm") + + # Forecast configuration + quantiles: list[Quantile] = Field( + default=[Q(0.5)], + description="List of quantiles to predict for 
probabilistic forecasting.",
+    )
+
+    sample_interval: timedelta = Field(
+        default=timedelta(minutes=15),
+        description="Time interval between consecutive data samples.",
+    )
+    horizons: list[LeadTime] = Field(
+        default=[LeadTime.from_string("PT48H")],
+        description="List of forecast horizons to predict.",
+    )
+
+    location: LocationConfig = Field(
+        default=LocationConfig(),
+        description="Location information for the forecasting workflow.",
+    )
+
+    # Forecaster hyperparameters
+    xgboost_hyperparams: XGBoostForecaster.HyperParams = Field(
+        default=XGBoostForecaster.HyperParams(),
+        description="Hyperparameters for XGBoost forecaster.",
+    )
+    gblinear_hyperparams: GBLinearForecaster.HyperParams = Field(
+        default=GBLinearForecaster.HyperParams(),
+        description="Hyperparameters for GBLinear forecaster.",
+    )
+
+    lgbm_hyperparams: LGBMForecaster.HyperParams = Field(
+        default=LGBMForecaster.HyperParams(),
+        description="Hyperparameters for LightGBM forecaster.",
+    )
+
+    lgbmlinear_hyperparams: LGBMLinearForecaster.HyperParams = Field(
+        default=LGBMLinearForecaster.HyperParams(),
+        description="Hyperparameters for LightGBM Linear forecaster.",
+    )
+
+    residual_hyperparams: ResidualForecaster.HyperParams = Field(
+        default=ResidualForecaster.HyperParams(),
+        description="Hyperparameters for Residual forecaster.",
+    )
+
+    # Data properties
+    target_column: str = Field(default="load", description="Name of the target variable column in datasets.")
+    energy_price_column: str = Field(
+        default="day_ahead_electricity_price",
+        description="Name of the energy price column in datasets.",
+    )
+    radiation_column: str = Field(default="radiation", description="Name of the radiation column in datasets.")
+    wind_speed_column: str = Field(default="windspeed", description="Name of the wind speed column in datasets.")
+    pressure_column: str = Field(default="pressure", description="Name of the pressure column in datasets.")
+    temperature_column: str = Field(default="temperature", description="Name of the temperature column in datasets.")
+    relative_humidity_column: str = Field(
+        default="relative_humidity",
+        description="Name of the relative humidity column in datasets.",
+    )
+    predict_history: timedelta = Field(
+        default=timedelta(days=14),
+        description="Amount of historical data available at prediction time.",
+    )
+    cutoff_history: timedelta = Field(
+        default=timedelta(days=0),
+        description="Amount of historical data to exclude from training and prediction due to incomplete features "
+        "from lag-based preprocessing. When using lag transforms (e.g., lag-14), the first N days contain NaN values. "
+        "Set this to match your maximum lag duration (e.g., timedelta(days=14)). "
+        "Default of 0 assumes no invalid rows are created by preprocessing. "
+        "Note: should be same as predict_history if you are using lags.
We default to disabled to keep the same " + "behaviour as openstef 3.0.", + ) + + # Feature engineering and validation + completeness_threshold: float = Field( + default=0.5, + description="Minimum fraction of data that should be available for making a regular forecast.", + ) + flatliner_threshold: timedelta = Field( + default=timedelta(hours=24), + description="Number of minutes that the load has to be constant to detect a flatliner.", + ) + detect_non_zero_flatliner: bool = Field( + default=False, + description="If True, flatliners are also detected on non-zero values (median of the load).", + ) + rolling_aggregate_features: list[AggregationFunction] = Field( + default=[], + description="If not None, rolling aggregate(s) of load will be used as features in the model.", + ) + clip_features: FeatureSelection = Field( + default=FeatureSelection(include=None, exclude=None), + description="Feature selection for which features to clip.", + ) + sample_weight_scale_percentile: int = Field( + default=95, + description="Percentile of target values used as scaling reference. " + "Values are normalized relative to this percentile before weighting.", + ) + forecaster_sample_weight_exponent: dict[str, float] = Field( + default={"gblinear": 1.0, "lgbm": 0, "xgboost": 0, "lgbm_linear": 0}, + description="Exponent applied to scale the sample weights. " + "0=uniform weights, 1=linear scaling, >1=stronger emphasis on high values. " + "Note: Defaults to 1.0 for gblinear congestion models.", + ) + + forecast_combiner_sample_weight_exponent: float = Field( + default=0, + description="Exponent applied to scale the sample weights for the forecast combiner model. " + "0=uniform weights, 1=linear scaling, >1=stronger emphasis on high values.", + ) + + sample_weight_floor: float = Field( + default=0.1, + description="Minimum weight value to ensure all samples contribute to training.", + ) + + # Data splitting strategy + data_splitter: DataSplitter = Field( + default=DataSplitter( + # Copied from OpenSTEF3 pipeline defaults + val_fraction=0.15, + test_fraction=0.0, + stratification_fraction=0.15, + min_days_for_stratification=4, + ), + description="Configuration for splitting data into training, validation, and test sets.", + ) + + # Evaluation + evaluation_metrics: list[MetricProvider] = Field( + default_factory=lambda: [R2Provider(), ObservedProbabilityProvider()], + description="List of metric providers for evaluating model score.", + ) + + # Callbacks + mlflow_storage: MLFlowStorage | None = Field( + default_factory=MLFlowStorage, + description="Configuration for MLflow experiment tracking and model storage.", + ) + + model_reuse_enable: bool = Field( + default=True, + description="Whether to enable reuse of previously trained models.", + ) + model_reuse_max_age: timedelta = Field( + default=timedelta(days=7), + description="Maximum age of a model to be considered for reuse.", + ) + + model_selection_enable: bool = Field( + default=True, + description="Whether to enable automatic model selection based on performance.", + ) + model_selection_metric: tuple[QuantileOrGlobal, str, MetricDirection] = Field( + default=(Q(0.5), "R2", "higher_is_better"), + description="Metric to monitor for model performance when retraining.", + ) + model_selection_old_model_penalty: float = Field( + default=1.2, + description="Penalty to apply to the old model's metric to bias selection towards newer models.", + ) + + verbosity: Literal[0, 1, 2, 3, True] = Field( + default=0, description="Verbosity level. 
0=silent, 1=warning, 2=info, 3=debug" + ) + + # Metadata + tags: dict[str, str] = Field( + default_factory=dict, + description="Optional metadata tags for the model.", + ) + + +# Build preprocessing components +def checks(config: EnsembleWorkflowConfig) -> list[Transform[TimeSeriesDataset, TimeSeriesDataset]]: + return [ + InputConsistencyChecker(), + FlatlineChecker( + load_column=config.target_column, + flatliner_threshold=config.flatliner_threshold, + detect_non_zero_flatliner=config.detect_non_zero_flatliner, + error_on_flatliner=False, + ), + CompletenessChecker(completeness_threshold=config.completeness_threshold), + ] + + +def feature_adders(config: EnsembleWorkflowConfig) -> list[Transform[TimeSeriesDataset, TimeSeriesDataset]]: + return [ + LagsAdder( + history_available=config.predict_history, + horizons=config.horizons, + add_trivial_lags=True, + target_column=config.target_column, + ), + WindPowerFeatureAdder( + windspeed_reference_column=config.wind_speed_column, + ), + AtmosphereDerivedFeaturesAdder( + pressure_column=config.pressure_column, + relative_humidity_column=config.relative_humidity_column, + temperature_column=config.temperature_column, + ), + RadiationDerivedFeaturesAdder( + coordinate=config.location.coordinate, + radiation_column=config.radiation_column, + ), + CyclicFeaturesAdder(), + DaylightFeatureAdder( + coordinate=config.location.coordinate, + ), + RollingAggregatesAdder( + feature=config.target_column, + aggregation_functions=config.rolling_aggregate_features, + horizons=config.horizons, + ), + ] + + +def feature_standardizers(config: EnsembleWorkflowConfig) -> list[Transform[TimeSeriesDataset, TimeSeriesDataset]]: + return cast( + list[Transform[TimeSeriesDataset, TimeSeriesDataset]], + [ + Clipper(selection=Include(config.energy_price_column).combine(config.clip_features), mode="standard"), + Scaler(selection=Exclude(config.target_column), method="standard"), + EmptyFeatureRemover(), + ], + ) + + +def create_ensemble_workflow(config: EnsembleWorkflowConfig) -> CustomForecastingWorkflow: # noqa: C901, PLR0912, PLR0915 + """Create an ensemble forecasting workflow from configuration. + + Args: + config (EnsembleWorkflowConfig): Configuration for the ensemble workflow. + + Returns: + CustomForecastingWorkflow: Configured ensemble forecasting workflow. + + Raises: + ValueError: If an unsupported base model or combiner type is specified. 
+ """ + # Common preprocessing + common_preprocessing = TransformPipeline( + transforms=[ + *checks(config), + *feature_adders(config), + HolidayFeatureAdder(country_code=config.location.country_code), + DatetimeFeaturesAdder(onehot_encode=False), + *feature_standardizers(config), + ] + ) + + # Build forecasters and their processing pipelines + forecaster_preprocessing: dict[str, list[Transform[TimeSeriesDataset, TimeSeriesDataset]]] = {} + forecasters: dict[str, Forecaster] = {} + for model_type in config.base_models: + if model_type == "lgbm": + forecasters[model_type] = LGBMForecaster( + config=LGBMForecaster.Config(quantiles=config.quantiles, horizons=config.horizons) + ) + forecaster_preprocessing[model_type] = [ + SampleWeighter( + target_column=config.target_column, + weight_exponent=config.forecaster_sample_weight_exponent[model_type], + weight_floor=config.sample_weight_floor, + weight_scale_percentile=config.sample_weight_scale_percentile, + ), + ] + + elif model_type == "gblinear": + forecasters[model_type] = GBLinearForecaster( + config=GBLinearForecaster.Config(quantiles=config.quantiles, horizons=config.horizons) + ) + forecaster_preprocessing[model_type] = [ + SampleWeighter( + target_column=config.target_column, + weight_exponent=config.forecaster_sample_weight_exponent[model_type], + weight_floor=config.sample_weight_floor, + weight_scale_percentile=config.sample_weight_scale_percentile, + ), + # Remove lags + Selector( + selection=FeatureSelection( + exclude=set( + LagsAdder( + history_available=config.predict_history, + horizons=config.horizons, + add_trivial_lags=True, + target_column=config.target_column, + ).features_added() + ).difference({"load_lag_P7D"}) + ) + ), + # Remove holiday features to avoid linear dependencies + Selector( + selection=FeatureSelection( + exclude=set(HolidayFeatureAdder(country_code=config.location.country_code).features_added()) + ) + ), + Selector( + selection=FeatureSelection(exclude=set(DatetimeFeaturesAdder(onehot_encode=False).features_added())) + ), + Imputer( + selection=Exclude(config.target_column), + imputation_strategy="mean", + fill_future_values=Include(config.energy_price_column), + ), + NaNDropper( + selection=Exclude(config.target_column), + ), + ] + elif model_type == "xgboost": + forecasters[model_type] = XGBoostForecaster( + config=XGBoostForecaster.Config(quantiles=config.quantiles, horizons=config.horizons) + ) + forecaster_preprocessing[model_type] = [ + SampleWeighter( + target_column=config.target_column, + weight_exponent=config.forecaster_sample_weight_exponent[model_type], + weight_floor=config.sample_weight_floor, + weight_scale_percentile=config.sample_weight_scale_percentile, + ), + ] + elif model_type == "lgbm_linear": + forecasters[model_type] = LGBMLinearForecaster( + config=LGBMLinearForecaster.Config(quantiles=config.quantiles, horizons=config.horizons) + ) + forecaster_preprocessing[model_type] = [ + SampleWeighter( + target_column=config.target_column, + weight_exponent=config.forecaster_sample_weight_exponent[model_type], + weight_floor=config.sample_weight_floor, + weight_scale_percentile=config.sample_weight_scale_percentile, + ), + ] + else: + msg = f"Unsupported base model type: {model_type}" + raise ValueError(msg) + + # Build combiner + # Case: Ensemble type, combiner model + match (config.ensemble_type, config.combiner_model): + case ("learned_weights", "lgbm"): + combiner_hp = WeightsCombiner.LGBMHyperParams() + combiner_config = WeightsCombiner.Config( + hyperparams=combiner_hp, 
horizons=config.horizons, quantiles=config.quantiles + ) + combiner = WeightsCombiner( + config=combiner_config, + ) + case ("learned_weights", "rf"): + combiner_hp = WeightsCombiner.RFHyperParams() + combiner_config = WeightsCombiner.Config( + hyperparams=combiner_hp, horizons=config.horizons, quantiles=config.quantiles + ) + combiner = WeightsCombiner( + config=combiner_config, + ) + case ("learned_weights", "xgboost"): + combiner_hp = WeightsCombiner.XGBHyperParams() + combiner_config = WeightsCombiner.Config( + hyperparams=combiner_hp, horizons=config.horizons, quantiles=config.quantiles + ) + combiner = WeightsCombiner( + config=combiner_config, + ) + case ("learned_weights", "logistic"): + combiner_hp = WeightsCombiner.LogisticHyperParams() + combiner_config = WeightsCombiner.Config( + hyperparams=combiner_hp, horizons=config.horizons, quantiles=config.quantiles + ) + combiner = WeightsCombiner( + config=combiner_config, + ) + case ("stacking", "lgbm"): + combiner_hp = StackingCombiner.LGBMHyperParams() + combiner_config = StackingCombiner.Config( + hyperparams=combiner_hp, horizons=config.horizons, quantiles=config.quantiles + ) + combiner = StackingCombiner( + config=combiner_config, + ) + case ("stacking", "gblinear"): + combiner_hp = StackingCombiner.GBLinearHyperParams(reg_alpha=0.0, reg_lambda=0.0) + combiner_config = StackingCombiner.Config( + hyperparams=combiner_hp, horizons=config.horizons, quantiles=config.quantiles + ) + combiner = StackingCombiner( + config=combiner_config, + ) + case ("rules", _): + combiner_config = RulesCombiner.Config(horizons=config.horizons, quantiles=config.quantiles) + combiner = RulesCombiner( + config=combiner_config, + ) + case _: + msg = f"Unsupported ensemble and combiner combination: {config.ensemble_type}, {config.combiner_model}" + raise ValueError(msg) + + postprocessing = [QuantileSorter()] + + model_specific_preprocessing: dict[str, TransformPipeline[TimeSeriesDataset]] = { + name: TransformPipeline(transforms=transforms) for name, transforms in forecaster_preprocessing.items() + } + + if config.forecast_combiner_sample_weight_exponent != 0: + combiner_transforms = [ + SampleWeighter( + target_column=config.target_column, + weight_exponent=config.forecast_combiner_sample_weight_exponent, + weight_floor=config.sample_weight_floor, + weight_scale_percentile=config.sample_weight_scale_percentile, + ), + Selector(selection=Include("sample_weight", config.target_column)), + ] + else: + combiner_transforms = [] + + combiner_preprocessing: TransformPipeline[TimeSeriesDataset] = TransformPipeline(transforms=combiner_transforms) + + ensemble_model = EnsembleForecastingModel( + common_preprocessing=common_preprocessing, + model_specific_preprocessing=model_specific_preprocessing, + combiner_preprocessing=combiner_preprocessing, + postprocessing=TransformPipeline(transforms=postprocessing), + forecasters=forecasters, + combiner=combiner, + target_column=config.target_column, + data_splitter=config.data_splitter, + ) + + callbacks: list[ForecastingCallback] = [] + # TODO(Egor): Implement MLFlow for OpenSTEF-meta # noqa: TD003 + + return CustomForecastingWorkflow(model=ensemble_model, model_id=config.model_id, callbacks=callbacks) + + +__all__ = ["EnsembleWorkflowConfig", "create_ensemble_workflow"] diff --git a/packages/openstef-meta/src/openstef_meta/utils/__init__.py b/packages/openstef-meta/src/openstef_meta/utils/__init__.py new file mode 100644 index 000000000..a6b9e93a4 --- /dev/null +++ 
b/packages/openstef-meta/src/openstef_meta/utils/__init__.py
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project
+#
+# SPDX-License-Identifier: MPL-2.0
+
+"""Utility functions and classes for OpenSTEF Meta."""
+
+from .decision_tree import Decision, DecisionTree, Rule
+from .pinball_errors import calculate_pinball_errors
+
+__all__ = [
+    "Decision",
+    "DecisionTree",
+    "Rule",
+    "calculate_pinball_errors",
+]
diff --git a/packages/openstef-meta/src/openstef_meta/utils/datasets.py b/packages/openstef-meta/src/openstef_meta/utils/datasets.py
new file mode 100644
index 000000000..e85c05b09
--- /dev/null
+++ b/packages/openstef-meta/src/openstef_meta/utils/datasets.py
@@ -0,0 +1,282 @@
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project
+#
+# SPDX-License-Identifier: MPL-2.0
+"""Ensemble Forecast Dataset.
+
+Validated dataset for the first-stage output of ensemble forecasters.
+Implements methods to select quantile-specific ForecastInputDatasets for final learners.
+Also supports constructing classification targets based on pinball loss.
+"""
+
+from datetime import datetime, timedelta
+from typing import Self, override
+
+import pandas as pd
+
+from openstef_core.datasets.validated_datasets import ForecastDataset, ForecastInputDataset, TimeSeriesDataset
+from openstef_core.types import Quantile
+from openstef_meta.utils.pinball_errors import calculate_pinball_errors
+
+DEFAULT_TARGET_COLUMN = {Quantile(0.5): "load"}
+
+
+def combine_forecast_input_datasets(
+    dataset: ForecastInputDataset, other: ForecastInputDataset | None, join: str = "inner"
+) -> ForecastInputDataset:
+    """Combine two ForecastInputDatasets into a single dataset.
+
+    Args:
+        dataset: First ForecastInputDataset.
+        other: Second ForecastInputDataset or None.
+        join: Type of join to perform on the datasets. Defaults to "inner".
+
+    Returns:
+        Combined ForecastInputDataset.
+    """
+    if not isinstance(other, ForecastInputDataset):
+        return dataset
+    if join != "inner":
+        raise NotImplementedError("Only 'inner' join is currently supported.")
+    df_other = other.data
+    if dataset.target_column in df_other.columns:
+        df_other = df_other.drop(columns=[dataset.target_column])
+
+    df_one = dataset.data
+    df = pd.concat(
+        [df_one, df_other],
+        axis=1,
+        join="inner",
+    )
+
+    return ForecastInputDataset(
+        data=df,
+        sample_interval=dataset.sample_interval,
+        target_column=dataset.target_column,
+        sample_weight_column=dataset.sample_weight_column,
+        forecast_start=dataset.forecast_start,
+    )
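The inner join above means only timestamps present in both datasets survive; rows missing from either side are dropped. Illustrative pandas behavior with invented values:

import pandas as pd

left = pd.DataFrame({"pred": [1.0, 2.0]}, index=pd.to_datetime(["2024-01-01 00:00", "2024-01-01 00:15"]))
right = pd.DataFrame({"temp": [5.0]}, index=pd.to_datetime(["2024-01-01 00:15"]))

pd.concat([left, right], axis=1, join="inner")  # only the 00:15 row remains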
+ """ + if not isinstance(other, ForecastInputDataset): + return dataset + if join != "inner": + raise NotImplementedError("Only 'inner' join is currently supported.") + df_other = other.data + if dataset.target_column in df_other.columns: + df_other = df_other.drop(columns=[dataset.target_column]) + + df_one = dataset.data + df = pd.concat( + [df_one, df_other], + axis=1, + join="inner", + ) + + return ForecastInputDataset( + data=df, + sample_interval=dataset.sample_interval, + target_column=dataset.target_column, + sample_weight_column=dataset.sample_weight_column, + forecast_start=dataset.forecast_start, + ) + + +class EnsembleForecastDataset(TimeSeriesDataset): + """First stage output format for ensemble forecasters.""" + + forecast_start: datetime + quantiles: list[Quantile] + forecaster_names: list[str] + target_column: str + + @override + def __init__( + self, + data: pd.DataFrame, + sample_interval: timedelta = timedelta(minutes=15), + forecast_start: datetime | None = None, + target_column: str = "load", + *, + horizon_column: str = "horizon", + available_at_column: str = "available_at", + ) -> None: + if "forecast_start" in data.attrs: + self.forecast_start = datetime.fromisoformat(data.attrs["forecast_start"]) + else: + self.forecast_start = forecast_start if forecast_start is not None else data.index.min().to_pydatetime() + self.target_column = data.attrs.get("target_column", target_column) + + super().__init__( + data=data, + sample_interval=sample_interval, + horizon_column=horizon_column, + available_at_column=available_at_column, + ) + quantile_feature_names = [col for col in self.feature_names if col != target_column] + + self.forecaster_names, self.quantiles = self.get_learner_and_quantile(pd.Index(quantile_feature_names)) + n_cols = len(self.forecaster_names) * len(self.quantiles) + if len(data.columns) not in {n_cols + 1, n_cols}: + raise ValueError("Data columns do not match the expected number based on base Forecasters and quantiles.") + + @property + def target_series(self) -> pd.Series | None: + """Return the target series if available.""" + if self.target_column in self.data.columns: + return self.data[self.target_column] + return None + + @staticmethod + def get_learner_and_quantile(feature_names: pd.Index) -> tuple[list[str], list[Quantile]]: + """Extract base Forecaster names and quantiles from feature names. + + Args: + feature_names: Index of feature names in the dataset. + + Returns: + Tuple containing a list of base Forecaster names and a list of quantiles. + + Raises: + ValueError: If an invalid base Forecaster name is found in a feature name. + """ + forecasters: set[str] = set() + quantiles: set[Quantile] = set() + + for feature_name in feature_names: + quantile_part = "_".join(feature_name.split("_")[-2:]) + learner_part = feature_name[: -(len(quantile_part) + 1)] + if not Quantile.is_valid_quantile_string(quantile_part): + msg = f"Column has no valid quantile string: {feature_name}" + raise ValueError(msg) + + forecasters.add(learner_part) + quantiles.add(Quantile.parse(quantile_part)) + + return list(forecasters), list(quantiles) + + @staticmethod + def get_quantile_feature_name(feature_name: str) -> tuple[str, Quantile]: + """Generate the feature name for a given base Forecaster and quantile. + + Args: + feature_name: Feature name string in the format "model_Quantile". + + Returns: + Tuple containing the base Forecaster name and Quantile object. 
+ """ + learner_part, quantile_part = feature_name.split("_", maxsplit=1) + return learner_part, Quantile.parse(quantile_part) + + @classmethod + def from_forecast_datasets( + cls, + datasets: dict[str, ForecastDataset], + target_series: pd.Series | None = None, + sample_weights: pd.Series | None = None, + ) -> Self: + """Create an EnsembleForecastDataset from multiple ForecastDatasets. + + Args: + datasets: Dict of ForecastDatasets to combine. + target_series: Optional target series to include in the dataset. + sample_weights: Optional sample weights series to include in the dataset. + + Returns: + EnsembleForecastDataset combining all input datasets. + """ + ds1 = next(iter(datasets.values())) + additional_columns: dict[str, pd.Series] = {} + if isinstance(ds1.target_series, pd.Series): + additional_columns[ds1.target_column] = ds1.target_series + elif target_series is not None: + additional_columns[ds1.target_column] = target_series + + sample_weight_column = "sample_weight" + if sample_weights is not None: + additional_columns[sample_weight_column] = sample_weights + + combined_data = pd.DataFrame({ + f"{learner}_{q.format()}": ds.data[q.format()] for learner, ds in datasets.items() for q in ds.quantiles + }).assign(**additional_columns) + + return cls( + data=combined_data, + sample_interval=ds1.sample_interval, + forecast_start=ds1.forecast_start, + target_column=ds1.target_column, + ) + + @staticmethod + def _prepare_classification(data: pd.DataFrame, target: pd.Series, quantile: Quantile) -> pd.Series: + """Prepare data for classification tasks by converting quantile columns to binary indicators. + + Args: + data: DataFrame containing quantile predictions. + target: Series containing true target values. + quantile: Quantile for which to prepare classification data. + + Returns: + Series with categorical indicators of best-performing base Forecasters. + """ + + # Calculate pinball loss for each base Forecaster + def column_pinball_losses(preds: pd.Series) -> pd.Series: + return calculate_pinball_errors(y_true=target, y_pred=preds, quantile=quantile) + + pinball_losses = data.apply(column_pinball_losses) + + return pinball_losses.idxmin(axis=1) + + def select_quantile_classification(self, quantile: Quantile) -> ForecastInputDataset: + """Select classification target for a specific quantile. + + Args: + quantile: Quantile to select. + + Returns: + Series containing binary indicators of best-performing base Forecasters for the specified quantile. + + Raises: + ValueError: If the target column is not found in the dataset. + """ + if self.target_column not in self.data.columns: + msg = f"Target column '{self.target_column}' not found in dataset." + raise ValueError(msg) + + selected_columns = [f"{learner}_{quantile.format()}" for learner in self.forecaster_names] + prediction_data = self.data[selected_columns].copy() + prediction_data.columns = self.forecaster_names + + target = self._prepare_classification( + data=prediction_data, + target=self.data[self.target_column], + quantile=quantile, + ) + prediction_data[self.target_column] = target + return ForecastInputDataset( + data=prediction_data, + sample_interval=self.sample_interval, + target_column=self.target_column, + forecast_start=self.forecast_start, + ) + + def select_quantile(self, quantile: Quantile) -> ForecastInputDataset: + """Select data for a specific quantile. + + Args: + quantile: Quantile to select. + + Returns: + ForecastInputDataset containing base predictions for the specified quantile. 
+ """ + selected_columns = [f"{learner}_{quantile.format()}" for learner in self.forecaster_names] + selected_columns.append(self.target_column) + prediction_data = self.data[selected_columns].copy() + prediction_data.columns = [*self.forecaster_names, self.target_column] + + return ForecastInputDataset( + data=prediction_data, + sample_interval=self.sample_interval, + target_column=self.target_column, + forecast_start=self.forecast_start, + ) + + def select_forecaster(self, forecaster_name: str) -> ForecastDataset: + """Select data for a specific base Forecaster across all quantiles. + + Args: + forecaster_name: Name of the base Forecaster to select. + + Returns: + ForecastDataset containing predictions from the specified base Forecaster. + """ + selected_columns = [ + f"{forecaster_name}_{q.format()}" for q in self.quantiles if f"{forecaster_name}_{q.format()}" in self.data + ] + prediction_data = self.data[selected_columns].copy() + prediction_data.columns = [q.format() for q in self.quantiles] + + prediction_data[self.target_column] = self.data[self.target_column] + + return ForecastDataset( + data=prediction_data, + sample_interval=self.sample_interval, + forecast_start=self.forecast_start, + target_column=self.target_column, + ) diff --git a/packages/openstef-meta/src/openstef_meta/utils/decision_tree.py b/packages/openstef-meta/src/openstef_meta/utils/decision_tree.py new file mode 100644 index 000000000..8e3940dfa --- /dev/null +++ b/packages/openstef-meta/src/openstef_meta/utils/decision_tree.py @@ -0,0 +1,143 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 +"""A simple decision tree implementation for making decisions based on feature rules.""" + +from typing import Literal + +import pandas as pd +from pydantic import BaseModel, Field, model_validator + + +class Node(BaseModel): + """A node in the decision tree, either a rule or a decision.""" + + idx: int = Field( + description="Index of the rule in the decision tree.", + ) + + +class Rule(Node): + """A single rule in the decision tree.""" + + idx: int = Field( + description="Index of the decision in the decision tree.", + ) + + rule_type: Literal["greater_than", "less_than"] = Field( + ..., + description="Type of the rule to apply.", + ) + feature_name: str = Field( + ..., + description="Name of the feature to which the rule applies.", + ) + + threshold: float | int = Field( + ..., + description="Threshold value for the rule.", + ) + + next_true: int = Field( + ..., + description="Index of the next rule if the condition is true.", + ) + + next_false: int = Field( + ..., + description="Index of the next rule if the condition is false.", + ) + + +class Decision(Node): + """A leaf decision in the decision tree.""" + + idx: int = Field( + description="Index of the decision in the decision tree.", + ) + + decision: str = Field( + ..., + description="The prediction value at this leaf.", + ) + + +class DecisionTree(BaseModel): + """A simple decision tree defined by a list of rules.""" + + nodes: list[Node] = Field( + ..., + description="List of rules that define the decision tree.", + ) + + outcomes: set[str] = Field( + ..., + description="Set of possible outcomes from the decision tree.", + ) + + @model_validator(mode="after") + def validate_tree_structure(self) -> "DecisionTree": + """Validate that the tree structure is correct. + + Raises: + ValueError: If tree is not built correctly. + + Returns: + The validated DecisionTree instance. 
+ """ + node_idx = {node.idx for node in self.nodes} + if node_idx != set(range(len(self.nodes))): + raise ValueError("Rule indices must be consecutive starting from 0.") + + for node in self.nodes: + if isinstance(node, Rule): + if node.next_true not in node_idx: + msg = f"next_true index {node.next_true} not found in nodes." + raise ValueError(msg) + if node.next_false not in node_idx: + msg = f"next_false index {node.next_false} not found in nodes." + raise ValueError(msg) + if isinstance(node, Decision) and node.decision not in self.outcomes: + msg = f"Decision '{node.decision}' not in defined outcomes {self.outcomes}." + raise ValueError(msg) + + return self + + def get_decision(self, row: pd.Series) -> str: + """Get decision from the decision tree based on input features. + + Args: + row: Series containing feature values. + + Returns: + The decision outcome as a string. + + Raises: + ValueError: If the tree structure is invalid. + TypeError: If a node type is invalid. + """ + current_idx = 0 + while True: + current_node = self.nodes[current_idx] + if isinstance(current_node, Decision): + return current_node.decision + if isinstance(current_node, Rule): + feature_value = row[current_node.feature_name] + if current_node.rule_type == "greater_than": + if feature_value > current_node.threshold: + current_idx = current_node.next_true + else: + current_idx = current_node.next_false + elif current_node.rule_type == "less_than": + if feature_value < current_node.threshold: + current_idx = current_node.next_true + else: + current_idx = current_node.next_false + else: + msg = f"Invalid rule type '{current_node.rule_type}' at index {current_idx}." + raise ValueError(msg) + else: + msg = f"Invalid node type at index {current_idx}." + raise TypeError(msg) + + __all__ = ["Node", "Rule", "Decision", "DecisionTree"] diff --git a/packages/openstef-meta/src/openstef_meta/utils/pinball_errors.py b/packages/openstef-meta/src/openstef_meta/utils/pinball_errors.py new file mode 100644 index 000000000..08e1c7704 --- /dev/null +++ b/packages/openstef-meta/src/openstef_meta/utils/pinball_errors.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +"""Utility functions for calculating pinball loss errors. + +This module provides a function to compute the pinball loss for quantile regression. +""" + +import numpy as np +import pandas as pd + + +def calculate_pinball_errors(y_true: pd.Series, y_pred: pd.Series, quantile: float) -> pd.Series: + """Calculate pinball loss for given true and predicted values. + + Args: + y_true: True values as a pandas Series. + y_pred: Predicted values as a pandas Series. + quantile: Quantile value. + + Returns: + A pandas Series containing the pinball loss for each sample. 
+ """ + errors = y_true - y_pred + pinball_loss = np.where( + errors >= 0, + quantile * errors, # Under-prediction + (quantile - 1) * errors, # Over-prediction + ) + + return pd.Series(pinball_loss, index=y_true.index) diff --git a/packages/openstef-meta/tests/regression/__init__.py b/packages/openstef-meta/tests/regression/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/openstef-meta/tests/regression/test_ensemble_forecasting_model.py b/packages/openstef-meta/tests/regression/test_ensemble_forecasting_model.py new file mode 100644 index 000000000..23835d6e7 --- /dev/null +++ b/packages/openstef-meta/tests/regression/test_ensemble_forecasting_model.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +from datetime import timedelta +from typing import cast + +import numpy as np +import pandas as pd +import pytest + +from openstef_core.datasets.validated_datasets import TimeSeriesDataset +from openstef_core.types import LeadTime, Q +from openstef_meta.models.ensemble_forecasting_model import EnsembleForecastingModel +from openstef_meta.presets import EnsembleWorkflowConfig, create_ensemble_workflow +from openstef_models.models.forecasting_model import ForecastingModel +from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow + + +@pytest.fixture +def sample_timeseries_dataset() -> TimeSeriesDataset: + """Create sample time series data with typical energy forecasting features.""" + n_samples = 25 + rng = np.random.default_rng(seed=42) + + data = pd.DataFrame( + { + "load": 100.0 + rng.normal(10.0, 5.0, n_samples), + "temperature": 20.0 + rng.normal(1.0, 0.5, n_samples), + "radiation": rng.uniform(0.0, 500.0, n_samples), + }, + index=pd.date_range("2025-01-01 10:00", periods=n_samples, freq="h", tz="UTC"), + ) + + return TimeSeriesDataset(data, timedelta(hours=1)) + + +@pytest.fixture +def config() -> EnsembleWorkflowConfig: + return EnsembleWorkflowConfig( + model_id="ensemble_model_", + ensemble_type="learned_weights", + base_models=["gblinear", "lgbm"], + combiner_model="lgbm", + quantiles=[Q(0.1), Q(0.5), Q(0.9)], + horizons=[LeadTime.from_string("PT36H")], + forecaster_sample_weight_exponent={"gblinear": 1, "lgbm": 0}, + ) + + +@pytest.fixture +def create_models( + config: EnsembleWorkflowConfig, +) -> tuple[EnsembleForecastingModel, dict[str, ForecastingModel]]: + + ensemble_model = cast(EnsembleForecastingModel, create_ensemble_workflow(config=config).model) + + base_models: dict[str, ForecastingModel] = {} + for forecaster_name in config.base_models: + model_config = ForecastingWorkflowConfig( + model_id=f"{forecaster_name}_model_", + model=forecaster_name, # type: ignore + quantiles=config.quantiles, + horizons=config.horizons, + sample_weight_exponent=config.forecaster_sample_weight_exponent[forecaster_name], + ) + base_model = create_forecasting_workflow(config=model_config).model + base_models[forecaster_name] = cast(ForecastingModel, base_model) + + return ensemble_model, base_models + + +def test_preprocessing( + sample_timeseries_dataset: TimeSeriesDataset, + create_models: tuple[EnsembleForecastingModel, dict[str, ForecastingModel]], +) -> None: + + ensemble_model, base_models = create_models + + ensemble_model.common_preprocessing.fit(data=sample_timeseries_dataset) + + # Check all base models + for name, model in base_models.items(): + # Ensemble model + common_ensemble = ensemble_model.common_preprocessing.transform( + 
data=sample_timeseries_dataset.copy_with(sample_timeseries_dataset.data) + ) + ensemble_model.model_specific_preprocessing[name].fit(data=common_ensemble) + transformed_ensemble = ensemble_model.model_specific_preprocessing[name].transform(data=common_ensemble) + # Base model + model.preprocessing.fit(data=sample_timeseries_dataset) + transformed_base = model.preprocessing.transform(data=sample_timeseries_dataset) + # Compare + pd.testing.assert_frame_equal( + transformed_ensemble.data, + transformed_base.data, + check_dtype=False, + check_index_type=False, + check_column_type=False, + ) diff --git a/packages/openstef-meta/tests/unit/models/__init__.py b/packages/openstef-meta/tests/unit/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/openstef-meta/tests/unit/models/conftest.py b/packages/openstef-meta/tests/unit/models/conftest.py new file mode 100644 index 000000000..968e68d8c --- /dev/null +++ b/packages/openstef-meta/tests/unit/models/conftest.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +from datetime import datetime, timedelta + +import numpy as np +import pandas as pd +import pytest + +from openstef_core.datasets import ForecastInputDataset + + +@pytest.fixture +def sample_forecast_input_dataset() -> ForecastInputDataset: + """Create sample input dataset for forecaster training and prediction.""" + rng = np.random.default_rng(42) + num_samples = 14 + start_date = datetime.fromisoformat("2025-01-01T00:00:00") + + feature_1 = rng.normal(loc=0, scale=1, size=num_samples) + feature_2 = rng.normal(loc=0, scale=1, size=num_samples) + feature_3 = rng.uniform(low=-1, high=1, size=num_samples) + + return ForecastInputDataset( + data=pd.DataFrame( + { + "load": (feature_1 + feature_2 + feature_3) / 3, + "feature1": feature_1, + "feature2": feature_2, + "feature3": feature_3, + }, + index=pd.date_range(start=start_date, periods=num_samples, freq="1d"), + ), + sample_interval=timedelta(days=1), + target_column="load", + forecast_start=start_date + timedelta(days=num_samples // 2), + ) + + +@pytest.fixture +def sample_dataset_with_weights(sample_forecast_input_dataset: ForecastInputDataset) -> ForecastInputDataset: + """Create sample dataset with sample weights by adding weights to the base dataset.""" + rng = np.random.default_rng(42) + num_samples = len(sample_forecast_input_dataset.data) + + # Create varied sample weights (some high, some low) + sample_weights = rng.uniform(low=0.1, high=2.0, size=num_samples) + + # Add sample weights to existing data + data_with_weights = sample_forecast_input_dataset.data.copy() + data_with_weights["sample_weight"] = sample_weights + + return ForecastInputDataset( + data=data_with_weights, + sample_interval=sample_forecast_input_dataset.sample_interval, + target_column=sample_forecast_input_dataset.target_column, + forecast_start=sample_forecast_input_dataset.forecast_start, + ) diff --git a/packages/openstef-meta/tests/unit/models/forecast_combiners/__init__.py b/packages/openstef-meta/tests/unit/models/forecast_combiners/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/openstef-meta/tests/unit/models/forecast_combiners/conftest.py b/packages/openstef-meta/tests/unit/models/forecast_combiners/conftest.py new file mode 100644 index 000000000..cf4edb982 --- /dev/null +++ b/packages/openstef-meta/tests/unit/models/forecast_combiners/conftest.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: 2025 
Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +from collections.abc import Callable +from datetime import timedelta + +import numpy as np +import pandas as pd +import pytest + +from openstef_core.datasets.validated_datasets import ForecastDataset +from openstef_meta.utils.datasets import EnsembleForecastDataset + + +@pytest.fixture +def forecast_dataset_factory() -> Callable[[], ForecastDataset]: + def _make() -> ForecastDataset: + rng = np.random.default_rng() + coef = rng.normal(0, 1, 3) + + df = pd.DataFrame( + data={ + "quantile_P10": np.array([1, 2, 3]) * coef[0], + "quantile_P50": np.array([1, 2, 3]) * coef[1], + "quantile_P90": np.array([1, 2, 3]) * coef[2], + "load": [100, 200, 300], + }, + index=pd.to_datetime([ + "2023-01-01T10:00:00", + "2023-01-01T11:00:00", + "2023-01-01T12:00:00", + ]), + ) + df += rng.normal(0, 1, df.shape) # Add slight noise to avoid perfect predictions + + df["available_at"] = pd.to_datetime([ + "2023-01-01T09:50:00", + "2023-01-01T10:55:00", + "2023-01-01T12:10:00", + ]) + + return ForecastDataset( + data=df, + sample_interval=timedelta(hours=1), + target_column="load", + ) + + return _make + + +@pytest.fixture +def ensemble_dataset(forecast_dataset_factory: Callable[[], ForecastDataset]) -> EnsembleForecastDataset: + base_learner_output = { + "GBLinearForecaster": forecast_dataset_factory(), + "LGBMForecaster": forecast_dataset_factory(), + } + + return EnsembleForecastDataset.from_forecast_datasets(base_learner_output) diff --git a/packages/openstef-meta/tests/unit/models/forecast_combiners/test_learned_weights_combiner.py b/packages/openstef-meta/tests/unit/models/forecast_combiners/test_learned_weights_combiner.py new file mode 100644 index 000000000..ac7a4c380 --- /dev/null +++ b/packages/openstef-meta/tests/unit/models/forecast_combiners/test_learned_weights_combiner.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +from datetime import timedelta + +import pytest + +from openstef_core.exceptions import NotFittedError +from openstef_core.types import LeadTime, Q +from openstef_meta.models.forecast_combiners.learned_weights_combiner import ( + WeightsCombiner, + WeightsCombinerConfig, +) +from openstef_meta.utils.datasets import EnsembleForecastDataset + + +@pytest.fixture(params=["lgbm", "xgboost", "rf", "logistic"]) +def classifier(request: pytest.FixtureRequest) -> str: + """Fixture to provide different classifier types for WeightsCombiner tests.""" + return request.param + + +@pytest.fixture +def config(classifier: str) -> WeightsCombinerConfig: + """Fixture to create WeightsCombinerConfig based on the classifier type.""" + if classifier == "lgbm": + hp = WeightsCombiner.LGBMHyperParams(n_leaves=5, n_estimators=10) + elif classifier == "xgboost": + hp = WeightsCombiner.XGBHyperParams(n_estimators=10) + elif classifier == "rf": + hp = WeightsCombiner.RFHyperParams(n_estimators=10, n_leaves=5) + elif classifier == "logistic": + hp = WeightsCombiner.LogisticHyperParams() + else: + msg = f"Unsupported classifier type: {classifier}" + raise ValueError(msg) + + return WeightsCombiner.Config( + hyperparams=hp, quantiles=[Q(0.1), Q(0.5), Q(0.9)], horizons=[LeadTime(timedelta(days=1))] + ) + + +@pytest.fixture +def forecaster(config: WeightsCombinerConfig) -> WeightsCombiner: + return WeightsCombiner(config) + + +def test_initialization(forecaster: WeightsCombiner): + assert isinstance(forecaster, WeightsCombiner) + + +def 
test_weights_combiner__fit_predict( + ensemble_dataset: EnsembleForecastDataset, + config: WeightsCombinerConfig, +): + """Test basic fit and predict workflow with comprehensive output validation.""" + # Arrange + expected_quantiles = config.quantiles + forecaster = WeightsCombiner(config=config) + + # Act + forecaster.fit(ensemble_dataset) + result = forecaster.predict(ensemble_dataset) + + # Assert + # Basic functionality + assert forecaster.is_fitted, "Model should be fitted after calling fit()" + + # Check that necessary quantiles are present + expected_columns = [q.format() for q in expected_quantiles] + expected_columns.append("load") + assert list(result.data.columns) == expected_columns, ( + f"Expected columns {expected_columns}, got {list(result.data.columns)}" + ) + + # Forecast data quality + assert not result.data.isna().any().any(), "Forecast should not contain NaN or None values" + + # Check value spread (vectorized): + # all quantiles should have some variation (not all identical values) + stds = result.data.std() + assert (stds > 0).all(), f"All columns should have variation, got stds: {dict(stds)}" + + +def test_weights_combiner_not_fitted_error( + ensemble_dataset: EnsembleForecastDataset, + config: WeightsCombinerConfig, +): + """Test that NotFittedError is raised when predicting before fitting.""" + # Arrange + forecaster = WeightsCombiner(config=config) + # Act & Assert + with pytest.raises(NotFittedError): + forecaster.predict(ensemble_dataset) diff --git a/packages/openstef-meta/tests/unit/models/forecast_combiners/test_rules_combiner.py b/packages/openstef-meta/tests/unit/models/forecast_combiners/test_rules_combiner.py new file mode 100644 index 000000000..aa08bf59a --- /dev/null +++ b/packages/openstef-meta/tests/unit/models/forecast_combiners/test_rules_combiner.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +from datetime import timedelta + +import pytest + +from openstef_core.types import LeadTime, Q +from openstef_meta.models.forecast_combiners.rules_combiner import ( + RulesCombiner, + RulesCombinerConfig, +) +from openstef_meta.utils.datasets import EnsembleForecastDataset + + +@pytest.fixture +def config() -> RulesCombinerConfig: + """Fixture to create RulesCombinerConfig.""" + return RulesCombiner.Config( + quantiles=[Q(0.1), Q(0.5), Q(0.9)], + horizons=[LeadTime(timedelta(days=1))], + ) + + +@pytest.fixture +def forecaster(config: RulesCombinerConfig) -> RulesCombiner: + return RulesCombiner(config=config) + + +def test_initialization(forecaster: RulesCombiner): + assert isinstance(forecaster, RulesCombiner) + + +def test_rules_combiner__fit_predict( + ensemble_dataset: EnsembleForecastDataset, + config: RulesCombinerConfig, +): + """Test basic fit and predict workflow with comprehensive output validation.""" + # Arrange + expected_quantiles = config.quantiles + forecaster = RulesCombiner(config=config) + additional_features = ensemble_dataset.select_quantile(Q(0.5)) + additional_features.data = additional_features.data.drop(columns=additional_features.target_column) + additional_features.data.columns = ["feature1", "feature2"] + + # Act + forecaster.fit(ensemble_dataset, additional_features=additional_features) + result = forecaster.predict(ensemble_dataset, additional_features=additional_features) + + # Assert + # Basic functionality + assert forecaster.is_fitted, "Model should be fitted 
after calling fit()" + + # Check that necessary quantiles are present + expected_columns = [q.format() for q in expected_quantiles] + assert list(result.data.columns) == expected_columns, ( + f"Expected columns {expected_columns}, got {list(result.data.columns)}" + ) + + # Forecast data quality + assert not result.data.isna().any().any(), "Forecast should not contain NaN or None values" diff --git a/packages/openstef-meta/tests/unit/models/forecast_combiners/test_stacking_combiner.py b/packages/openstef-meta/tests/unit/models/forecast_combiners/test_stacking_combiner.py new file mode 100644 index 000000000..cb182e242 --- /dev/null +++ b/packages/openstef-meta/tests/unit/models/forecast_combiners/test_stacking_combiner.py @@ -0,0 +1,103 @@ +# # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# # +# # SPDX-License-Identifier: MPL-2.0 + +from datetime import timedelta + +import pandas as pd +import pytest + +from openstef_core.exceptions import NotFittedError +from openstef_core.types import LeadTime, Q +from openstef_meta.models.forecast_combiners.stacking_combiner import ( + StackingCombiner, + StackingCombinerConfig, +) +from openstef_meta.utils.datasets import EnsembleForecastDataset + + +@pytest.fixture(params=["lgbm", "gblinear"]) +def regressor(request: pytest.FixtureRequest) -> str: + """Fixture to provide different regressor types for Stacking tests.""" + return request.param + + +@pytest.fixture +def config(regressor: str) -> StackingCombinerConfig: + """Fixture to create StackingCombinerConfig based on the regressor type.""" + if regressor == "lgbm": + hp = StackingCombiner.LGBMHyperParams(num_leaves=5, n_estimators=10) + elif regressor == "gblinear": + hp = StackingCombiner.GBLinearHyperParams(n_steps=10) + else: + msg = f"Unsupported regressor type: {regressor}" + raise ValueError(msg) + + return StackingCombiner.Config( + hyperparams=hp, quantiles=[Q(0.1), Q(0.5), Q(0.9)], horizons=[LeadTime(timedelta(days=1))] + ) + + +@pytest.fixture +def forecaster(config: StackingCombinerConfig) -> StackingCombiner: + return StackingCombiner(config) + + +def test_initialization(forecaster: StackingCombiner): + assert isinstance(forecaster, StackingCombiner) + + +def test_quantile_weights_combiner__fit_predict( + ensemble_dataset: EnsembleForecastDataset, + config: StackingCombinerConfig, +): + """Test basic fit and predict workflow with comprehensive output validation.""" + # Arrange + expected_quantiles = config.quantiles + forecaster = StackingCombiner(config=config) + + # Act + forecaster.fit(ensemble_dataset) + result = forecaster.predict(ensemble_dataset) + + # Assert + # Basic functionality + assert forecaster.is_fitted, "Model should be fitted after calling fit()" + + # Check that necessary quantiles are present + expected_columns = [q.format() for q in expected_quantiles] + assert list(result.data.columns) == expected_columns, ( + f"Expected columns {expected_columns}, got {list(result.data.columns)}" + ) + + # Forecast data quality + assert not result.data.isna().any().any(), "Forecast should not contain NaN or None values" + + +def test_stacking_combiner_not_fitted_error( + ensemble_dataset: EnsembleForecastDataset, + config: StackingCombinerConfig, +): + """Test that NotFittedError is raised when predicting before fitting.""" + # Arrange + forecaster = StackingCombiner(config=config) + # Act & Assert + with pytest.raises(NotFittedError): + forecaster.predict(ensemble_dataset) + + +def test_stacking_combiner_predict_contributions( + ensemble_dataset: 
EnsembleForecastDataset, + config: StackingCombinerConfig, +): + """Test that predict_contributions method returns contributions with correct shape.""" + # Arrange + forecaster = StackingCombiner(config=config) + forecaster.fit(ensemble_dataset) + + # Act + contributions = forecaster.predict_contributions(ensemble_dataset) + + # Assert + assert isinstance(contributions, pd.DataFrame), "Contributions should be returned as a DataFrame." + assert len(contributions.columns) == (len(ensemble_dataset.quantiles) * len(ensemble_dataset.forecaster_names)) + 1 diff --git a/packages/openstef-meta/tests/unit/models/forecasting/__init__.py b/packages/openstef-meta/tests/unit/models/forecasting/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/openstef-meta/tests/unit/models/forecasting/test_residual_forecaster.py b/packages/openstef-meta/tests/unit/models/forecasting/test_residual_forecaster.py new file mode 100644 index 000000000..0f319552e --- /dev/null +++ b/packages/openstef-meta/tests/unit/models/forecasting/test_residual_forecaster.py @@ -0,0 +1,173 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +from datetime import timedelta + +import pytest + +from openstef_core.datasets import ForecastInputDataset +from openstef_core.exceptions import NotFittedError +from openstef_core.types import LeadTime, Q +from openstef_meta.models.forecasting.residual_forecaster import ( + ResidualBaseForecasterHyperParams, + ResidualForecaster, + ResidualForecasterConfig, + ResidualHyperParams, +) +from openstef_models.models.forecasting.gblinear_forecaster import GBLinearHyperParams +from openstef_models.models.forecasting.lgbm_forecaster import LGBMHyperParams +from openstef_models.models.forecasting.lgbmlinear_forecaster import LGBMLinearHyperParams +from openstef_models.models.forecasting.xgboost_forecaster import XGBoostHyperParams + + +@pytest.fixture(params=["gblinear", "lgbmlinear"]) +def primary_model(request: pytest.FixtureRequest) -> ResidualBaseForecasterHyperParams: + """Fixture to provide different primary model types.""" + learner_type = request.param + if learner_type == "gblinear": + return GBLinearHyperParams() + if learner_type == "lgbm": + return LGBMHyperParams() + if learner_type == "lgbmlinear": + return LGBMLinearHyperParams() + return XGBoostHyperParams() + + +@pytest.fixture(params=["gblinear", "lgbm", "lgbmlinear", "xgboost"]) +def secondary_model(request: pytest.FixtureRequest) -> ResidualBaseForecasterHyperParams: + """Fixture to provide different secondary model types.""" + learner_type = request.param + if learner_type == "gblinear": + return GBLinearHyperParams() + if learner_type == "lgbm": + return LGBMHyperParams() + if learner_type == "lgbmlinear": + return LGBMLinearHyperParams() + return XGBoostHyperParams() + + +@pytest.fixture +def base_config( + primary_model: ResidualBaseForecasterHyperParams, + secondary_model: ResidualBaseForecasterHyperParams, +) -> ResidualForecasterConfig: + """Base configuration for Residual forecaster tests.""" + + params = ResidualHyperParams( + primary_hyperparams=primary_model, + secondary_hyperparams=secondary_model, + ) + return ResidualForecasterConfig( + quantiles=[Q(0.1), Q(0.5), Q(0.9)], + horizons=[LeadTime(timedelta(days=1))], + hyperparams=params, + verbosity=False, + ) + + +def test_residual_forecaster_fit_predict( + sample_forecast_input_dataset: ForecastInputDataset, + base_config: ResidualForecasterConfig, +): + """Test basic fit and 
predict workflow with comprehensive output validation.""" + # Arrange + expected_quantiles = base_config.quantiles + forecaster = ResidualForecaster(config=base_config) + + # Act + forecaster.fit(sample_forecast_input_dataset) + result = forecaster.predict(sample_forecast_input_dataset) + + # Assert + # Basic functionality + assert forecaster.is_fitted, "Model should be fitted after calling fit()" + + # Check that necessary quantiles are present + expected_columns = [q.format() for q in expected_quantiles] + assert list(result.data.columns) == expected_columns, ( + f"Expected columns {expected_columns}, got {list(result.data.columns)}" + ) + + # Forecast data quality + assert not result.data.isna().any().any(), "Forecast should not contain NaN or None values" + + +def test_residual_forecaster_predict_not_fitted_raises_error( + sample_forecast_input_dataset: ForecastInputDataset, + base_config: ResidualForecasterConfig, +): + """Test that predict() raises NotFittedError when called before fit().""" + # Arrange + forecaster = ResidualForecaster(config=base_config) + + # Act & Assert + with pytest.raises(NotFittedError, match="ResidualForecaster"): + forecaster.predict(sample_forecast_input_dataset) + + +def test_residual_forecaster_with_sample_weights( + sample_dataset_with_weights: ForecastInputDataset, + base_config: ResidualForecasterConfig, +): + """Test that forecaster works with sample weights and produces different results.""" + # Arrange + forecaster_with_weights = ResidualForecaster(config=base_config) + + # Create dataset without weights for comparison + data_without_weights = ForecastInputDataset( + data=sample_dataset_with_weights.data.drop(columns=["sample_weight"]), + sample_interval=sample_dataset_with_weights.sample_interval, + target_column=sample_dataset_with_weights.target_column, + forecast_start=sample_dataset_with_weights.forecast_start, + ) + forecaster_without_weights = ResidualForecaster(config=base_config) + + # Act + forecaster_with_weights.fit(sample_dataset_with_weights) + forecaster_without_weights.fit(data_without_weights) + + # Predict using data without sample_weight column (since that's used for training, not prediction) + result_with_weights = forecaster_with_weights.predict(data_without_weights) + result_without_weights = forecaster_without_weights.predict(data_without_weights) + + # Assert + # Both should produce valid forecasts + assert not result_with_weights.data.isna().any().any(), "Weighted forecast should not contain NaN values" + assert not result_without_weights.data.isna().any().any(), "Unweighted forecast should not contain NaN values" + + # Sample weights should affect the model, so results should be different + # (This is a statistical test - with different weights, predictions should differ) + differences = (result_with_weights.data - result_without_weights.data).abs() + assert differences.sum().sum() > 0, "Sample weights should affect model predictions" + + +def test_residual_forecaster_predict_contributions( + sample_forecast_input_dataset: ForecastInputDataset, + base_config: ResidualForecasterConfig, +): + """Test basic fit and predict workflow with output validation.""" + # Arrange + expected_quantiles = base_config.quantiles + forecaster = ResidualForecaster(config=base_config) + + # Act + forecaster.fit(sample_forecast_input_dataset) + result = forecaster.predict_contributions(sample_forecast_input_dataset, scale=True) + + # Assert + # Basic functionality + assert forecaster.is_fitted, "Model should be fitted after calling fit()" + + # 
Check that necessary quantiles are present + base_models = [forecaster.primary_name, forecaster.secondary_name] + expected_columns = [f"{col}_{q.format()}" for col in base_models for q in expected_quantiles] + assert sorted(result.columns) == sorted(expected_columns), ( + f"Expected columns {expected_columns}, got {list(result.columns)}" + ) + + # Contributions should sum to 1.0 per quantile + for q in expected_quantiles: + quantile_cols = [col for col in result.columns if col.endswith(f"_{q.format()}")] + col_sums = result[quantile_cols].sum(axis=1) + assert all(abs(col_sums - 1.0) < 1e-6), f"Contributions for quantile {q.format()} should sum to 1.0" diff --git a/packages/openstef-meta/tests/unit/models/test_ensemble_forecasting_model.py b/packages/openstef-meta/tests/unit/models/test_ensemble_forecasting_model.py new file mode 100644 index 000000000..84f14cef7 --- /dev/null +++ b/packages/openstef-meta/tests/unit/models/test_ensemble_forecasting_model.py @@ -0,0 +1,279 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +import pickle # noqa: S403 - Controlled test +from datetime import datetime, timedelta +from typing import override + +import numpy as np +import pandas as pd +import pytest + +from openstef_core.datasets import ForecastInputDataset +from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset +from openstef_core.datasets.validated_datasets import ForecastDataset +from openstef_core.exceptions import NotFittedError +from openstef_core.mixins.predictor import HyperParams +from openstef_core.mixins.transform import TransformPipeline +from openstef_core.testing import assert_timeseries_equal, create_synthetic_forecasting_dataset +from openstef_core.types import LeadTime, Q +from openstef_meta.models.ensemble_forecasting_model import EnsembleForecastingModel +from openstef_meta.models.forecast_combiners.forecast_combiner import ForecastCombiner, ForecastCombinerConfig +from openstef_meta.utils.datasets import EnsembleForecastDataset +from openstef_models.models.forecasting import Forecaster, ForecasterConfig +from openstef_models.transforms.postprocessing.quantile_sorter import QuantileSorter +from openstef_models.transforms.time_domain.lags_adder import LagsAdder + + +class SimpleForecaster(Forecaster): + """Simple test forecaster that returns predictable values for testing.""" + + def __init__(self, config: ForecasterConfig): + self._config = config + self._is_fitted = False + + @property + def config(self) -> ForecasterConfig: + return self._config + + @property + @override + def is_fitted(self) -> bool: + return self._is_fitted + + @override + def fit(self, data: ForecastInputDataset, data_val: ForecastInputDataset | None = None) -> None: + self._is_fitted = True + + @override + def predict(self, data: ForecastInputDataset) -> ForecastDataset: + # Return predictable forecast values + forecast_values = {quantile: 100.0 + quantile * 10 for quantile in self.config.quantiles} + return ForecastDataset( + pd.DataFrame( + { + quantile.format(): [forecast_values[quantile]] * len(data.index) + for quantile in self.config.quantiles + }, + index=data.index, + ), + data.sample_interval, + data.forecast_start, + ) + + +class SimpleCombiner(ForecastCombiner): + """Simple combiner that averages base Forecaster predictions.""" + + def __init__(self, config: ForecastCombinerConfig): + self._config = config + self._is_fitted = False + self.quantiles = config.quantiles + + def fit( + self, + data: 
EnsembleForecastDataset, + data_val: EnsembleForecastDataset | None = None, + additional_features: ForecastInputDataset | None = None, + ) -> None: + self._is_fitted = True + + def predict( + self, + data: EnsembleForecastDataset, + additional_features: ForecastInputDataset | None = None, + ) -> ForecastDataset: + if not self._is_fitted: + raise NotFittedError("Combiner must be fitted before prediction.") + + combined_data = pd.DataFrame(index=data.data.index) + for quantile in self.quantiles: + quantile_cols = [col for col in data.data.columns if col.endswith(quantile.format())] + combined_data[quantile.format()] = data.data[quantile_cols].mean(axis=1) + + return ForecastDataset( + data=combined_data, + sample_interval=data.sample_interval, + forecast_start=data.forecast_start, + ) + + @property + def is_fitted(self) -> bool: + return self._is_fitted + + +@pytest.fixture +def sample_timeseries_dataset() -> TimeSeriesDataset: + """Create sample time series data with typical energy forecasting features.""" + n_samples = 25 + rng = np.random.default_rng(seed=42) + + data = pd.DataFrame( + { + "load": 100.0 + rng.normal(10.0, 5.0, n_samples), + "temperature": 20.0 + rng.normal(1.0, 0.5, n_samples), + "radiation": rng.uniform(0.0, 500.0, n_samples), + }, + index=pd.date_range("2025-01-01 10:00", periods=n_samples, freq="h"), + ) + + return TimeSeriesDataset(data, timedelta(hours=1)) + + +@pytest.fixture +def model() -> EnsembleForecastingModel: + """Create a simple EnsembleForecastingModel for testing.""" + # Arrange + horizons = [LeadTime(timedelta(hours=1))] + quantiles = [Q(0.3), Q(0.5), Q(0.7)] + config = ForecasterConfig(quantiles=quantiles, horizons=horizons) + forecasters: dict[str, Forecaster] = { + "forecaster_1": SimpleForecaster(config=config), + "forecaster_2": SimpleForecaster(config=config), + } + combiner_config = ForecastCombinerConfig(quantiles=quantiles, horizons=horizons, hyperparams=HyperParams()) + + combiner = SimpleCombiner( + config=combiner_config, + ) + + # Act + return EnsembleForecastingModel( + forecasters=forecasters, combiner=combiner, common_preprocessing=TransformPipeline() + ) + + +def test_forecasting_model__init__uses_defaults(model: EnsembleForecastingModel): + """Test initialization uses default preprocessing and postprocessing when not provided.""" + + # Assert - Check that components are assigned correctly + assert model.common_preprocessing is not None + assert model.postprocessing is not None + assert model.target_column == "load" # Default value + assert model.forecaster_names == ["forecaster_1", "forecaster_2"] + + +def test_forecasting_model__fit(sample_timeseries_dataset: TimeSeriesDataset, model: EnsembleForecastingModel): + """Test that fit correctly orchestrates preprocessing and forecaster calls, and returns metrics.""" + + # Act + result = model.fit(data=sample_timeseries_dataset) + + # Assert - Model is fitted and returns metrics + assert model.is_fitted + assert result is not None + + +def test_forecasting_model__predict(sample_timeseries_dataset: TimeSeriesDataset, model: EnsembleForecastingModel): + """Test that predict correctly orchestrates preprocessing and forecaster calls.""" + + # Fit the model first + model.fit(data=sample_timeseries_dataset) + forecast_start = datetime.fromisoformat("2025-01-01T12:00:00") + + # Act + result = model.predict(data=sample_timeseries_dataset, forecast_start=forecast_start) + + # Assert - Prediction returns a forecast dataset with expected properties + assert isinstance(result, ForecastDataset) + assert 
result.sample_interval == sample_timeseries_dataset.sample_interval + assert result.quantiles == [Q(0.3), Q(0.5), Q(0.7)] + assert result.forecast_start >= forecast_start + assert not result.data.empty + assert not result.data.isna().any().any() + + +def test_forecasting_model__predict__raises_error_when_not_fitted( + sample_timeseries_dataset: TimeSeriesDataset, model: EnsembleForecastingModel +): + """Test predict raises NotFittedError when model is not fitted.""" + + # Act & Assert + with pytest.raises(NotFittedError): + model.predict(data=sample_timeseries_dataset) + + +def test_forecasting_model__score__returns_metrics( + sample_timeseries_dataset: TimeSeriesDataset, model: EnsembleForecastingModel +): + """Test that score evaluates model and returns metrics.""" + + model.fit(data=sample_timeseries_dataset) + + # Act + metrics = model.score(data=sample_timeseries_dataset) + + # Assert - Metrics are calculated for the median quantile + assert metrics.metrics is not None + assert all(x in metrics.metrics for x in [Q(0.3), Q(0.5), Q(0.7)]) + # R2 metric should be present (default evaluation metric) + assert "R2" in metrics.metrics[Q(0.5)] + + +def test_forecasting_model__pickle_roundtrip(): + """Test that ForecastingModel with preprocessing and postprocessing can be pickled and unpickled. + + This verifies that the entire forecasting pipeline, including transforms and forecaster, + can be serialized and deserialized while maintaining functionality. + """ + # Arrange - create synthetic dataset + dataset = create_synthetic_forecasting_dataset( + length=timedelta(days=30), + sample_interval=timedelta(hours=1), + random_seed=42, + ) + + # Create forecasting model with preprocessing and postprocessing + # Arrange + horizons = [LeadTime(timedelta(hours=1))] + quantiles = [Q(0.3), Q(0.5), Q(0.7)] + config = ForecasterConfig(quantiles=quantiles, horizons=horizons) + forecasters: dict[str, Forecaster] = { + "forecaster_1": SimpleForecaster(config=config), + "forecaster_2": SimpleForecaster(config=config), + } + combiner_config = ForecastCombinerConfig(quantiles=quantiles, horizons=horizons, hyperparams=HyperParams()) + + combiner = SimpleCombiner( + config=combiner_config, + ) + + original_model = EnsembleForecastingModel( + forecasters=forecasters, + combiner=combiner, + common_preprocessing=TransformPipeline( + transforms=[ + LagsAdder( + history_available=timedelta(days=14), + horizons=horizons, + max_day_lags=7, + add_trivial_lags=True, + add_autocorr_lags=False, + ), + ] + ), + postprocessing=TransformPipeline(transforms=[QuantileSorter()]), + cutoff_history=timedelta(days=7), + target_column="load", + ) + + # Fit the original model + original_model.fit(data=dataset) + + # Get predictions from original model + expected_predictions = original_model.predict(data=dataset) + + # Act - pickle and unpickle the model + pickled = pickle.dumps(original_model) + restored_model = pickle.loads(pickled) # noqa: S301 - Controlled test + + # Assert - verify the restored model is the correct type + assert isinstance(restored_model, EnsembleForecastingModel) + assert restored_model.is_fitted + assert restored_model.target_column == original_model.target_column + assert restored_model.cutoff_history == original_model.cutoff_history + + # Verify predictions match using pandas testing utilities + actual_predictions = restored_model.predict(data=dataset) + assert_timeseries_equal(actual_predictions, expected_predictions) diff --git a/packages/openstef-meta/tests/unit/utils/__init__.py 
b/packages/openstef-meta/tests/unit/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/openstef-meta/tests/unit/utils/test_datasets.py b/packages/openstef-meta/tests/unit/utils/test_datasets.py new file mode 100644 index 000000000..efb64f3ea --- /dev/null +++ b/packages/openstef-meta/tests/unit/utils/test_datasets.py @@ -0,0 +1,117 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +from collections.abc import Callable +from datetime import timedelta + +import numpy as np +import pandas as pd +import pytest + +from openstef_core.datasets.validated_datasets import ForecastDataset, ForecastInputDataset, TimeSeriesDataset +from openstef_core.types import Quantile +from openstef_meta.utils.datasets import EnsembleForecastDataset + + +@pytest.fixture +def simple_dataset() -> TimeSeriesDataset: + return TimeSeriesDataset( + data=pd.DataFrame( + data={ + "available_at": pd.to_datetime([ + "2023-01-01T09:50:00", # lead time = 10:00 - 09:50 = +10min + "2023-01-01T10:55:00", # lead time = 11:00 - 10:55 = +5min + "2023-01-01T12:10:00", # lead time = 12:00 - 12:10 = -10min + "2023-01-01T13:20:00", # lead time = 13:00 - 13:20 = -20min + "2023-01-01T14:15:00", # lead time = 14:00 - 14:15 = -15min + "2023-01-01T14:30:00", # lead time = 14:00 - 14:30 = -30min + ]), + "value1": [10, 20, 30, 40, 50, 55], # 55 should override 50 for 14:00 + }, + index=pd.to_datetime([ + "2023-01-01T10:00:00", + "2023-01-01T11:00:00", + "2023-01-01T12:00:00", + "2023-01-01T13:00:00", + # Duplicate timestamp with different availability + "2023-01-01T14:00:00", + "2023-01-01T14:00:00", + ]), + ), + sample_interval=timedelta(hours=1), + ) + + +@pytest.fixture +def forecast_dataset_factory() -> Callable[[], ForecastDataset]: + def _make() -> ForecastDataset: + rng = np.random.default_rng() + df = pd.DataFrame( + data={ + "quantile_P10": [90, 180, 270], + "quantile_P50": [100, 200, 300], + "quantile_P90": [110, 220, 330], + "load": [100, 200, 300], + }, + index=pd.to_datetime([ + "2023-01-01T10:00:00", + "2023-01-01T11:00:00", + "2023-01-01T12:00:00", + ]), + ) + df += rng.normal(0, 1, df.shape) # Add slight noise to avoid perfect predictions + + df["available_at"] = pd.to_datetime([ + "2023-01-01T09:50:00", + "2023-01-01T10:55:00", + "2023-01-01T12:10:00", + ]) + + return ForecastDataset( + data=df, + sample_interval=timedelta(hours=1), + target_column="load", + ) + + return _make + + +@pytest.fixture +def base_predictions( + forecast_dataset_factory: Callable[[], ForecastDataset], +) -> dict[str, ForecastDataset]: + return { + "model_1": forecast_dataset_factory(), + "model_2": forecast_dataset_factory(), + } + + +@pytest.fixture +def ensemble_dataset(base_predictions: dict[str, ForecastDataset]) -> EnsembleForecastDataset: + return EnsembleForecastDataset.from_forecast_datasets(base_predictions) + + +def test_from_ensemble_output(ensemble_dataset: EnsembleForecastDataset): + + assert isinstance(ensemble_dataset, EnsembleForecastDataset) + assert ensemble_dataset.data.shape == (3, 7) # 3 timestamps, 2 learners * 3 quantiles + target + assert set(ensemble_dataset.forecaster_names) == {"model_1", "model_2"} + assert set(ensemble_dataset.quantiles) == {Quantile(0.1), Quantile(0.5), Quantile(0.9)} + + +def test_select_quantile(ensemble_dataset: EnsembleForecastDataset): + + dataset = ensemble_dataset.select_quantile(Quantile(0.5)) + + assert isinstance(dataset, ForecastInputDataset) + assert dataset.data.shape == (3, 3) # 3 timestamps, 
2 learners * 1 quantiles + target + + +def test_select_quantile_classification(ensemble_dataset: EnsembleForecastDataset): + + dataset = ensemble_dataset.select_quantile_classification(Quantile(0.5)) + + assert isinstance(dataset, ForecastInputDataset) + assert dataset.data.shape == (3, 3) # 3 timestamps, 2 learners * 1 quantiles + target + assert all(dataset.target_series.apply(lambda x: x in {"model_1", "model_2"})) # type: ignore diff --git a/packages/openstef-meta/tests/unit/utils/test_decision_tree.py b/packages/openstef-meta/tests/unit/utils/test_decision_tree.py new file mode 100644 index 000000000..f40bdb220 --- /dev/null +++ b/packages/openstef-meta/tests/unit/utils/test_decision_tree.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +import pandas as pd +import pytest + +from openstef_meta.utils.decision_tree import Decision, DecisionTree, Node, Rule + + +@pytest.fixture +def sample_dataset() -> pd.DataFrame: + data = { + "feature_1": [1, 2, 3, 4, 5], + "feature_2": [10, 20, 30, 40, 50], + } + return pd.DataFrame(data) + + +@pytest.fixture +def simple_decision_tree() -> DecisionTree: + nodes: list[Node] = [ + Rule( + idx=0, + rule_type="less_than", + feature_name="feature_1", + threshold=3, + next_true=1, + next_false=2, + ), + Decision(idx=1, decision="Class_A"), + Decision(idx=2, decision="Class_B"), + ] + return DecisionTree(nodes=nodes, outcomes={"Class_A", "Class_B"}) + + +def test_decision_tree_prediction(sample_dataset: pd.DataFrame, simple_decision_tree: DecisionTree): + + decisions = sample_dataset.apply(simple_decision_tree.get_decision, axis=1) + + expected_decisions = pd.Series( + ["Class_A", "Class_A", "Class_B", "Class_B", "Class_B"], + ) + + pd.testing.assert_series_equal(decisions, expected_decisions) diff --git a/packages/openstef-models/src/openstef_models/explainability/mixins.py b/packages/openstef-models/src/openstef_models/explainability/mixins.py index dda56059b..2e1fa81ca 100644 --- a/packages/openstef-models/src/openstef_models/explainability/mixins.py +++ b/packages/openstef-models/src/openstef_models/explainability/mixins.py @@ -13,6 +13,7 @@ import pandas as pd import plotly.graph_objects as go +from openstef_core.datasets.validated_datasets import ForecastInputDataset from openstef_core.types import Q, Quantile from openstef_models.explainability.plotters.feature_importance_plotter import FeatureImportancePlotter @@ -44,6 +45,19 @@ def feature_importances(self) -> pd.DataFrame: """ raise NotImplementedError + @abstractmethod + def predict_contributions(self, data: ForecastInputDataset, *, scale: bool) -> pd.DataFrame: + """Get feature contributions for each prediction. + + Args: + data: Input dataset for which to compute feature contributions. + scale: Whether to scale contributions to sum to the prediction value. + + Returns: + DataFrame with contributions per feature. + """ + raise NotImplementedError + def plot_feature_importances(self, quantile: Quantile = Q(0.5)) -> go.Figure: """Create interactive treemap visualization of feature importances. 
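The abstract method added to the mixin above fixes the contract every ExplainableForecaster must satisfy: one contribution column per (feature, quantile) pair, optionally scaled so each quantile's contributions sum to 1. A minimal sketch of a conforming implementation (hypothetical subclass; the trivial single-feature attribution mirrors the base-case and constant-median implementations added later in this patch):

    import pandas as pd
    from openstef_core.datasets.validated_datasets import ForecastInputDataset

    class MyForecaster(ExplainableForecaster):  # hypothetical subclass for illustration
        def predict_contributions(self, data: ForecastInputDataset, *, scale: bool = True) -> pd.DataFrame:
            # Attribute everything to the single "load" feature: one column per quantile.
            # Each contribution is already the full (normalised) share, so `scale` is a no-op here.
            return pd.DataFrame(
                data=1.0,
                index=data.index,
                columns=["load_" + quantile.format() for quantile in self.config.quantiles],
            )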
diff --git a/packages/openstef-models/src/openstef_models/integrations/mlflow/mlflow_storage_callback.py b/packages/openstef-models/src/openstef_models/integrations/mlflow/mlflow_storage_callback.py index 78f4abb4a..91be9fcab 100644 --- a/packages/openstef-models/src/openstef_models/integrations/mlflow/mlflow_storage_callback.py +++ b/packages/openstef-models/src/openstef_models/integrations/mlflow/mlflow_storage_callback.py @@ -19,13 +19,16 @@ from openstef_beam.evaluation.metric_providers import MetricDirection from openstef_core.base_model import BaseConfig from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset -from openstef_core.datasets.versioned_timeseries_dataset import VersionedTimeSeriesDataset +from openstef_core.datasets.versioned_timeseries_dataset import ( + VersionedTimeSeriesDataset, +) from openstef_core.exceptions import ( MissingColumnsError, ModelNotFoundError, SkipFitting, ) from openstef_core.types import Q, QuantileOrGlobal +from openstef_meta.models.ensemble_forecasting_model import EnsembleForecastingModel from openstef_models.explainability import ExplainableForecaster from openstef_models.integrations.mlflow.mlflow_storage import MLFlowStorage from openstef_models.mixins.callbacks import WorkflowContext @@ -110,6 +113,11 @@ def on_fit_end( if self.model_selection_enable: self._run_model_selection(workflow=context.workflow, result=result) + if isinstance(context.workflow.model, EnsembleForecastingModel): + raise NotImplementedError( + "MLFlowStorageCallback does not yet support EnsembleForecastingWorkflow model storage." + ) + # Create a new run run = self.storage.create_run( model_id=context.workflow.model_id, @@ -134,7 +142,11 @@ def on_fit_end( fig.write_html(data_path / "feature_importances.html") # pyright: ignore[reportUnknownMemberType] # Store the trained model - self.storage.save_run_model(model_id=context.workflow.model_id, run_id=run_id, model=context.workflow.model) + self.storage.save_run_model( + model_id=context.workflow.model_id, + run_id=run_id, + model=context.workflow.model, + ) self._logger.info("Stored trained model for run %s", run_id) # Format the metrics for MLflow @@ -151,7 +163,9 @@ def on_fit_end( @override def on_predict_start( - self, context: WorkflowContext[CustomForecastingWorkflow], data: VersionedTimeSeriesDataset | TimeSeriesDataset + self, + context: WorkflowContext[CustomForecastingWorkflow], + data: VersionedTimeSeriesDataset | TimeSeriesDataset, ): if context.workflow.model.is_fitted: return @@ -175,7 +189,11 @@ def on_predict_start( return context.workflow.model = old_model - self._logger.info("Loaded model from MLflow run %s for model %s", run_id, context.workflow.model_id) + self._logger.info( + "Loaded model from MLflow run %s for model %s", + run_id, + context.workflow.model_id, + ) def _run_model_selection(self, workflow: CustomForecastingWorkflow, result: ModelFitResult) -> None: # Find the latest successful run for this model diff --git a/packages/openstef-models/src/openstef_models/models/forecasting/base_case_forecaster.py b/packages/openstef-models/src/openstef_models/models/forecasting/base_case_forecaster.py index 215e85344..4b021d2b3 100644 --- a/packages/openstef-models/src/openstef_models/models/forecasting/base_case_forecaster.py +++ b/packages/openstef-models/src/openstef_models/models/forecasting/base_case_forecaster.py @@ -189,6 +189,23 @@ def predict(self, data: ForecastInputDataset) -> ForecastDataset: sample_interval=data.sample_interval, ) + @override + def predict_contributions(self, 
data: ForecastInputDataset, *, scale: bool = True) -> pd.DataFrame: + """Generate feature contributions. + + Args: + data: The forecast input dataset containing target variable history. + scale: Whether to scale contributions to sum to 1. Defaults to True. + + Returns: + pd.DataFrame containing the prediction contributions. + """ + return pd.DataFrame( + data=1.0, + index=data.index, + columns=["load_" + quantile.format() for quantile in self.config.quantiles], + ) + @property @override def feature_importances(self) -> pd.DataFrame: diff --git a/packages/openstef-models/src/openstef_models/models/forecasting/constant_median_forecaster.py b/packages/openstef-models/src/openstef_models/models/forecasting/constant_median_forecaster.py index 9461cdddb..e516472a2 100644 --- a/packages/openstef-models/src/openstef_models/models/forecasting/constant_median_forecaster.py +++ b/packages/openstef-models/src/openstef_models/models/forecasting/constant_median_forecaster.py @@ -141,3 +141,20 @@ def feature_importances(self) -> pd.DataFrame: index=["load"], columns=[quantile.format() for quantile in self.config.quantiles], ) + + @override + def predict_contributions(self, data: ForecastInputDataset, *, scale: bool = True) -> pd.DataFrame: + """Generate feature contributions. + + Args: + data: The forecast input dataset containing target variable history. + scale: Whether to scale contributions to sum to 1. Defaults to True. + + Returns: + pd.DataFrame containing the prediction contributions. + """ + return pd.DataFrame( + data=1.0, + index=data.index, + columns=["load_" + quantile.format() for quantile in self.config.quantiles], + ) diff --git a/packages/openstef-models/src/openstef_models/models/forecasting/flatliner_forecaster.py b/packages/openstef-models/src/openstef_models/models/forecasting/flatliner_forecaster.py index fa7f141d3..e4ab21437 100644 --- a/packages/openstef-models/src/openstef_models/models/forecasting/flatliner_forecaster.py +++ b/packages/openstef-models/src/openstef_models/models/forecasting/flatliner_forecaster.py @@ -117,3 +117,13 @@ def feature_importances(self) -> pd.DataFrame: index=["load"], columns=[quantile.format() for quantile in self.config.quantiles], ) + + @override + def predict_contributions(self, data: ForecastInputDataset, *, scale: bool = True) -> pd.DataFrame: + + forecast_index = data.create_forecast_range(horizon=self.config.max_horizon) + + return pd.DataFrame( + data={quantile.format(): 0.0 for quantile in self.config.quantiles}, + index=forecast_index, + ) diff --git a/packages/openstef-models/src/openstef_models/models/forecasting/gblinear_forecaster.py b/packages/openstef-models/src/openstef_models/models/forecasting/gblinear_forecaster.py index 74c904364..f08dc4269 100644 --- a/packages/openstef-models/src/openstef_models/models/forecasting/gblinear_forecaster.py +++ b/packages/openstef-models/src/openstef_models/models/forecasting/gblinear_forecaster.py @@ -272,7 +272,7 @@ def fit(self, data: ForecastInputDataset, data_val: ForecastInputDataset | None raise InputValidationError("The input data is empty after dropping NaN values.") # Fit the scalers - self._target_scaler.fit(data.target_series.to_frame()) + self._target_scaler.fit(data.target_series.to_frame().to_numpy()) # Prepare training data input_data, target, sample_weight = self._prepare_fit_input(data) @@ -326,6 +326,47 @@ def predict(self, data: ForecastInputDataset) -> ForecastDataset: sample_interval=data.sample_interval, ) + def predict_contributions(self, data: ForecastInputDataset, *, 
scale: bool = True) -> pd.DataFrame:
+        """Get feature contributions for each prediction.
+
+        Args:
+            data: Input dataset for which to compute feature contributions.
+            scale: If True, scale contributions to sum to 1.0 per quantile.
+
+        Returns:
+            DataFrame with contributions per feature.
+        """
+        # Get input features for prediction
+        input_data: pd.DataFrame = data.input_data(start=data.forecast_start)
+        xgb_input: xgb.DMatrix = xgb.DMatrix(data=input_data)
+
+        # Compute per-feature contributions and drop the trailing bias column
+        booster = self._gblinear_model.get_booster()
+        predictions_array: np.ndarray = booster.predict(xgb_input, pred_contribs=True, strict_shape=True)[:, :, :-1]
+
+        # Express each contribution as a signed share of its per-quantile total
+        contribs = predictions_array / np.sum(predictions_array, axis=-1, keepdims=True)
+
+        # Flatten to 2D array, name columns accordingly
+        contribs = contribs.reshape(contribs.shape[0], -1)
+        df = pd.DataFrame(
+            data=contribs,
+            index=input_data.index,
+            columns=[
+                f"{feature}_{quantile.format()}" for feature in input_data.columns for quantile in self.config.quantiles
+            ],
+        )
+
+        if scale:
+            # Scale contributions so that they sum to 1.0 per quantile and are positive
+            for q in self.config.quantiles:
+                quantile_cols = [col for col in df.columns if col.endswith(f"_{q.format()}")]
+                row_sums = df[quantile_cols].abs().sum(axis=1)
+                df[quantile_cols] = df[quantile_cols].abs().div(row_sums, axis=0)
+
+        # Return one column per (feature, quantile) pair
+        return df
+
     @property
     @override
     def feature_importances(self) -> pd.DataFrame:
diff --git a/packages/openstef-models/src/openstef_models/models/forecasting/hybrid_forecaster.py b/packages/openstef-models/src/openstef_models/models/forecasting/hybrid_forecaster.py
deleted file mode 100644
index 2b4b72573..000000000
--- a/packages/openstef-models/src/openstef_models/models/forecasting/hybrid_forecaster.py
+++ /dev/null
@@ -1,308 +0,0 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project
-#
-# SPDX-License-Identifier: MPL-2.0
-"""Hybrid Forecaster (Stacked LightGBM + Linear Model Gradient Boosting).
-
-Provides method that attempts to combine the advantages of a linear model (Extraplolation)
-and tree-based model (Non-linear patterns). This is acieved by training two base learners,
-followed by a small linear model that regresses on the baselearners' predictions.
-The implementation is based on sklearn's StackingRegressor.
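Note: the new predict_contributions API above mirrors the feature_importances contract, returning one column per (feature, quantile) pair that, with scale=True, sums to 1.0 per quantile for every row. A minimal usage sketch, assuming a prepared ForecastInputDataset named `dataset` and that GBLinearForecasterConfig accepts the same horizons/quantiles arguments as the sibling forecaster configs shown in the tests:

from datetime import timedelta

from openstef_core.types import LeadTime, Q
from openstef_models.models.forecasting.gblinear_forecaster import (
    GBLinearForecaster,
    GBLinearForecasterConfig,
)

config = GBLinearForecasterConfig(
    horizons=[LeadTime(timedelta(days=1))],
    quantiles=[Q(0.1), Q(0.5), Q(0.9)],
)
forecaster = GBLinearForecaster(config=config)
forecaster.fit(dataset)  # dataset: a ForecastInputDataset with a target column

contribs = forecaster.predict_contributions(dataset, scale=True)
for q in config.quantiles:
    cols = [c for c in contribs.columns if c.endswith(f"_{q.format()}")]
    assert (contribs[cols].sum(axis=1).round(6) == 1.0).all()  # scaled shares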
-""" - -import logging -from abc import abstractmethod -from typing import override - -import pandas as pd -from pydantic import Field, field_validator - -from openstef_core.datasets import ForecastDataset, ForecastInputDataset -from openstef_core.exceptions import ( - NotFittedError, -) -from openstef_core.mixins import HyperParams -from openstef_core.types import Quantile -from openstef_models.models.forecasting.forecaster import ( - Forecaster, - ForecasterConfig, -) -from openstef_models.models.forecasting.gblinear_forecaster import ( - GBLinearForecaster, - GBLinearForecasterConfig, - GBLinearHyperParams, -) -from openstef_models.models.forecasting.lgbm_forecaster import LGBMForecaster, LGBMForecasterConfig, LGBMHyperParams -from openstef_models.models.forecasting.lgbmlinear_forecaster import ( - LGBMLinearForecaster, - LGBMLinearForecasterConfig, - LGBMLinearHyperParams, -) -from openstef_models.models.forecasting.xgboost_forecaster import ( - XGBoostForecaster, - XGBoostForecasterConfig, - XGBoostHyperParams, -) - -logger = logging.getLogger(__name__) - - -BaseLearner = LGBMForecaster | LGBMLinearForecaster | XGBoostForecaster | GBLinearForecaster -BaseLearnerHyperParams = LGBMHyperParams | LGBMLinearHyperParams | XGBoostHyperParams | GBLinearHyperParams -BaseLearnerConfig = ( - LGBMForecasterConfig | LGBMLinearForecasterConfig | XGBoostForecasterConfig | GBLinearForecasterConfig -) - - -class FinalLearner: - """Combines base learner predictions for each quantile into final predictions.""" - - @abstractmethod - def fit(self, base_learner_predictions: dict[Quantile, ForecastInputDataset]) -> None: - raise NotImplementedError("Subclasses must implement the fit method.") - - def predict(self, base_learner_predictions: dict[Quantile, ForecastInputDataset]) -> ForecastDataset: - raise NotImplementedError("Subclasses must implement the predict method.") - - @property - @abstractmethod - def is_fitted(self) -> bool: - raise NotImplementedError("Subclasses must implement the is_fitted property.") - - -class FinalForecaster(FinalLearner): - """Combines base learner predictions for each quantile into final predictions.""" - - def __init__(self, forecaster: Forecaster, feature_adders: None = None) -> None: - # Feature adders placeholder for future use - if feature_adders is not None: - raise NotImplementedError("Feature adders are not yet implemented.") - - # Split forecaster per quantile - self.quantiles = forecaster.config.quantiles - models: list[Forecaster] = [] - for q in self.quantiles: - config = forecaster.config.model_copy( - update={ - "quantiles": [q], - } - ) - model = forecaster.__class__(config=config) - models.append(model) - self.models = models - - @override - def fit(self, base_learner_predictions: dict[Quantile, ForecastInputDataset]) -> None: - for i, q in enumerate(self.quantiles): - self.models[i].fit(data=base_learner_predictions[q], data_val=None) - - @override - def predict(self, base_learner_predictions: dict[Quantile, ForecastInputDataset]) -> ForecastDataset: - if not self.is_fitted: - raise NotFittedError(self.__class__.__name__) - - # Generate predictions - predictions = [ - self.models[i].predict(data=base_learner_predictions[q]).data for i, q in enumerate(self.quantiles) - ] - - # Concatenate predictions along columns to form a DataFrame with quantile columns - df = pd.concat(predictions, axis=1) - - return ForecastDataset( - data=df, - sample_interval=base_learner_predictions[self.quantiles[0]].sample_interval, - ) - - @property - def is_fitted(self) -> bool: - 
return all(x.is_fitted for x in self.models) - - -class HybridHyperParams(HyperParams): - """Hyperparameters for Stacked LGBM GBLinear Regressor.""" - - base_hyperparams: list[BaseLearnerHyperParams] = Field( - default=[LGBMHyperParams(), GBLinearHyperParams()], - description="List of hyperparameter configurations for base learners. " - "Defaults to [LGBMHyperParams, GBLinearHyperParams].", - ) - - final_hyperparams: BaseLearnerHyperParams = Field( - default=GBLinearHyperParams(), - description="Hyperparameters for the final learner. Defaults to GBLinearHyperParams.", - ) - - add_rolling_accuracy_features: bool = Field( - default=False, - description="Whether to add rolling accuracy features from base learners as additional features " - "to the final learner. Defaults to False.", - ) - - @field_validator("base_hyperparams", mode="after") - @classmethod - def _check_classes(cls, v: list[BaseLearnerHyperParams]) -> list[BaseLearnerHyperParams]: - hp_classes = [type(hp) for hp in v] - if not len(hp_classes) == len(set(hp_classes)): - raise ValueError("Duplicate base learner hyperparameter classes are not allowed.") - return v - - -class HybridForecasterConfig(ForecasterConfig): - """Configuration for Hybrid-based forecasting models.""" - - hyperparams: HybridHyperParams = HybridHyperParams() - - verbosity: bool = Field( - default=True, - description="Enable verbose output from the Hybrid model (True/False).", - ) - - -class HybridForecaster(Forecaster): - """Wrapper for sklearn's StackingRegressor to make it compatible with HorizonForecaster.""" - - Config = HybridForecasterConfig - HyperParams = HybridHyperParams - - _config: HybridForecasterConfig - - def __init__(self, config: HybridForecasterConfig) -> None: - """Initialize the Hybrid forecaster.""" - self._config = config - - self._base_learners: list[BaseLearner] = self._init_base_learners( - base_hyperparams=config.hyperparams.base_hyperparams - ) - final_forecaster = self._init_base_learners(base_hyperparams=[config.hyperparams.final_hyperparams])[0] - self._final_learner = FinalForecaster(forecaster=final_forecaster) - - def _init_base_learners(self, base_hyperparams: list[BaseLearnerHyperParams]) -> list[BaseLearner]: - """Initialize base learners based on provided hyperparameters. - - Returns: - list[Forecaster]: List of initialized base learner forecasters. - """ - base_learners: list[BaseLearner] = [] - horizons = self.config.horizons - quantiles = self.config.quantiles - - for hyperparams in base_hyperparams: - forecaster_cls = hyperparams.forecaster_class() - config = forecaster_cls.Config(horizons=horizons, quantiles=quantiles) - if "hyperparams" in forecaster_cls.Config.model_fields: - config = config.model_copy(update={"hyperparams": hyperparams}) - - base_learners.append(config.forecaster_from_config()) - - return base_learners - - @property - @override - def is_fitted(self) -> bool: - return all(x.is_fitted for x in self._base_learners) - - @property - @override - def config(self) -> ForecasterConfig: - return self._config - - @override - def fit(self, data: ForecastInputDataset, data_val: ForecastInputDataset | None = None) -> None: - """Fit the Hybrid model to the training data. - - Args: - data: Training data in the expected ForecastInputDataset format. - data_val: Validation data for tuning the model (optional, not used in this implementation). 
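Note: the per-quantile split performed in FinalForecaster.__init__ above is just a config copy per quantile; a condensed sketch of the same pattern, reusable for any Forecaster with a quantiles config field:

def split_per_quantile(forecaster):
    # Clone the forecaster once per quantile so each sub-model handles one quantile
    return [
        forecaster.__class__(config=forecaster.config.model_copy(update={"quantiles": [q]}))
        for q in forecaster.config.quantiles
    ]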
- - """ - # Fit base learners - [x.fit(data=data, data_val=data_val) for x in self._base_learners] - - # Reset forecast start date to ensure we predict on the full dataset - full_dataset = ForecastInputDataset( - data=data.data, - sample_interval=data.sample_interval, - target_column=data.target_column, - forecast_start=data.index[0], - ) - - base_predictions = self._predict_base_learners(data=full_dataset) - - quantile_datasets = self._prepare_input_final_learner( - base_predictions=base_predictions, quantiles=self._config.quantiles, target_series=data.target_series - ) - - self._final_learner.fit( - base_learner_predictions=quantile_datasets, - ) - - self._is_fitted = True - - def _predict_base_learners(self, data: ForecastInputDataset) -> dict[type[BaseLearner], ForecastDataset]: - """Generate predictions from base learners. - - Args: - data: Input data for prediction. - - Returns: - DataFrame containing base learner predictions. - """ - base_predictions: dict[type[BaseLearner], ForecastDataset] = {} - for learner in self._base_learners: - preds = learner.predict(data=data) - base_predictions[learner.__class__] = preds - - return base_predictions - - @staticmethod - def _prepare_input_final_learner( - quantiles: list[Quantile], - base_predictions: dict[type[BaseLearner], ForecastDataset], - target_series: pd.Series, - ) -> dict[Quantile, ForecastInputDataset]: - """Prepare input data for the final learner based on base learner predictions. - - Args: - quantiles: List of quantiles to prepare data for. - base_predictions: Predictions from base learners. - target_series: Actual target series for reference. - - Returns: - dictionary mapping quantile strings to DataFrames of base learner predictions. - """ - predictions_quantiles: dict[Quantile, ForecastInputDataset] = {} - sample_interval = base_predictions[next(iter(base_predictions))].sample_interval - target_name = str(target_series.name) - - for q in quantiles: - df = pd.DataFrame({ - learner.__name__: preds.data[Quantile(q).format()] for learner, preds in base_predictions.items() - }) - df[target_name] = target_series - - predictions_quantiles[q] = ForecastInputDataset( - data=df, - sample_interval=sample_interval, - target_column=target_name, - forecast_start=df.index[0], - ) - - return predictions_quantiles - - @override - def predict(self, data: ForecastInputDataset) -> ForecastDataset: - if not self.is_fitted: - raise NotFittedError(self.__class__.__name__) - - base_predictions = self._predict_base_learners(data=data) - - final_learner_input = self._prepare_input_final_learner( - quantiles=self._config.quantiles, base_predictions=base_predictions, target_series=data.target_series - ) - - return self._final_learner.predict(base_learner_predictions=final_learner_input) - - # TODO(@Lars800): #745: Make forecaster Explainable - - -__all__ = ["HybridForecaster", "HybridForecasterConfig", "HybridHyperParams"] diff --git a/packages/openstef-models/src/openstef_models/models/forecasting/lgbm_forecaster.py b/packages/openstef-models/src/openstef_models/models/forecasting/lgbm_forecaster.py index 03c667b00..5868289d3 100644 --- a/packages/openstef-models/src/openstef_models/models/forecasting/lgbm_forecaster.py +++ b/packages/openstef-models/src/openstef_models/models/forecasting/lgbm_forecaster.py @@ -312,6 +312,43 @@ def predict(self, data: ForecastInputDataset) -> ForecastDataset: sample_interval=data.sample_interval, ) + def predict_contributions(self, data: ForecastInputDataset, *, scale: bool) -> pd.DataFrame: + """Get feature 
contributions for each prediction. + + Args: + data: Input dataset for which to compute feature contributions. + scale: If True, scale contributions to sum to 1.0 per quantile. + + Returns: + DataFrame with contributions per feature. + """ + # Get input features for prediction + input_data: pd.DataFrame = data.input_data(start=data.forecast_start) + + contributions: list[pd.DataFrame] = [] + + for i, quantile in enumerate(self.config.quantiles): + # Get model for specific quantile + model: LGBMRegressor = self._lgbm_model.models[i] # type: ignore + + # Generate contributions using LightGBM's built-in method, and remove bias term + contribs_quantile: np.ndarray[float] = model.predict(input_data, pred_contrib=True)[:, :-1] # type: ignore + + if scale: + # Scale contributions so that they sum to 1.0 per quantile + contribs_quantile = np.abs(contribs_quantile) / np.sum(np.abs(contribs_quantile), axis=1, keepdims=True) + + contributions.append( + pd.DataFrame( + data=contribs_quantile, + index=input_data.index, + columns=[f"{feature}_{quantile.format()}" for feature in input_data.columns], + ) + ) + + # Construct DataFrame + return pd.concat(contributions, axis=1) + @property @override def feature_importances(self) -> pd.DataFrame: diff --git a/packages/openstef-models/src/openstef_models/models/forecasting/lgbmlinear_forecaster.py b/packages/openstef-models/src/openstef_models/models/forecasting/lgbmlinear_forecaster.py index eace689fb..391bcceca 100644 --- a/packages/openstef-models/src/openstef_models/models/forecasting/lgbmlinear_forecaster.py +++ b/packages/openstef-models/src/openstef_models/models/forecasting/lgbmlinear_forecaster.py @@ -314,6 +314,19 @@ def predict(self, data: ForecastInputDataset) -> ForecastDataset: sample_interval=data.sample_interval, ) + @override + def predict_contributions(self, data: ForecastInputDataset, *, scale: bool) -> pd.DataFrame: + """Get feature contributions for each prediction. + + Args: + data: Input dataset for which to compute feature contributions. + scale: If True, scale contributions to sum to 1.0 per quantile. + + Returns: + DataFrame with contributions per feature. + """ + raise NotImplementedError("predict_contributions is not yet implemented for LGBMLinearForecaster") + @property @override def feature_importances(self) -> pd.DataFrame: diff --git a/packages/openstef-models/src/openstef_models/models/forecasting/xgboost_forecaster.py b/packages/openstef-models/src/openstef_models/models/forecasting/xgboost_forecaster.py index 0e371a339..c5415e2d6 100644 --- a/packages/openstef-models/src/openstef_models/models/forecasting/xgboost_forecaster.py +++ b/packages/openstef-models/src/openstef_models/models/forecasting/xgboost_forecaster.py @@ -420,6 +420,48 @@ def predict(self, data: ForecastInputDataset) -> ForecastDataset: sample_interval=data.sample_interval, ) + def predict_contributions(self, data: ForecastInputDataset, *, scale: bool) -> pd.DataFrame: + """Get feature contributions for each prediction. + + Args: + data: Input dataset for which to compute feature contributions. + scale: If True, scale contributions to sum to 1.0 per quantile. + + Returns: + DataFrame with contributions per feature. 
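Note: the contribution scaling used by the tree-based implementations is plain array arithmetic; a self-contained NumPy sketch of the two steps (signed share, then the scale=True abs-normalization), assuming a (samples, quantiles, features) contribution array with the bias column already dropped:

import numpy as np

rng = np.random.default_rng(0)
raw = rng.normal(size=(4, 3, 5))  # (samples, quantiles, features)

# Signed share of each feature in the per-quantile total
share = raw / raw.sum(axis=-1, keepdims=True)

# scale=True variant: absolute shares, renormalized to sum to 1 per quantile
scaled = np.abs(share) / np.abs(share).sum(axis=-1, keepdims=True)
assert np.allclose(scaled.sum(axis=-1), 1.0)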
+ """ + # Get input features for prediction + input_data: pd.DataFrame = data.input_data(start=data.forecast_start) + xgb_input: xgb.DMatrix = xgb.DMatrix(data=input_data) + + # Generate predictions + booster = self._xgboost_model.get_booster() + predictions_array: np.ndarray = booster.predict(xgb_input, pred_contribs=True, strict_shape=True)[:, :, :-1] + + # Remove last column + contribs = predictions_array / np.sum(predictions_array, axis=-1, keepdims=True) + + # Flatten to 2D array, name columns accordingly + contribs = contribs.reshape(contribs.shape[0], -1) + + df = pd.DataFrame( + data=contribs, + index=input_data.index, + columns=[ + f"{feature}_{quantile.format()}" for feature in input_data.columns for quantile in self.config.quantiles + ], + ) + + if scale: + # Scale contributions so that they sum to 1.0 per quantile and are positive + for q in self.config.quantiles: + quantile_cols = [col for col in df.columns if col.endswith(f"_{q.format()}")] + row_sums = df[quantile_cols].abs().sum(axis=1) + df[quantile_cols] = df[quantile_cols].abs().div(row_sums, axis=0) + + # Construct DataFrame with appropriate quantile columns + return df + @property @override def feature_importances(self) -> pd.DataFrame: diff --git a/packages/openstef-models/src/openstef_models/models/forecasting_model.py b/packages/openstef-models/src/openstef_models/models/forecasting_model.py index f2de3c4b3..9acea87fa 100644 --- a/packages/openstef-models/src/openstef_models/models/forecasting_model.py +++ b/packages/openstef-models/src/openstef_models/models/forecasting_model.py @@ -381,7 +381,7 @@ def restore_target[T: TimeSeriesDataset]( target_series = original_dataset.select_features([target_column]).select_version().data[target_column] def _transform_restore_target(df: pd.DataFrame) -> pd.DataFrame: - return df.assign(**{str(target_series.name): df.index.map(target_series)}) # pyright: ignore[reportUnknownMemberType] + return df.assign(**{str(target_series.name): df.index.map(target_series)}) # type: ignore return dataset.pipe_pandas(_transform_restore_target) diff --git a/packages/openstef-models/src/openstef_models/presets/forecasting_workflow.py b/packages/openstef-models/src/openstef_models/presets/forecasting_workflow.py index 57afe7847..b5e8efdce 100644 --- a/packages/openstef-models/src/openstef_models/presets/forecasting_workflow.py +++ b/packages/openstef-models/src/openstef_models/presets/forecasting_workflow.py @@ -25,12 +25,12 @@ from openstef_core.base_model import BaseConfig from openstef_core.mixins import TransformPipeline from openstef_core.types import LeadTime, Q, Quantile, QuantileOrGlobal +from openstef_meta.models.forecasting.residual_forecaster import ResidualForecaster from openstef_models.integrations.mlflow import MLFlowStorage, MLFlowStorageCallback from openstef_models.mixins import ModelIdentifier from openstef_models.models import ForecastingModel from openstef_models.models.forecasting.flatliner_forecaster import FlatlinerForecaster from openstef_models.models.forecasting.gblinear_forecaster import GBLinearForecaster -from openstef_models.models.forecasting.hybrid_forecaster import HybridForecaster from openstef_models.models.forecasting.lgbm_forecaster import LGBMForecaster from openstef_models.models.forecasting.lgbmlinear_forecaster import LGBMLinearForecaster from openstef_models.models.forecasting.xgboost_forecaster import XGBoostForecaster @@ -109,7 +109,7 @@ class ForecastingWorkflowConfig(BaseConfig): # PredictionJob run_name: str | None = Field(default=None, 
description="Optional name for this workflow run.") # Model configuration - model: Literal["xgboost", "gblinear", "flatliner", "hybrid", "lgbm", "lgbmlinear"] = Field( + model: Literal["xgboost", "gblinear", "flatliner", "residual", "lgbm", "lgbmlinear"] = Field( description="Type of forecasting model to use." ) # TODO(#652): Implement median forecaster quantiles: list[Quantile] = Field( @@ -145,9 +145,9 @@ class ForecastingWorkflowConfig(BaseConfig): # PredictionJob description="Hyperparameters for LightGBM forecaster.", ) - hybrid_hyperparams: HybridForecaster.HyperParams = Field( - default=HybridForecaster.HyperParams(), - description="Hyperparameters for Hybrid forecaster.", + residual_hyperparams: ResidualForecaster.HyperParams = Field( + default=ResidualForecaster.HyperParams(), + description="Hyperparameters for Residual forecaster.", ) location: LocationConfig = Field( @@ -222,7 +222,7 @@ class ForecastingWorkflowConfig(BaseConfig): # PredictionJob ) sample_weight_exponent: float = Field( default_factory=lambda data: 1.0 - if data.get("model") in {"gblinear", "lgbmlinear", "lgbm", "hybrid", "xgboost"} + if data.get("model") in {"gblinear", "lgbmlinear", "lgbm", "learned_weights", "stacking", "residual", "xgboost"} else 0.0, description="Exponent applied to scale the sample weights. " "0=uniform weights, 1=linear scaling, >1=stronger emphasis on high values. " @@ -326,9 +326,12 @@ def create_forecasting_workflow( LagsAdder( history_available=config.predict_history, horizons=config.horizons, - add_trivial_lags=config.model not in {"gblinear", "hybrid"}, # GBLinear uses only 7day lag. + add_trivial_lags=config.model + not in {"gblinear", "residual", "stacking", "learned_weights"}, # GBLinear uses only 7day lag. target_column=config.target_column, - custom_lags=[timedelta(days=7)] if config.model in {"gblinear", "hybrid"} else [], + custom_lags=[timedelta(days=7)] + if config.model in {"gblinear", "residual", "stacking", "learned_weights"} + else [], ), WindPowerFeatureAdder( windspeed_reference_column=config.wind_speed_column, @@ -353,10 +356,7 @@ def create_forecasting_workflow( ), ] feature_standardizers = [ - Clipper( - selection=Include(config.energy_price_column).combine(config.clip_features), - mode="standard", - ), + Clipper(selection=Include(config.energy_price_column).combine(config.clip_features), mode="standard"), Scaler(selection=Exclude(config.target_column), method="standard"), SampleWeighter( target_column=config.target_column, @@ -464,7 +464,8 @@ def create_forecasting_workflow( QuantileSorter(), ConfidenceIntervalApplicator(quantiles=config.quantiles), ] - elif config.model == "hybrid": + + elif config.model == "residual": preprocessing = [ *checks, *feature_adders, @@ -478,11 +479,11 @@ def create_forecasting_workflow( selection=Exclude(config.target_column), ), ] - forecaster = HybridForecaster( - config=HybridForecaster.Config( + forecaster = ResidualForecaster( + config=ResidualForecaster.Config( quantiles=config.quantiles, horizons=config.horizons, - hyperparams=config.hybrid_hyperparams, + hyperparams=config.residual_hyperparams, ) ) postprocessing = [QuantileSorter()] diff --git a/packages/openstef-models/src/openstef_models/transforms/general/__init__.py b/packages/openstef-models/src/openstef_models/transforms/general/__init__.py index 32a8b979c..e601043c1 100644 --- a/packages/openstef-models/src/openstef_models/transforms/general/__init__.py +++ b/packages/openstef-models/src/openstef_models/transforms/general/__init__.py @@ -13,6 +13,7 @@ from 
openstef_models.transforms.general.empty_feature_remover import (
     EmptyFeatureRemover,
 )
+from openstef_models.transforms.general.flagger import Flagger
 from openstef_models.transforms.general.imputer import Imputer
 from openstef_models.transforms.general.nan_dropper import NaNDropper
 from openstef_models.transforms.general.sample_weighter import SampleWeighter
@@ -23,6 +24,7 @@
     "Clipper",
     "DimensionalityReducer",
     "EmptyFeatureRemover",
+    "Flagger",
     "Imputer",
     "NaNDropper",
     "SampleWeighter",
diff --git a/packages/openstef-models/src/openstef_models/transforms/general/flagger.py b/packages/openstef-models/src/openstef_models/transforms/general/flagger.py
new file mode 100644
index 000000000..5c3675148
--- /dev/null
+++ b/packages/openstef-models/src/openstef_models/transforms/general/flagger.py
@@ -0,0 +1,97 @@
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project
+#
+# SPDX-License-Identifier: MPL-2.0
+
+"""Transform for flagging feature values inside or outside observed training ranges.
+
+This module provides functionality to flag whether feature values fall within the
+minimum and maximum ranges observed during training. It is useful for flagging data
+drift and can inform forecast combiners which models might perform better.
+"""
+
+from typing import override
+
+import pandas as pd
+from pydantic import Field, PrivateAttr
+
+from openstef_core.base_model import BaseConfig
+from openstef_core.datasets import TimeSeriesDataset
+from openstef_core.exceptions import NotFittedError
+from openstef_core.transforms import TimeSeriesTransform
+from openstef_models.utils.feature_selection import FeatureSelection
+
+
+class Flagger(BaseConfig, TimeSeriesTransform):
+    """Transform that flags whether feature values lie within their observed training range.
+
+    The resulting 0/1 flags let a downstream metalearner know when to expect outliers,
+    i.e. when a model has to extrapolate beyond its training set.
+
+
+    Example:
+        >>> import pandas as pd
+        >>> from datetime import timedelta
+        >>> from openstef_core.datasets import TimeSeriesDataset
+        >>> from openstef_models.transforms.general import Flagger
+        >>> from openstef_models.utils.feature_selection import FeatureSelection
+        >>> # Create sample training dataset
+        >>> training_data = pd.DataFrame({
        ...     'load': [100, 90, 110],
        ...     'temperature': [19, 20, 21]
        ... }, index=pd.date_range('2025-01-01', periods=3, freq='1h'))
+        >>> training_dataset = TimeSeriesDataset(training_data, timedelta(hours=1))
+        >>> test_data = pd.DataFrame({
        ...     'load': [90, 140, 100],
        ...     'temperature': [18, 20, 22]
        ... }, index=pd.date_range('2025-01-06', periods=3,
        ...
freq='1h')) + >>> test_dataset = TimeSeriesDataset(test_data, timedelta(hours=1)) + >>> # Initialize and apply transform + >>> flagger = Flagger(selection=FeatureSelection(include=['load', 'temperature'])) + >>> flagger.fit(training_dataset) + >>> transformed_dataset = flagger.transform(test_dataset) + >>> transformed_dataset.data['load'].tolist() + [0, 0, 1] + >>> transformed_dataset.data['temperature'].tolist() + [0, 1, 0] + + """ + + selection: FeatureSelection = Field(default=FeatureSelection.ALL, description="Features to flag.") + + _feature_mins: pd.Series = PrivateAttr(default_factory=pd.Series) + _feature_maxs: pd.Series = PrivateAttr(default_factory=pd.Series) + _is_fitted: bool = PrivateAttr(default=False) + + @property + @override + def is_fitted(self) -> bool: + return self._is_fitted + + @override + def fit(self, data: TimeSeriesDataset) -> None: + features = self.selection.resolve(data.feature_names) + self._feature_mins = data.data.reindex(features, axis=1).min() + self._feature_maxs = data.data.reindex(features, axis=1).max() + self._is_fitted = True + + @override + def transform(self, data: TimeSeriesDataset) -> TimeSeriesDataset: + if not self._is_fitted: + raise NotFittedError(self.__class__.__name__) + + features = self.selection.resolve(data.feature_names) + transformed_data = data.data.copy(deep=False).loc[:, features] + + # compute min & max of the features + min_aligned = self._feature_mins.reindex(features) + max_aligned = self._feature_maxs.reindex(features) + + outside = (transformed_data[features] <= min_aligned) | (transformed_data[features] >= max_aligned) + transformed_data = (~outside).astype(int) + + return TimeSeriesDataset(data=transformed_data, sample_interval=data.sample_interval) + + @override + def features_added(self) -> list[str]: + return [] diff --git a/packages/openstef-models/src/openstef_models/transforms/general/selector.py b/packages/openstef-models/src/openstef_models/transforms/general/selector.py index f6ce646ee..38f7c68bc 100644 --- a/packages/openstef-models/src/openstef_models/transforms/general/selector.py +++ b/packages/openstef-models/src/openstef_models/transforms/general/selector.py @@ -72,7 +72,6 @@ def fit(self, data: TimeSeriesDataset) -> None: @override def transform(self, data: TimeSeriesDataset) -> TimeSeriesDataset: - features = self.selection.resolve(data.feature_names) transformed_data = data.data.drop(columns=[col for col in data.feature_names if col not in features]) diff --git a/packages/openstef-models/src/openstef_models/utils/multi_quantile_regressor.py b/packages/openstef-models/src/openstef_models/utils/multi_quantile_regressor.py index 763932268..b95fbc28c 100644 --- a/packages/openstef-models/src/openstef_models/utils/multi_quantile_regressor.py +++ b/packages/openstef-models/src/openstef_models/utils/multi_quantile_regressor.py @@ -41,7 +41,7 @@ def __init__( base_learner: A scikit-learn compatible regressor class that supports quantile regression. quantile_param: The name of the parameter in base_learner that sets the quantile level. quantiles: List of quantiles to predict (e.g., [0.1, 0.5, 0.9]). - hyperparams: Dictionary of hyperparameters to pass to each base learner instance. + hyperparams: Dictionary of hyperparameters to pass to each estimator instance. 
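Note: the multi-quantile wrapper's core idea, one estimator per quantile with the quantile level injected via the configured parameter name, can be sketched standalone. scikit-learn's GradientBoostingRegressor stands in here for the actual base learner (an assumption; its 'alpha' parameter plays the role of quantile_param):

from sklearn.ensemble import GradientBoostingRegressor

quantiles = [0.1, 0.5, 0.9]
hyperparams = {"n_estimators": 25}

# One estimator per quantile; each is trained on the same X, y and the
# per-quantile predictions are stacked into the final output
models = [
    GradientBoostingRegressor(loss="quantile", alpha=q, **hyperparams)
    for q in quantiles
]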
""" self.quantiles = quantiles self.hyperparams = hyperparams @@ -56,7 +56,7 @@ def _init_model(self, q: float) -> BaseEstimator: base_learner = self.base_learner(**params) if self.quantile_param not in base_learner.get_params(): # type: ignore - msg = f"The base learner does not support the quantile parameter '{self.quantile_param}'." + msg = f"The base estimator does not support the quantile parameter '{self.quantile_param}'." raise ValueError(msg) return base_learner @@ -149,9 +149,9 @@ def models(self) -> list[BaseEstimator]: @property def has_feature_names(self) -> bool: - """Check if the base learners have feature names. + """Check if the base estimators have feature names. Returns: - True if the base learners have feature names, False otherwise. + True if the base estimators have feature names, False otherwise. """ return len(self.model_feature_names) > 0 diff --git a/packages/openstef-models/src/openstef_models/workflows/custom_forecasting_workflow.py b/packages/openstef-models/src/openstef_models/workflows/custom_forecasting_workflow.py index d2f517c15..5fbeac8a0 100644 --- a/packages/openstef-models/src/openstef_models/workflows/custom_forecasting_workflow.py +++ b/packages/openstef-models/src/openstef_models/workflows/custom_forecasting_workflow.py @@ -18,6 +18,7 @@ from openstef_core.datasets import TimeSeriesDataset, VersionedTimeSeriesDataset from openstef_core.datasets.validated_datasets import ForecastDataset from openstef_core.exceptions import NotFittedError, SkipFitting +from openstef_meta.models.ensemble_forecasting_model import EnsembleForecastingModel, EnsembleModelFitResult from openstef_models.mixins import ModelIdentifier, PredictorCallback from openstef_models.mixins.callbacks import WorkflowContext from openstef_models.models.forecasting_model import ForecastingModel, ModelFitResult @@ -117,7 +118,7 @@ class CustomForecastingWorkflow(BaseModel): ... ) # doctest: +SKIP """ - model: ForecastingModel = Field(description="The forecasting model to use.") + model: ForecastingModel | EnsembleForecastingModel = Field(description="The forecasting model to use.") callbacks: list[ForecastingCallback] = Field( default_factory=list[ForecastingCallback], description="List of callbacks to execute during workflow events." ) @@ -135,7 +136,7 @@ def fit( data: TimeSeriesDataset, data_val: TimeSeriesDataset | None = None, data_test: TimeSeriesDataset | None = None, - ) -> ModelFitResult | None: + ) -> ModelFitResult | EnsembleModelFitResult | None: """Train the forecasting model with callback execution. 
Executes the complete training workflow including pre-fit callbacks, @@ -158,6 +159,10 @@ def fit( result = self.model.fit(data=data, data_val=data_val, data_test=data_test) + if isinstance(result, EnsembleModelFitResult): + self._logger.debug("Discarding EnsembleModelFitResult for compatibility.") + result = result.combiner_fit_result + for callback in self.callbacks: callback.on_fit_end(context=context, result=result) except SkipFitting as e: diff --git a/packages/openstef-models/tests/unit/models/forecasting/test_gblinear_forecaster.py b/packages/openstef-models/tests/unit/models/forecasting/test_gblinear_forecaster.py index 58ca98159..24d3b8e01 100644 --- a/packages/openstef-models/tests/unit/models/forecasting/test_gblinear_forecaster.py +++ b/packages/openstef-models/tests/unit/models/forecasting/test_gblinear_forecaster.py @@ -4,6 +4,7 @@ from datetime import timedelta +import numpy as np import pandas as pd import pytest @@ -132,3 +133,32 @@ def test_gblinear_forecaster__feature_importances( col_sums = feature_importances.sum(axis=0) pd.testing.assert_series_equal(col_sums, pd.Series(1.0, index=expected_columns), atol=1e-10) assert (feature_importances >= 0).all().all() + + +def test_gblinear_forecaster_predict_contributions( + sample_forecast_input_dataset: ForecastInputDataset, + base_config: GBLinearForecasterConfig, +): + """Test basic fit and predict workflow with output validation.""" + # Arrange + expected_quantiles = base_config.quantiles + forecaster = GBLinearForecaster(config=base_config) + + # Act + forecaster.fit(sample_forecast_input_dataset) + result = forecaster.predict_contributions(sample_forecast_input_dataset, scale=True) + + # Assert + # Basic functionality + assert forecaster.is_fitted, "Model should be fitted after calling fit()" + + # Check that necessary quantiles are present + input_features = sample_forecast_input_dataset.input_data().columns + expected_columns = [f"{col}_{q.format()}" for col in input_features for q in expected_quantiles] + assert list(result.columns) == expected_columns, f"Expected columns {expected_columns}, got {list(result.columns)}" + + # Contributions should sum to 1.0 per quantile + for q in expected_quantiles: + quantile_cols = [col for col in result.columns if col.endswith(f"_{q.format()}")] + col_sums = result[quantile_cols].sum(axis=1) + pd.testing.assert_series_equal(col_sums, pd.Series(1.0, index=result.index, dtype=np.float32), atol=1e-10) diff --git a/packages/openstef-models/tests/unit/models/forecasting/test_hybrid_forecaster.py b/packages/openstef-models/tests/unit/models/forecasting/test_hybrid_forecaster.py deleted file mode 100644 index 4e36e125d..000000000 --- a/packages/openstef-models/tests/unit/models/forecasting/test_hybrid_forecaster.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project -# -# SPDX-License-Identifier: MPL-2.0 - -from datetime import timedelta - -import pytest - -from openstef_core.datasets import ForecastInputDataset -from openstef_core.exceptions import NotFittedError -from openstef_core.types import LeadTime, Q -from openstef_models.models.forecasting.hybrid_forecaster import ( - HybridForecaster, - HybridForecasterConfig, - HybridHyperParams, -) - - -@pytest.fixture -def base_config() -> HybridForecasterConfig: - """Base configuration for Hybrid forecaster tests.""" - - params = HybridHyperParams() - return HybridForecasterConfig( - quantiles=[Q(0.1), Q(0.5), Q(0.9)], - horizons=[LeadTime(timedelta(days=1))], - hyperparams=params, - 
verbosity=False, - ) - - -def test_hybrid_forecaster_fit_predict( - sample_forecast_input_dataset: ForecastInputDataset, - base_config: HybridForecasterConfig, -): - """Test basic fit and predict workflow with comprehensive output validation.""" - # Arrange - expected_quantiles = base_config.quantiles - forecaster = HybridForecaster(config=base_config) - - # Act - forecaster.fit(sample_forecast_input_dataset) - result = forecaster.predict(sample_forecast_input_dataset) - - # Assert - # Basic functionality - assert forecaster.is_fitted, "Model should be fitted after calling fit()" - - # Check that necessary quantiles are present - expected_columns = [q.format() for q in expected_quantiles] - assert list(result.data.columns) == expected_columns, ( - f"Expected columns {expected_columns}, got {list(result.data.columns)}" - ) - - # Forecast data quality - assert not result.data.isna().any().any(), "Forecast should not contain NaN or None values" - - -def test_hybrid_forecaster_predict_not_fitted_raises_error( - sample_forecast_input_dataset: ForecastInputDataset, - base_config: HybridForecasterConfig, -): - """Test that predict() raises NotFittedError when called before fit().""" - # Arrange - forecaster = HybridForecaster(config=base_config) - - # Act & Assert - with pytest.raises(NotFittedError, match="HybridForecaster"): - forecaster.predict(sample_forecast_input_dataset) - - -def test_hybrid_forecaster_with_sample_weights( - sample_dataset_with_weights: ForecastInputDataset, - base_config: HybridForecasterConfig, -): - """Test that forecaster works with sample weights and produces different results.""" - # Arrange - forecaster_with_weights = HybridForecaster(config=base_config) - - # Create dataset without weights for comparison - data_without_weights = ForecastInputDataset( - data=sample_dataset_with_weights.data.drop(columns=["sample_weight"]), - sample_interval=sample_dataset_with_weights.sample_interval, - target_column=sample_dataset_with_weights.target_column, - forecast_start=sample_dataset_with_weights.forecast_start, - ) - forecaster_without_weights = HybridForecaster(config=base_config) - - # Act - forecaster_with_weights.fit(sample_dataset_with_weights) - forecaster_without_weights.fit(data_without_weights) - - # Predict using data without sample_weight column (since that's used for training, not prediction) - result_with_weights = forecaster_with_weights.predict(data_without_weights) - result_without_weights = forecaster_without_weights.predict(data_without_weights) - - # Assert - # Both should produce valid forecasts - assert not result_with_weights.data.isna().any().any(), "Weighted forecast should not contain NaN values" - assert not result_without_weights.data.isna().any().any(), "Unweighted forecast should not contain NaN values" - - # Sample weights should affect the model, so results should be different - # (This is a statistical test - with different weights, predictions should differ) - differences = (result_with_weights.data - result_without_weights.data).abs() - assert differences.sum().sum() > 0, "Sample weights should affect model predictions" diff --git a/packages/openstef-models/tests/unit/models/forecasting/test_lgbm_forecaster.py b/packages/openstef-models/tests/unit/models/forecasting/test_lgbm_forecaster.py index b4fe1c989..886da0ce6 100644 --- a/packages/openstef-models/tests/unit/models/forecasting/test_lgbm_forecaster.py +++ b/packages/openstef-models/tests/unit/models/forecasting/test_lgbm_forecaster.py @@ -146,4 +146,35 @@ def 
test_lgbm_forecaster__feature_importances( assert (feature_importances >= 0).all().all() +def test_lgbm_forecaster_predict_contributions( + sample_forecast_input_dataset: ForecastInputDataset, + base_config: LGBMForecasterConfig, +): + """Test basic fit and predict workflow with output validation.""" + # Arrange + expected_quantiles = base_config.quantiles + forecaster = LGBMForecaster(config=base_config) + + # Act + forecaster.fit(sample_forecast_input_dataset) + result = forecaster.predict_contributions(sample_forecast_input_dataset, scale=True) + + # Assert + # Basic functionality + assert forecaster.is_fitted, "Model should be fitted after calling fit()" + + # Check that necessary quantiles are present + input_features = sample_forecast_input_dataset.input_data().columns + expected_columns = [f"{col}_{q.format()}" for col in input_features for q in expected_quantiles] + assert sorted(result.columns) == sorted(expected_columns), ( + f"Expected columns {expected_columns}, got {list(result.columns)}" + ) + + # Contributions should sum to 1.0 per quantile + for q in expected_quantiles: + quantile_cols = [col for col in result.columns if col.endswith(f"_{q.format()}")] + col_sums = result[quantile_cols].sum(axis=1) + pd.testing.assert_series_equal(col_sums, pd.Series(1.0, index=result.index), atol=1e-10) + + # TODO(@MvLieshout): Add tests on different loss functions # noqa: TD003 diff --git a/packages/openstef-models/tests/unit/models/forecasting/test_xgboost_forecaster.py b/packages/openstef-models/tests/unit/models/forecasting/test_xgboost_forecaster.py index 9e51d1047..91fddda99 100644 --- a/packages/openstef-models/tests/unit/models/forecasting/test_xgboost_forecaster.py +++ b/packages/openstef-models/tests/unit/models/forecasting/test_xgboost_forecaster.py @@ -22,7 +22,7 @@ def base_config() -> XGBoostForecasterConfig: """Base configuration for XGBoost forecaster tests.""" return XGBoostForecasterConfig( horizons=[LeadTime(timedelta(days=1))], - quantiles=[Q(0.1), Q(0.5), Q(0.9)], + quantiles=[Q(0.1), Q(0.3), Q(0.5), Q(0.7), Q(0.9)], hyperparams=XGBoostHyperParams( n_estimators=10, # Small for fast tests ), @@ -167,3 +167,32 @@ def test_xgboost_forecaster__feature_importances( col_sums = feature_importances.sum(axis=0) pd.testing.assert_series_equal(col_sums, pd.Series(1.0, index=expected_columns), atol=1e-10) assert (feature_importances >= 0).all().all() + + +def test_xgboost_forecaster_predict_contributions( + sample_forecast_input_dataset: ForecastInputDataset, + base_config: XGBoostForecasterConfig, +): + """Test basic fit and predict workflow with output validation.""" + # Arrange + expected_quantiles = base_config.quantiles + forecaster = XGBoostForecaster(config=base_config) + + # Act + forecaster.fit(sample_forecast_input_dataset) + result = forecaster.predict_contributions(sample_forecast_input_dataset, scale=True) + + # Assert + # Basic functionality + assert forecaster.is_fitted, "Model should be fitted after calling fit()" + + # Check that necessary quantiles are present + input_features = sample_forecast_input_dataset.input_data().columns + expected_columns = [f"{col}_{q.format()}" for col in input_features for q in expected_quantiles] + assert list(result.columns) == expected_columns, f"Expected columns {expected_columns}, got {list(result.columns)}" + + # Contributions should sum to 1.0 per quantile + for q in expected_quantiles: + quantile_cols = [col for col in result.columns if col.endswith(f"_{q.format()}")] + col_sums = result[quantile_cols].sum(axis=1) + 
pd.testing.assert_series_equal(col_sums, pd.Series(1.0, index=result.index), atol=1e-10, check_dtype=False)
diff --git a/packages/openstef-models/tests/unit/transforms/general/test_flagger.py b/packages/openstef-models/tests/unit/transforms/general/test_flagger.py
new file mode 100644
index 000000000..b250099f4
--- /dev/null
+++ b/packages/openstef-models/tests/unit/transforms/general/test_flagger.py
@@ -0,0 +1,62 @@
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project
+#
+# SPDX-License-Identifier: MPL-2.0
+
+from datetime import timedelta
+
+import pandas as pd
+import pytest
+
+from openstef_core.datasets import TimeSeriesDataset
+from openstef_models.transforms.general import Flagger
+from openstef_models.utils.feature_selection import FeatureSelection
+
+
+@pytest.fixture
+def train_dataset() -> TimeSeriesDataset:
+    """Training dataset with three features A, B, C."""
+    return TimeSeriesDataset(
+        data=pd.DataFrame(
+            {"A": [1.0, 2.0, 3.0], "B": [1.0, 2.0, 3.0], "C": [1.0, 2.0, 3.0]},
+            index=pd.date_range("2025-01-01", periods=3, freq="1h"),
+        ),
+        sample_interval=timedelta(hours=1),
+    )
+
+
+@pytest.fixture
+def test_dataset() -> TimeSeriesDataset:
+    """Test dataset with values outside training ranges."""
+    return TimeSeriesDataset(
+        data=pd.DataFrame(
+            {"A": [2, 2], "B": [0.0, 2.0], "C": [1, 4]},
+            index=pd.date_range("2025-01-06", periods=2, freq="1h"),
+        ),
+        sample_interval=timedelta(hours=1),
+    )
+
+
+def test_flagger__fit_transform(
+    train_dataset: TimeSeriesDataset,
+    test_dataset: TimeSeriesDataset,
+):
+    """Test that fit and transform flag in-range values with 1 and at/out-of-range values with 0."""
+    # Arrange
+    flagger = Flagger(selection=FeatureSelection(include={"A", "B", "C"}))
+
+    # Act
+    flagger.fit(train_dataset)
+    transformed_dataset = flagger.transform(test_dataset)
+
+    # Assert
+    # A stays inside the training range; B is out of range once; C hits both bounds
+    expected_df = pd.DataFrame(
+        {
+            "A": [1, 1],
+            "B": [0, 1],
+            "C": [0, 0],  # Both values at or outside the training range
+        },
+        index=test_dataset.index,
+    )
+    pd.testing.assert_frame_equal(transformed_dataset.data, expected_df)
+    assert transformed_dataset.sample_interval == test_dataset.sample_interval
diff --git a/pyproject.toml b/pyproject.toml
index b81bb5eba..eb92e37fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ optional-dependencies.beam = [
   "openstef-beam",
 ]
 optional-dependencies.models = [
+  "openstef-meta",
   "openstef-models[xgb-cpu]",
 ]
 urls.Documentation = "https://openstef.github.io/openstef/index.html"
@@ -77,6 +78,7 @@ openstef-beam = { workspace = true }
 openstef-models = { workspace = true }
 openstef-docs = { workspace = true }
 openstef-core = { workspace = true }
+openstef-meta = { workspace = true }
 microsoft-python-type-stubs = { git = "git+https://github.com/microsoft/python-type-stubs.git" }
 
 [tool.uv.workspace]
@@ -85,6 +87,7 @@ members = [
   "examples",
   "packages/openstef-beam",
   "packages/openstef-core",
+  "packages/openstef-meta",
   "packages/openstef-models",
 ]
 
@@ -191,6 +194,7 @@ source = [
   "packages/openstef-beam/src",
   "packages/openstef-models/src",
   "packages/openstef-core/src",
+  "packages/openstef-meta/src",
 ]
 omit = [
   "tests/*",
diff --git a/uv.lock b/uv.lock
index 92b9273aa..dde23531c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -13,6 +13,7 @@ members = [
     "openstef-core",
     "openstef-docs",
     "openstef-examples",
+    "openstef-meta",
     "openstef-models",
 ]
 
@@ -2697,6 +2698,7 @@ beam = [
     { name = "openstef-beam" },
 ]
 models = [
+    { name = "openstef-meta" },
     { name = "openstef-models", extra = ["xgb-cpu"] },
 ]
 
@@ -2728,6 +2730,7 @@ requires-dist = [
     {
name = "openstef-beam", extras = ["all"], marker = "extra == 'all'", editable = "packages/openstef-beam" }, { name = "openstef-core", editable = "packages/openstef-core" }, { name = "openstef-core", marker = "extra == 'all'", editable = "packages/openstef-core" }, + { name = "openstef-meta", marker = "extra == 'models'", editable = "packages/openstef-meta" }, { name = "openstef-models", extras = ["xgb-cpu"], editable = "packages/openstef-models" }, { name = "openstef-models", extras = ["xgb-cpu"], marker = "extra == 'all'", editable = "packages/openstef-models" }, { name = "openstef-models", extras = ["xgb-cpu"], marker = "extra == 'models'", editable = "packages/openstef-models" }, @@ -2849,6 +2852,23 @@ requires-dist = [ { name = "sphinx-pyproject", specifier = ">=0.3.0" }, ] +[[package]] +name = "openstef-meta" +version = "0.0.0" +source = { editable = "packages/openstef-meta" } +dependencies = [ + { name = "openstef-beam" }, + { name = "openstef-core" }, + { name = "openstef-models" }, +] + +[package.metadata] +requires-dist = [ + { name = "openstef-beam", editable = "packages/openstef-beam" }, + { name = "openstef-core", editable = "packages/openstef-core" }, + { name = "openstef-models", editable = "packages/openstef-models" }, +] + [[package]] name = "openstef-examples" version = "0.0.0"