From 8cf0da443433b5e0caac2b45fd4453976b98104b Mon Sep 17 00:00:00 2001 From: root Date: Mon, 24 Nov 2025 14:06:26 +0100 Subject: [PATCH 01/15] Add support to set pids to track for remote tracking --- .../api/infra/database/sql_models.py | 2 + .../api/infra/repositories/repository_runs.py | 2 + .../edcd10edf11d_add_metadata_in_run_table.py | 2 + codecarbon/core/api_client.py | 1 + codecarbon/core/resource_tracker.py | 4 ++ codecarbon/core/schemas.py | 1 + codecarbon/emissions_tracker.py | 10 ++++ codecarbon/external/hardware.py | 59 +++++++++++++++--- codecarbon/external/ram.py | 60 ++++++++++++++----- codecarbon/external/task.py | 1 + codecarbon/output_methods/emissions_data.py | 2 + docs/_sources/output.rst.txt | 2 + docs/_sources/parameters.rst.txt | 2 + 13 files changed, 127 insertions(+), 21 deletions(-) diff --git a/carbonserver/carbonserver/api/infra/database/sql_models.py b/carbonserver/carbonserver/api/infra/database/sql_models.py index 337771697..7205a47a9 100644 --- a/carbonserver/carbonserver/api/infra/database/sql_models.py +++ b/carbonserver/carbonserver/api/infra/database/sql_models.py @@ -54,6 +54,7 @@ class Run(Base): provider = Column(String, nullable=True) ram_total_size = Column(Float, nullable=True) tracking_mode = Column(String, nullable=True) + tracking_pids = Column(Integer, nullable=True) experiment = relationship("Experiment", back_populates="runs") emissions = relationship( "Emission", back_populates="run", cascade="all, delete-orphan" @@ -77,6 +78,7 @@ def __repr__(self): f'provider="{self.provider}")>,' f'ram_total_size="{self.ram_total_size}")>,' f'tracking_mode="{self.tracking_mode}")>,' + f'tracking_pids="{self.tracking_pids}")>,' ) diff --git a/carbonserver/carbonserver/api/infra/repositories/repository_runs.py b/carbonserver/carbonserver/api/infra/repositories/repository_runs.py index 84d2202b6..2d7f7a6b6 100644 --- a/carbonserver/carbonserver/api/infra/repositories/repository_runs.py +++ b/carbonserver/carbonserver/api/infra/repositories/repository_runs.py @@ -48,6 +48,7 @@ def add_run(self, run: RunCreate) -> Run: provider=run.provider, ram_total_size=run.ram_total_size, tracking_mode=run.tracking_mode, + tracking_pids=run.tracking_pids, ) session.add(db_run) session.commit() @@ -114,6 +115,7 @@ def map_sql_to_schema(run: SqlModelRun) -> Run: provider=run.provider, ram_total_size=run.ram_total_size, tracking_mode=run.tracking_mode, + tracking_pids=run.tracking_pids, ) def get_experiment_detailed_sums_by_run( diff --git a/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py b/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py index 754a78430..f39720c58 100644 --- a/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py +++ b/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py @@ -29,6 +29,7 @@ def upgrade(): op.add_column("runs", sa.Column("provider", sa.String)) op.add_column("runs", sa.Column("ram_total_size", sa.Float)) op.add_column("runs", sa.Column("tracking_mode", sa.String)) + op.add_column("runs", sa.Column("tracking_pids", sa.Integer)) def downgrade(): @@ -44,3 +45,4 @@ def downgrade(): op.drop_column("runs", "provider") op.drop_column("runs", "ram_total_size") op.drop_column("runs", "tracking_mode") + op.drop_column("runs", "tracking_pids") \ No newline at end of file diff --git a/codecarbon/core/api_client.py b/codecarbon/core/api_client.py index 34067c71c..94b78a9ae 100644 --- a/codecarbon/core/api_client.py +++ b/codecarbon/core/api_client.py @@ -270,6 +270,7 @@ def _create_run(self, experiment_id: str): provider=self.conf.get("provider"), ram_total_size=self.conf.get("ram_total_size"), tracking_mode=self.conf.get("tracking_mode"), + tracking_pids=self.conf.get("tracking_pids"), ) payload = dataclasses.asdict(run) url = self.url + "/runs" diff --git a/codecarbon/core/resource_tracker.py b/codecarbon/core/resource_tracker.py index 120faf4ef..478b07e52 100644 --- a/codecarbon/core/resource_tracker.py +++ b/codecarbon/core/resource_tracker.py @@ -28,6 +28,7 @@ def set_RAM_tracking(self): self.ram_tracker = "RAM power estimation model" ram = RAM( tracking_mode=self.tracker._tracking_mode, + tracking_pids=self.tracker._tracking_pids, force_ram_power=self.tracker._force_ram_power, ) self.tracker._conf["ram_total_size"] = ram.machine_memory_GB @@ -46,6 +47,7 @@ def _setup_cpu_load_mode(self, tdp, max_power): model, max_power, tracking_mode=self.tracker._tracking_mode, + tracking_pids=self.tracker._tracking_pids, ) self.cpu_tracker = MODE_CPU_LOAD self.tracker._conf["cpu_model"] = hardware_cpu.get_model() @@ -141,6 +143,7 @@ def _setup_fallback_tracking(self, tdp, max_power): model, max_power, tracking_mode=self.tracker._tracking_mode, + tracking_pids=self.tracker._tracking_pids, ) self.cpu_tracker = MODE_CPU_LOAD else: @@ -163,6 +166,7 @@ def _setup_fallback_tracking(self, tdp, max_power): model, max_power, tracking_mode=self.tracker._tracking_mode, + tracking_pids=self.tracker._tracking_pids, ) self.cpu_tracker = MODE_CPU_LOAD else: diff --git a/codecarbon/core/schemas.py b/codecarbon/core/schemas.py index 225deb6a5..64e8e9cca 100644 --- a/codecarbon/core/schemas.py +++ b/codecarbon/core/schemas.py @@ -49,6 +49,7 @@ class RunBase: provider: Optional[str] ram_total_size: Optional[float] tracking_mode: Optional[str] + tracking_pids: Optional[int] class RunCreate(RunBase): diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index 2bb73cbd2..40d0d1392 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -176,6 +176,7 @@ def __init__( str ] = _sentinel, # Deprecated, use electricitymaps_api_token tracking_mode: Optional[str] = _sentinel, + tracking_pids: Optional[int] = _sentinel, log_level: Optional[Union[int, str]] = _sentinel, on_csv_write: Optional[str] = _sentinel, logger_preamble: Optional[str] = _sentinel, @@ -231,6 +232,8 @@ def __init__( power consumption due to the entire machine or to try and isolate the tracked processe's in isolation. Defaults to "machine". + :param tracking_pids: PID of the process to be tracked when using "process" mode. + Defaults to None, which means the current process. :param log_level: Global codecarbon log level. Accepts one of: {"debug", "info", "warning", "error", "critical"}. Defaults to "info". @@ -314,6 +317,7 @@ def __init__( self._set_from_conf(prometheus_url, "prometheus_url", "localhost:9091") self._set_from_conf(output_handlers, "output_handlers", []) self._set_from_conf(tracking_mode, "tracking_mode", "machine") + self._set_from_conf(tracking_pids, "tracking_pids", None, int) self._set_from_conf(on_csv_write, "on_csv_write", "append") self._set_from_conf(logger_preamble, "logger_preamble", "") self._set_from_conf(force_cpu_power, "force_cpu_power", None, float) @@ -807,6 +811,7 @@ def _prepare_emissions_data(self) -> EmissionsData: latitude=self._conf.get("latitude"), ram_total_size=self._conf.get("ram_total_size"), tracking_mode=self._conf.get("tracking_mode"), + tracking_pids=self._conf.get("tracking_pids"), pue=self._pue, wue=self._wue, ) @@ -1163,6 +1168,7 @@ def track_emissions( str ] = _sentinel, # Deprecated, use electricitymaps_api_token tracking_mode: Optional[str] = _sentinel, + tracking_pids: Optional[int] = _sentinel, log_level: Optional[Union[int, str]] = _sentinel, on_csv_write: Optional[str] = _sentinel, logger_preamble: Optional[str] = _sentinel, @@ -1223,6 +1229,8 @@ def track_emissions( power consumption due to the entire machine or to try and isolate the tracked processe's in isolation. Defaults to "machine". + :param tracking_pids: PID of the process to be tracked when using "process" mode. + Defaults to None, which means the current process. :param log_level: Global codecarbon log level. Accepts one of: {"debug", "info", "warning", "error", "critical"}. Defaults to "info". @@ -1297,6 +1305,7 @@ def wrapped_fn(*args, **kwargs): gpu_ids=gpu_ids, electricitymaps_api_token=_electricitymaps_token, tracking_mode=tracking_mode, + tracking_pids=tracking_pids, log_level=log_level, on_csv_write=on_csv_write, logger_preamble=logger_preamble, @@ -1336,6 +1345,7 @@ def wrapped_fn(*args, **kwargs): experiment_name=experiment_name, electricitymaps_api_token=_electricitymaps_token, tracking_mode=tracking_mode, + tracking_pids=tracking_pids, log_level=log_level, on_csv_write=on_csv_write, logger_preamble=logger_preamble, diff --git a/codecarbon/external/hardware.py b/codecarbon/external/hardware.py index 30ce125ee..8625289e0 100644 --- a/codecarbon/external/hardware.py +++ b/codecarbon/external/hardware.py @@ -168,6 +168,7 @@ def __init__( tdp: int, rapl_dir: str = "/sys/class/powercap/intel-rapl/subsystem", tracking_mode: str = "machine", + tracking_pids: int = None, rapl_include_dram: bool = False, rapl_prefer_psys: bool = False, ): @@ -179,9 +180,19 @@ def __init__( self._tdp = tdp self._is_generic_tdp = False self._tracking_mode = tracking_mode - self._pid = psutil.Process().pid + self._tracking_pids = tracking_pids self._cpu_count = count_cpus() - self._process = psutil.Process(self._pid) + + if tracking_pids is not None: + # Make list if it is not already a list + if not isinstance(tracking_pids, list): + self._tracking_pids = [tracking_pids] + else: + self._tracking_pids = tracking_pids + else: + self._tracking_pids = [psutil.Process().pid] + + if self._mode == "intel_power_gadget": self._intel_interface = IntelPowerGadget(self._output_dir) @@ -231,7 +242,7 @@ def _calculate_power_from_cpu_load_treadripper(tdp, cpu_load): def _get_power_from_cpu_load(self): """ When in MODE_CPU_LOAD - """ + """ if self._tracking_mode == "machine": tdp = self._tdp cpu_load = psutil.cpu_percent( @@ -245,12 +256,44 @@ def _get_power_from_cpu_load(self): f"CPU load {self._tdp} W and {cpu_load:.1f}% {load_factor=} => estimation of {power} W for whole machine." ) elif self._tracking_mode == "process": - - cpu_load = self._process.cpu_percent(interval=0.5) / self._cpu_count + + cpu_load = 0 + + for pid in self._tracking_pids: + if not psutil.pid_exists(pid): + # Log a warning and continue + logger.warning(f"Process with pid {pid} does not exist anymore.") + continue + self._process = psutil.Process(pid) + cpu_load += self._process.cpu_percent(interval=0.5) + + try: + children = self._process.children(recursive=True) + for child in children: + try: + # Use interval=0.0 for children to avoid blocking + child_cpu = child.cpu_percent(interval=0.0) + logger.info(f"Child {child.pid} CPU: {child_cpu}") + cpu_load += child_cpu + except ( + psutil.NoSuchProcess, + psutil.AccessDenied, + psutil.ZombieProcess, + ): + # Child process may have terminated or we don't have access + continue + except (psutil.NoSuchProcess, psutil.AccessDenied): + # Main process terminated or access denied + pass + + # Normalize by CPU count + logger.info(f"Total CPU load (all processes): {cpu_load}") + cpu_load = cpu_load / self._cpu_count power = self._tdp * cpu_load / 100 logger.debug( - f"CPU load {self._tdp} W and {cpu_load * 100:.1f}% => estimation of {power} W for process {self._pid}." - ) + f"CPU load {self._tdp} W and {cpu_load * 100:.1f}% => estimation of {power} W for process {self._pid} (including children)." + ) + else: raise Exception(f"Unknown tracking_mode {self._tracking_mode}") return Power.from_watts(power) @@ -337,6 +380,7 @@ def from_utils( model: Optional[str] = None, tdp: Optional[int] = None, tracking_mode: str = "machine", + tracking_pids: int = None, rapl_include_dram: bool = False, rapl_prefer_psys: bool = False, ) -> "CPU": @@ -364,6 +408,7 @@ def from_utils( model=model, tdp=tdp, tracking_mode=tracking_mode, + tracking_pids=tracking_pids, rapl_include_dram=rapl_include_dram, rapl_prefer_psys=rapl_prefer_psys, ) diff --git a/codecarbon/external/ram.py b/codecarbon/external/ram.py index 20a5c2fad..9b1bf17c3 100644 --- a/codecarbon/external/ram.py +++ b/codecarbon/external/ram.py @@ -33,10 +33,10 @@ class RAM(BaseHardware): is_arm_cpu = False def __init__( - self, - pid: int = psutil.Process().pid, + self, children: bool = True, tracking_mode: str = "machine", + tracking_pids: int = None, force_ram_power: Optional[int] = None, ): """ @@ -44,20 +44,30 @@ def __init__( current process's. The `pid` is used to find children processes if `children` is True. - Args: - pid (int, optional): Process id (with respect to which we'll look for - children). Defaults to psutil.Process().pid. + Args: children (int, optional): Look for children of the process when computing total RAM used. Defaults to True. tracking_mode (str, optional): Whether to track "machine" or "process" RAM. Defaults to "machine". + tracking_pids (int, optional): Process id to track RAM usage for "process" + tracking_mode. Defaults to None. force_ram_power (int, optional): User-provided RAM power in watts. If provided, this value is used instead of estimating RAM power. Defaults to None. """ - self._pid = pid self._children = children self._tracking_mode = tracking_mode + + if tracking_mode == "process" and tracking_pids is None: + + self._tracking_pids = [psutil.Process().pid] + else: + # Test if individual process or list of ids + if self._tracking_pids is not list: + self._tracking_pids = list(self._tracking_pids) + else: + self._tracking_pids = tracking_pids + self._force_ram_power = force_ram_power # Check if using ARM architecture self.is_arm_cpu = self._detect_arm_cpu() @@ -192,16 +202,22 @@ def _calculate_ram_power(self, memory_gb: float) -> float: # Apply minimum power constraint return max(min_power, total_power) - def _get_children_memories(self): + def _get_children_memories(self, pid: int): """ Compute the used RAM by the process's children Returns: list(int): The list of RAM values - """ - current_process = psutil.Process(self._pid) + """ + memories = dict() + current_process = psutil.Process(pid) + children = current_process.children(recursive=True) - return [child.memory_info().rss for child in children] + for child in children: + memories[child.pid] = child.memory_info().rss + + return memories + def _read_slurm_scontrol(self): try: @@ -292,10 +308,26 @@ def process_memory_GB(self): Returns: float: RAM usage (GB) """ - children_memories = self._get_children_memories() if self._children else [] - main_memory = psutil.Process(self._pid).memory_info().rss - memories = children_memories + [main_memory] - return sum([m for m in memories if m] + [0]) / B_TO_GB + + # Store memory usage in dict to avoid double counting + total_memory = dict() + + for pid in self._tracking_pids: + if not psutil.pid_exists(pid): + logger.warning(f"Process with pid {pid} does not exist anymore.") + continue + + # Own memory + total_memory[pid] = psutil.Process(pid).memory_info().rss + + # Children's memory + children_memories = self._get_children_memories(pid) + for child_pid, mem in children_memories.items(): + total_memory[child_pid] = mem + + # Reduce to total memory + total_memory = sum(total_memory.values()) + return total_memory / B_TO_GB @property def machine_memory_GB(self): diff --git a/codecarbon/external/task.py b/codecarbon/external/task.py index e3d7ecbae..d40d5083e 100644 --- a/codecarbon/external/task.py +++ b/codecarbon/external/task.py @@ -51,5 +51,6 @@ def out(self): latitude=self.emissions_data.latitude, ram_total_size=self.emissions_data.ram_total_size, tracking_mode=self.emissions_data.tracking_mode, + tracking_pids=self.emissions_data.tracking_pids, on_cloud=self.emissions_data.on_cloud, ) diff --git a/codecarbon/output_methods/emissions_data.py b/codecarbon/output_methods/emissions_data.py index 8813a2679..063d1244f 100644 --- a/codecarbon/output_methods/emissions_data.py +++ b/codecarbon/output_methods/emissions_data.py @@ -40,6 +40,7 @@ class EmissionsData: latitude: float ram_total_size: float tracking_mode: str + tracking_pids: int on_cloud: str = "N" pue: float = 1 wue: float = 0 @@ -101,6 +102,7 @@ class TaskEmissionsData: latitude: float ram_total_size: float tracking_mode: str + tracking_pids: int on_cloud: str = "N" @property diff --git a/docs/_sources/output.rst.txt b/docs/_sources/output.rst.txt index 2a6a4a369..08a260d13 100644 --- a/docs/_sources/output.rst.txt +++ b/docs/_sources/output.rst.txt @@ -79,6 +79,8 @@ input parameter (defaults to the current directory), for each experiment tracked - total RAM available (Go) * - Tracking_mode: - ``machine`` or ``process``(default to ``machine``) + * - tracking_pids + - Process id to track CPU and RAM usage for ``process`` tracking_mode (default to ``None``) .. note:: diff --git a/docs/_sources/parameters.rst.txt b/docs/_sources/parameters.rst.txt index cf7884c88..36b01ea61 100644 --- a/docs/_sources/parameters.rst.txt +++ b/docs/_sources/parameters.rst.txt @@ -24,6 +24,8 @@ Input Parameters * - tracking_mode - | ``machine`` measure the power consumptions of the entire machine (default) | ``process`` try and isolate the tracked processes in isolation + * - tracking_pids + - | Process id to track CPU and RAM usage for ``process`` tracking_mode (default to ``None``) * - gpu_ids - | Comma-separated list of GPU ids to track, defaults to ``None`` | These can either be integer indexes of GPUs on the system, or prefixes From 9bcb3d60612b9a0c1a67debeccc5c55fce980ac0 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 25 Nov 2025 11:28:07 +0100 Subject: [PATCH 02/15] Some fixes and added examples --- codecarbon/emissions_tracker.py | 2 +- codecarbon/external/ram.py | 16 +- .../output_methods/metrics/prometheus.py | 7 +- examples/slurm_logging.py | 217 ++++++++++++++++++ examples/slurm_prolog.sh | 32 +++ 5 files changed, 264 insertions(+), 10 deletions(-) create mode 100755 examples/slurm_logging.py create mode 100755 examples/slurm_prolog.sh diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index 40d0d1392..03240b31f 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -451,7 +451,7 @@ def _init_output_methods(self, *, api_key: str = None): self.run_id = uuid.uuid4() if self._save_to_prometheus: - self._output_handlers.append(PrometheusOutput(self._prometheus_url)) + self._output_handlers.append(PrometheusOutput(self._prometheus_url, jobname=self._project_name + "_" + self._experiment_name)) if self._save_to_logfire: self._output_handlers.append(LogfireOutput()) diff --git a/codecarbon/external/ram.py b/codecarbon/external/ram.py index 9b1bf17c3..4ccf8e9f9 100644 --- a/codecarbon/external/ram.py +++ b/codecarbon/external/ram.py @@ -63,8 +63,8 @@ def __init__( self._tracking_pids = [psutil.Process().pid] else: # Test if individual process or list of ids - if self._tracking_pids is not list: - self._tracking_pids = list(self._tracking_pids) + if tracking_pids is not list: + self._tracking_pids = [tracking_pids] else: self._tracking_pids = tracking_pids @@ -209,14 +209,14 @@ def _get_children_memories(self, pid: int): Returns: list(int): The list of RAM values """ - memories = dict() + memorie_consumption = dict() current_process = psutil.Process(pid) children = current_process.children(recursive=True) for child in children: - memories[child.pid] = child.memory_info().rss + memorie_consumption[child.pid] = child.memory_info().rss - return memories + return memorie_consumption def _read_slurm_scontrol(self): @@ -301,7 +301,7 @@ def slurm_memory_GB(self): return mem @property - def process_memory_GB(self): + def process_memory_GB(self) -> float: """ Property to compute the process's total memory usage in bytes. @@ -326,7 +326,9 @@ def process_memory_GB(self): total_memory[child_pid] = mem # Reduce to total memory - total_memory = sum(total_memory.values()) + total_memory = sum(total_memory.values()) + logger.debug(f"Process total memory usage: {total_memory / B_TO_GB:.2f} GB") + return total_memory / B_TO_GB @property diff --git a/codecarbon/output_methods/metrics/prometheus.py b/codecarbon/output_methods/metrics/prometheus.py index 24318c6b8..a65b770cc 100644 --- a/codecarbon/output_methods/metrics/prometheus.py +++ b/codecarbon/output_methods/metrics/prometheus.py @@ -35,6 +35,8 @@ # TODO: Set up the possible labels labelnames = [ "project_name", + "experiment_id", + "experiment_name", "country_name", "country_iso_code", "region", @@ -77,8 +79,9 @@ class PrometheusOutput(BaseOutput): Send emissions data to prometheus pushgateway """ - def __init__(self, prometheus_url: str): + def __init__(self, prometheus_url: str, jobname: str = "codecarbon"): self.prometheus_url = prometheus_url + self.jobname = jobname def out(self, total: EmissionsData, delta: EmissionsData): try: @@ -124,7 +127,7 @@ def add_emission(self, carbon_emission: dict): # Send the new metric values push_to_gateway( self.prometheus_url, - job="codecarbon", + job=self.jobname, registry=registry, handler=self._auth_handler, ) diff --git a/examples/slurm_logging.py b/examples/slurm_logging.py new file mode 100755 index 000000000..dcc260dc7 --- /dev/null +++ b/examples/slurm_logging.py @@ -0,0 +1,217 @@ +#!/root/.venv/codecarbon/bin/python3 + +import os + +import logging +from codecarbon import OfflineEmissionsTracker + +import time + +import psutil +import sys +import traceback + +import subprocess as sp + +import argparse + + + +def _print_process_tree(proc, indent=0): + prefix = " " * indent + try: + name = proc.name() + pid = proc.pid + except psutil.NoSuchProcess: + return + + log_message(f"{prefix}{name} (pid {pid})\n") + + # Children + for child in proc.children(recursive=False): + _print_process_tree(child, indent + 4) + +def print_process_tree(pid=os.getpid()): + logger = logging.getLogger() + current = psutil.Process(pid) + log_message("\n=== Parent Tree ===\n") + p = current + stack = [] + while p is not None: + stack.append(p) + p = p.parent() + + # Print ancestors from root → current + for proc in reversed(stack): + log_message(f"{proc.name()} (pid {proc.pid})\n") + log_message("\n=== Children Tree ===\n") + _print_process_tree(current) + + +def query_slurm_pids(jobid): + + try: + sp_output = sp.check_output(["/usr/local/bin/scontrol", "listpids", str(jobid)], stderr=sp.STDOUT) + log_message(f"scontrol output:\n{sp_output.decode()}") + except sp.CalledProcessError as e: + log_message(f"scontrol failed for job {jobid}\n") + log_message(f"Return code: {e.returncode}\n") + log_message(f"Output:\n{e.output.decode(errors='replace')}\n") + return [] + except Exception as e: + # Catch-all for other failures + log_message(f"Unexpected error calling scontrol: {e}\n") + return [] + + pids = [] + lines = sp_output.decode().strip().splitlines() + for line in lines[1:]: # Skip the first line + parts = line.split() + if not parts: + continue + + pid = parts[0] + # skip invalid PIDs + if pid in ("-1", "-"): + continue + + try: + pids.append(int(pid)) + except ValueError: + # In case pid is something unexpected + continue + + return pids + + +def log_message(message): + print(message) + if logfile is not None: + logfile.write(message + "\n") + logfile.flush() + + +def build_argument_parser(): + parser = argparse.ArgumentParser( + description="CodeCarbon job wrapper" + ) + group_ids = parser.add_mutually_exclusive_group(required=True) + group_ids.add_argument( + "--jobid", + type=int, + required=False, + default=None, + help="SLURM Job ID" + ) + group_ids.add_argument( + "--pids", + type=int, + nargs="+", + required=False, + default=[], + help="Process ID" + ) + + parser.add_argument( + "--user", + type=str, + required=True, + help="SLURM Job User" + ) + parser.add_argument( + "--gpuids", + type=str, + required=False, + help="Comma-separated GPU IDs assigned to the job" + ) + return parser + +################################################################### + +# Loglevel debug +logging.basicConfig(level=logging.DEBUG) + +logfile = None +try: + parser = build_argument_parser() + args = parser.parse_args() + + jobid = args.jobid + pids = args.pids + + user = args.user + if args.gpuids: + gpuids = args.gpuids.split(",") + else: + gpuids = [] + + os.environ["SLURM_JOB_ID"] = str(jobid) + os.environ["SLURM_JOB_USER"] = str(user) + os.environ["SLURM_JOB_GPUS"] = ",".join(gpuids) + + + logfile = open(f"/tmp/cc_{jobid}.log", "w", buffering=1) + log_message("Python started") + log_message(f"Interpreter: {sys.executable}") + + log_message("CodeCarbon SLURM Prolog Script Started") + log_message(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}") + + + log_message("Available environment variables:") + for key, value in os.environ.items(): + log_message(f"{key}: {value}") + + log_message("Wait 60 seconds to allow job processes to start") + for i in range(60): + log_message(f" Waiting... {1 * i} seconds elapsed") + time.sleep(1) # Give some time for the job to start properly + + log_message("Wait completed at " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())) + + + if jobid is None: + log_message(f"Using provided PIDs: {pids}") + else: + log_message("Parse scontrol for process IDs with \"scontrol listpids\"") + pids = query_slurm_pids(jobid) + + log_message(f"Found PIDs: {pids}") + + for pid in pids: + log_message(f"Process tree for PID {pid}:") + print_process_tree(pid) + + + log_message(f"Job ID: {jobid}, User: {user}, GPU IDs: {gpuids}") + + tracker = OfflineEmissionsTracker(country_iso_code="DEU", region="DE-NW", measure_power_secs=10, api_call_interval=2, + gpu_ids=f"{gpuids}", tracking_mode="process", tracking_pids=args.jobid, + save_to_prometheus=True, prometheus_url="129.217.31.239:9091", + project_name=f"{user}", experiment_name=f"{jobid}", + output_dir="/tmp/codecarbon_log/", output_file="/tmp/codecarbon_log/emission.csv") + + tracker.start() + + # Check for termination signal every second + try: + while True: + time.sleep(1) + except KeyboardInterrupt: + log_message("Received termination signal. Stopping CodeCarbon tracker...") + except Exception as e: + log_message(f"Exception in tracking loop: {e}") + raise e + finally: + tracker.stop() + log_message("CodeCarbon tracker stopped.") + +except Exception: + log_message("Exception occurred:") + log_message(traceback.format_exc()) + +finally: + if logfile is not None: + log_message("CodeCarbon SLURM Prolog Script Ended") + logfile.close() + diff --git a/examples/slurm_prolog.sh b/examples/slurm_prolog.sh new file mode 100755 index 000000000..eaa24859c --- /dev/null +++ b/examples/slurm_prolog.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +JOBID=$SLURM_JOB_ID +LOGFILE="/tmp/prolog_${JOBID}.log" +PIDFILE="/tmp/prolog_${JOBID}.pid" + +echo "Starting CodeCarbon for job $JOBID" >> "$LOGFILE" + +mkdir -p /tmp/codecarbon_log/ + +# Check if GPU IDs are available +if [ -z "$SLURM_JOB_GPUS" ]; then + # We cannot inherit the cgroup because slum kills the entire cgroup on end of initialization + systemd-run --unit codecarbon_$JOBID \ + /etc/slurm/pyscripts/_codecarbon.py \ + --jobid $JOBID \ + --user $SLURM_JOB_USER \ + &>> "$LOGFILE" +else + systemd-run --unit codecarbon_$JOBID \ + /etc/slurm/pyscripts/_codecarbon.py \ + --jobid $JOBID \ + --user $SLURM_JOB_USER \ + --gpuids $SLURM_JOB_GPUS \ + &>> "$LOGFILE" +fi + +# Save PID for epilog +sleep 1 +exit 0 + + From 73b574cfad729b7ca2b27b4e3162a541ed89cfff Mon Sep 17 00:00:00 2001 From: root Date: Tue, 25 Nov 2025 14:20:53 +0100 Subject: [PATCH 03/15] Removed tracking_pids from API --- .../versions/edcd10edf11d_add_metadata_in_run_table.py | 4 +--- codecarbon/core/api_client.py | 1 - codecarbon/core/schemas.py | 1 - codecarbon/output_methods/emissions_data.py | 2 -- 4 files changed, 1 insertion(+), 7 deletions(-) diff --git a/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py b/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py index f39720c58..75cf02760 100644 --- a/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py +++ b/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py @@ -29,7 +29,6 @@ def upgrade(): op.add_column("runs", sa.Column("provider", sa.String)) op.add_column("runs", sa.Column("ram_total_size", sa.Float)) op.add_column("runs", sa.Column("tracking_mode", sa.String)) - op.add_column("runs", sa.Column("tracking_pids", sa.Integer)) def downgrade(): @@ -44,5 +43,4 @@ def downgrade(): op.drop_column("runs", "region") op.drop_column("runs", "provider") op.drop_column("runs", "ram_total_size") - op.drop_column("runs", "tracking_mode") - op.drop_column("runs", "tracking_pids") \ No newline at end of file + op.drop_column("runs", "tracking_mode") \ No newline at end of file diff --git a/codecarbon/core/api_client.py b/codecarbon/core/api_client.py index 94b78a9ae..34067c71c 100644 --- a/codecarbon/core/api_client.py +++ b/codecarbon/core/api_client.py @@ -270,7 +270,6 @@ def _create_run(self, experiment_id: str): provider=self.conf.get("provider"), ram_total_size=self.conf.get("ram_total_size"), tracking_mode=self.conf.get("tracking_mode"), - tracking_pids=self.conf.get("tracking_pids"), ) payload = dataclasses.asdict(run) url = self.url + "/runs" diff --git a/codecarbon/core/schemas.py b/codecarbon/core/schemas.py index 64e8e9cca..225deb6a5 100644 --- a/codecarbon/core/schemas.py +++ b/codecarbon/core/schemas.py @@ -49,7 +49,6 @@ class RunBase: provider: Optional[str] ram_total_size: Optional[float] tracking_mode: Optional[str] - tracking_pids: Optional[int] class RunCreate(RunBase): diff --git a/codecarbon/output_methods/emissions_data.py b/codecarbon/output_methods/emissions_data.py index 063d1244f..8813a2679 100644 --- a/codecarbon/output_methods/emissions_data.py +++ b/codecarbon/output_methods/emissions_data.py @@ -40,7 +40,6 @@ class EmissionsData: latitude: float ram_total_size: float tracking_mode: str - tracking_pids: int on_cloud: str = "N" pue: float = 1 wue: float = 0 @@ -102,7 +101,6 @@ class TaskEmissionsData: latitude: float ram_total_size: float tracking_mode: str - tracking_pids: int on_cloud: str = "N" @property From e49eb4e75d3da81389d3d68f34ab11ba81c47d35 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 25 Nov 2025 14:51:46 +0100 Subject: [PATCH 04/15] Missed SQL_models --- carbonserver/carbonserver/api/infra/database/sql_models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/carbonserver/carbonserver/api/infra/database/sql_models.py b/carbonserver/carbonserver/api/infra/database/sql_models.py index 7205a47a9..337771697 100644 --- a/carbonserver/carbonserver/api/infra/database/sql_models.py +++ b/carbonserver/carbonserver/api/infra/database/sql_models.py @@ -54,7 +54,6 @@ class Run(Base): provider = Column(String, nullable=True) ram_total_size = Column(Float, nullable=True) tracking_mode = Column(String, nullable=True) - tracking_pids = Column(Integer, nullable=True) experiment = relationship("Experiment", back_populates="runs") emissions = relationship( "Emission", back_populates="run", cascade="all, delete-orphan" @@ -78,7 +77,6 @@ def __repr__(self): f'provider="{self.provider}")>,' f'ram_total_size="{self.ram_total_size}")>,' f'tracking_mode="{self.tracking_mode}")>,' - f'tracking_pids="{self.tracking_pids}")>,' ) From dba5ab05d147ca66826416029d2ec659c8b904a5 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 25 Nov 2025 17:02:51 +0100 Subject: [PATCH 05/15] Fixed issues from autorunner --- .../carbonserver/api/infra/repositories/repository_runs.py | 6 ++---- codecarbon/emissions_tracker.py | 3 +-- codecarbon/external/task.py | 3 +-- examples/slurm_logging.py | 1 - 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/carbonserver/carbonserver/api/infra/repositories/repository_runs.py b/carbonserver/carbonserver/api/infra/repositories/repository_runs.py index 2d7f7a6b6..db176010e 100644 --- a/carbonserver/carbonserver/api/infra/repositories/repository_runs.py +++ b/carbonserver/carbonserver/api/infra/repositories/repository_runs.py @@ -47,8 +47,7 @@ def add_run(self, run: RunCreate) -> Run: region=run.region, provider=run.provider, ram_total_size=run.ram_total_size, - tracking_mode=run.tracking_mode, - tracking_pids=run.tracking_pids, + tracking_mode=run.tracking_mode, ) session.add(db_run) session.commit() @@ -114,8 +113,7 @@ def map_sql_to_schema(run: SqlModelRun) -> Run: region=run.region, provider=run.provider, ram_total_size=run.ram_total_size, - tracking_mode=run.tracking_mode, - tracking_pids=run.tracking_pids, + tracking_mode=run.tracking_mode, ) def get_experiment_detailed_sums_by_run( diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index 03240b31f..b9f5023d7 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -810,8 +810,7 @@ def _prepare_emissions_data(self) -> EmissionsData: longitude=self._conf.get("longitude"), latitude=self._conf.get("latitude"), ram_total_size=self._conf.get("ram_total_size"), - tracking_mode=self._conf.get("tracking_mode"), - tracking_pids=self._conf.get("tracking_pids"), + tracking_mode=self._conf.get("tracking_mode"), pue=self._pue, wue=self._wue, ) diff --git a/codecarbon/external/task.py b/codecarbon/external/task.py index d40d5083e..4972cc0fe 100644 --- a/codecarbon/external/task.py +++ b/codecarbon/external/task.py @@ -50,7 +50,6 @@ def out(self): longitude=self.emissions_data.longitude, latitude=self.emissions_data.latitude, ram_total_size=self.emissions_data.ram_total_size, - tracking_mode=self.emissions_data.tracking_mode, - tracking_pids=self.emissions_data.tracking_pids, + tracking_mode=self.emissions_data.tracking_mode, on_cloud=self.emissions_data.on_cloud, ) diff --git a/examples/slurm_logging.py b/examples/slurm_logging.py index dcc260dc7..63865d2ed 100755 --- a/examples/slurm_logging.py +++ b/examples/slurm_logging.py @@ -32,7 +32,6 @@ def _print_process_tree(proc, indent=0): _print_process_tree(child, indent + 4) def print_process_tree(pid=os.getpid()): - logger = logging.getLogger() current = psutil.Process(pid) log_message("\n=== Parent Tree ===\n") p = current From 9b5ce4329cae56aee3e2cf2854b2122165bfdd3b Mon Sep 17 00:00:00 2001 From: root Date: Fri, 28 Nov 2025 11:12:57 +0100 Subject: [PATCH 06/15] Fixed formating, removed changes to .txt files and added them to .rst, started unittest --- codecarbon/emissions_tracker.py | 9 +- codecarbon/external/hardware.py | 22 +++-- codecarbon/external/ram.py | 45 +++++----- docs/_sources/output.rst.txt | 2 - docs/_sources/parameters.rst.txt | 2 - docs/edit/parameters.rst | 3 + examples/slurm_logging.py | 137 +++++++++++++++---------------- 7 files changed, 106 insertions(+), 114 deletions(-) diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index b9f5023d7..bf52e1df4 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -451,7 +451,12 @@ def _init_output_methods(self, *, api_key: str = None): self.run_id = uuid.uuid4() if self._save_to_prometheus: - self._output_handlers.append(PrometheusOutput(self._prometheus_url, jobname=self._project_name + "_" + self._experiment_name)) + self._output_handlers.append( + PrometheusOutput( + self._prometheus_url, + jobname=self._project_name + "_" + self._experiment_name, + ) + ) if self._save_to_logfire: self._output_handlers.append(LogfireOutput()) @@ -810,7 +815,7 @@ def _prepare_emissions_data(self) -> EmissionsData: longitude=self._conf.get("longitude"), latitude=self._conf.get("latitude"), ram_total_size=self._conf.get("ram_total_size"), - tracking_mode=self._conf.get("tracking_mode"), + tracking_mode=self._conf.get("tracking_mode"), pue=self._pue, wue=self._wue, ) diff --git a/codecarbon/external/hardware.py b/codecarbon/external/hardware.py index 8625289e0..ffdd50a7a 100644 --- a/codecarbon/external/hardware.py +++ b/codecarbon/external/hardware.py @@ -182,7 +182,7 @@ def __init__( self._tracking_mode = tracking_mode self._tracking_pids = tracking_pids self._cpu_count = count_cpus() - + if tracking_pids is not None: # Make list if it is not already a list if not isinstance(tracking_pids, list): @@ -191,8 +191,6 @@ def __init__( self._tracking_pids = tracking_pids else: self._tracking_pids = [psutil.Process().pid] - - if self._mode == "intel_power_gadget": self._intel_interface = IntelPowerGadget(self._output_dir) @@ -242,7 +240,7 @@ def _calculate_power_from_cpu_load_treadripper(tdp, cpu_load): def _get_power_from_cpu_load(self): """ When in MODE_CPU_LOAD - """ + """ if self._tracking_mode == "machine": tdp = self._tdp cpu_load = psutil.cpu_percent( @@ -256,9 +254,9 @@ def _get_power_from_cpu_load(self): f"CPU load {self._tdp} W and {cpu_load:.1f}% {load_factor=} => estimation of {power} W for whole machine." ) elif self._tracking_mode == "process": - + cpu_load = 0 - + for pid in self._tracking_pids: if not psutil.pid_exists(pid): # Log a warning and continue @@ -266,7 +264,7 @@ def _get_power_from_cpu_load(self): continue self._process = psutil.Process(pid) cpu_load += self._process.cpu_percent(interval=0.5) - + try: children = self._process.children(recursive=True) for child in children: @@ -281,19 +279,19 @@ def _get_power_from_cpu_load(self): psutil.ZombieProcess, ): # Child process may have terminated or we don't have access - continue + continue except (psutil.NoSuchProcess, psutil.AccessDenied): # Main process terminated or access denied - pass - + pass + # Normalize by CPU count logger.info(f"Total CPU load (all processes): {cpu_load}") cpu_load = cpu_load / self._cpu_count power = self._tdp * cpu_load / 100 logger.debug( f"CPU load {self._tdp} W and {cpu_load * 100:.1f}% => estimation of {power} W for process {self._pid} (including children)." - ) - + ) + else: raise Exception(f"Unknown tracking_mode {self._tracking_mode}") return Power.from_watts(power) diff --git a/codecarbon/external/ram.py b/codecarbon/external/ram.py index 4ccf8e9f9..df2d73284 100644 --- a/codecarbon/external/ram.py +++ b/codecarbon/external/ram.py @@ -33,7 +33,7 @@ class RAM(BaseHardware): is_arm_cpu = False def __init__( - self, + self, children: bool = True, tracking_mode: str = "machine", tracking_pids: int = None, @@ -44,7 +44,7 @@ def __init__( current process's. The `pid` is used to find children processes if `children` is True. - Args: + Args: children (int, optional): Look for children of the process when computing total RAM used. Defaults to True. tracking_mode (str, optional): Whether to track "machine" or "process" RAM. @@ -57,17 +57,17 @@ def __init__( """ self._children = children self._tracking_mode = tracking_mode - + if tracking_mode == "process" and tracking_pids is None: - + self._tracking_pids = [psutil.Process().pid] else: - # Test if individual process or list of ids - if tracking_pids is not list: - self._tracking_pids = [tracking_pids] + # Test if individual process or list of ids + if tracking_pids is not list: + self._tracking_pids = [tracking_pids] else: self._tracking_pids = tracking_pids - + self._force_ram_power = force_ram_power # Check if using ARM architecture self.is_arm_cpu = self._detect_arm_cpu() @@ -208,16 +208,15 @@ def _get_children_memories(self, pid: int): Returns: list(int): The list of RAM values - """ - memorie_consumption = dict() - current_process = psutil.Process(pid) - + """ + memorie_consumption = dict() + current_process = psutil.Process(pid) + children = current_process.children(recursive=True) for child in children: - memorie_consumption[child.pid] = child.memory_info().rss - - return memorie_consumption - + memorie_consumption[child.pid] = child.memory_info().rss + + return memorie_consumption def _read_slurm_scontrol(self): try: @@ -308,27 +307,27 @@ def process_memory_GB(self) -> float: Returns: float: RAM usage (GB) """ - + # Store memory usage in dict to avoid double counting total_memory = dict() - + for pid in self._tracking_pids: if not psutil.pid_exists(pid): logger.warning(f"Process with pid {pid} does not exist anymore.") continue - + # Own memory total_memory[pid] = psutil.Process(pid).memory_info().rss - + # Children's memory children_memories = self._get_children_memories(pid) for child_pid, mem in children_memories.items(): total_memory[child_pid] = mem - + # Reduce to total memory - total_memory = sum(total_memory.values()) + total_memory = sum(total_memory.values()) logger.debug(f"Process total memory usage: {total_memory / B_TO_GB:.2f} GB") - + return total_memory / B_TO_GB @property diff --git a/docs/_sources/output.rst.txt b/docs/_sources/output.rst.txt index 08a260d13..2a6a4a369 100644 --- a/docs/_sources/output.rst.txt +++ b/docs/_sources/output.rst.txt @@ -79,8 +79,6 @@ input parameter (defaults to the current directory), for each experiment tracked - total RAM available (Go) * - Tracking_mode: - ``machine`` or ``process``(default to ``machine``) - * - tracking_pids - - Process id to track CPU and RAM usage for ``process`` tracking_mode (default to ``None``) .. note:: diff --git a/docs/_sources/parameters.rst.txt b/docs/_sources/parameters.rst.txt index 36b01ea61..cf7884c88 100644 --- a/docs/_sources/parameters.rst.txt +++ b/docs/_sources/parameters.rst.txt @@ -24,8 +24,6 @@ Input Parameters * - tracking_mode - | ``machine`` measure the power consumptions of the entire machine (default) | ``process`` try and isolate the tracked processes in isolation - * - tracking_pids - - | Process id to track CPU and RAM usage for ``process`` tracking_mode (default to ``None``) * - gpu_ids - | Comma-separated list of GPU ids to track, defaults to ``None`` | These can either be integer indexes of GPUs on the system, or prefixes diff --git a/docs/edit/parameters.rst b/docs/edit/parameters.rst index cf7884c88..26b17f896 100644 --- a/docs/edit/parameters.rst +++ b/docs/edit/parameters.rst @@ -24,6 +24,9 @@ Input Parameters * - tracking_mode - | ``machine`` measure the power consumptions of the entire machine (default) | ``process`` try and isolate the tracked processes in isolation + * - tracking_pids + - | List of PIDs to track when using ``process`` tracking mode, + | defaults to ``None``, which tracks the current process * - gpu_ids - | Comma-separated list of GPU ids to track, defaults to ``None`` | These can either be integer indexes of GPUs on the system, or prefixes diff --git a/examples/slurm_logging.py b/examples/slurm_logging.py index 63865d2ed..6ebdcb693 100755 --- a/examples/slurm_logging.py +++ b/examples/slurm_logging.py @@ -1,20 +1,16 @@ #!/root/.venv/codecarbon/bin/python3 -import os - +import argparse import logging -from codecarbon import OfflineEmissionsTracker - -import time - -import psutil +import os +import subprocess as sp import sys +import time import traceback -import subprocess as sp - -import argparse +import psutil +from codecarbon import OfflineEmissionsTracker def _print_process_tree(proc, indent=0): @@ -31,6 +27,7 @@ def _print_process_tree(proc, indent=0): for child in proc.children(recursive=False): _print_process_tree(child, indent + 4) + def print_process_tree(pid=os.getpid()): current = psutil.Process(pid) log_message("\n=== Parent Tree ===\n") @@ -46,11 +43,13 @@ def print_process_tree(pid=os.getpid()): log_message("\n=== Children Tree ===\n") _print_process_tree(current) - + def query_slurm_pids(jobid): - + try: - sp_output = sp.check_output(["/usr/local/bin/scontrol", "listpids", str(jobid)], stderr=sp.STDOUT) + sp_output = sp.check_output( + ["/usr/local/bin/scontrol", "listpids", str(jobid)], stderr=sp.STDOUT + ) log_message(f"scontrol output:\n{sp_output.decode()}") except sp.CalledProcessError as e: log_message(f"scontrol failed for job {jobid}\n") @@ -61,25 +60,25 @@ def query_slurm_pids(jobid): # Catch-all for other failures log_message(f"Unexpected error calling scontrol: {e}\n") return [] - + pids = [] lines = sp_output.decode().strip().splitlines() for line in lines[1:]: # Skip the first line parts = line.split() if not parts: continue - + pid = parts[0] # skip invalid PIDs if pid in ("-1", "-"): continue - + try: pids.append(int(pid)) except ValueError: # In case pid is something unexpected continue - + return pids @@ -88,43 +87,28 @@ def log_message(message): if logfile is not None: logfile.write(message + "\n") logfile.flush() - - + + def build_argument_parser(): - parser = argparse.ArgumentParser( - description="CodeCarbon job wrapper" - ) + parser = argparse.ArgumentParser(description="CodeCarbon job wrapper") group_ids = parser.add_mutually_exclusive_group(required=True) group_ids.add_argument( - "--jobid", - type=int, - required=False, - default=None, - help="SLURM Job ID" + "--jobid", type=int, required=False, default=None, help="SLURM Job ID" ) group_ids.add_argument( - "--pids", - type=int, - nargs="+", - required=False, - default=[], - help="Process ID" - ) - - parser.add_argument( - "--user", - type=str, - required=True, - help="SLURM Job User" + "--pids", type=int, nargs="+", required=False, default=[], help="Process ID" ) + + parser.add_argument("--user", type=str, required=True, help="SLURM Job User") parser.add_argument( "--gpuids", type=str, required=False, - help="Comma-separated GPU IDs assigned to the job" + help="Comma-separated GPU IDs assigned to the job", ) return parser + ################################################################### # Loglevel debug @@ -134,73 +118,81 @@ def build_argument_parser(): try: parser = build_argument_parser() args = parser.parse_args() - + jobid = args.jobid pids = args.pids - + user = args.user if args.gpuids: gpuids = args.gpuids.split(",") else: gpuids = [] - + os.environ["SLURM_JOB_ID"] = str(jobid) os.environ["SLURM_JOB_USER"] = str(user) os.environ["SLURM_JOB_GPUS"] = ",".join(gpuids) - - + logfile = open(f"/tmp/cc_{jobid}.log", "w", buffering=1) log_message("Python started") log_message(f"Interpreter: {sys.executable}") - + log_message("CodeCarbon SLURM Prolog Script Started") log_message(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}") - - + log_message("Available environment variables:") for key, value in os.environ.items(): log_message(f"{key}: {value}") - - log_message("Wait 60 seconds to allow job processes to start") + + log_message("Wait 60 seconds to allow job processes to start") for i in range(60): - log_message(f" Waiting... {1 * i} seconds elapsed") + log_message(f" Waiting... {1 * i} seconds elapsed") time.sleep(1) # Give some time for the job to start properly - - log_message("Wait completed at " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())) - - + + log_message( + "Wait completed at " + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ) + if jobid is None: log_message(f"Using provided PIDs: {pids}") else: - log_message("Parse scontrol for process IDs with \"scontrol listpids\"") + log_message('Parse scontrol for process IDs with "scontrol listpids"') pids = query_slurm_pids(jobid) - + log_message(f"Found PIDs: {pids}") - + for pid in pids: log_message(f"Process tree for PID {pid}:") print_process_tree(pid) - - + log_message(f"Job ID: {jobid}, User: {user}, GPU IDs: {gpuids}") - - tracker = OfflineEmissionsTracker(country_iso_code="DEU", region="DE-NW", measure_power_secs=10, api_call_interval=2, - gpu_ids=f"{gpuids}", tracking_mode="process", tracking_pids=args.jobid, - save_to_prometheus=True, prometheus_url="129.217.31.239:9091", - project_name=f"{user}", experiment_name=f"{jobid}", - output_dir="/tmp/codecarbon_log/", output_file="/tmp/codecarbon_log/emission.csv") - + + tracker = OfflineEmissionsTracker( + country_iso_code="DEU", + region="DE-NW", + measure_power_secs=10, + api_call_interval=2, + gpu_ids=f"{gpuids}", + tracking_mode="process", + tracking_pids=args.jobid, + save_to_prometheus=True, + prometheus_url="129.217.31.239:9091", + project_name=f"{user}", + experiment_name=f"{jobid}", + output_dir="/tmp/codecarbon_log/", + output_file="/tmp/codecarbon_log/emission.csv", + ) + tracker.start() - + # Check for termination signal every second try: while True: time.sleep(1) except KeyboardInterrupt: - log_message("Received termination signal. Stopping CodeCarbon tracker...") + log_message("Received termination signal. Stopping CodeCarbon tracker...") except Exception as e: log_message(f"Exception in tracking loop: {e}") - raise e + raise e finally: tracker.stop() log_message("CodeCarbon tracker stopped.") @@ -208,9 +200,8 @@ def build_argument_parser(): except Exception: log_message("Exception occurred:") log_message(traceback.format_exc()) - + finally: if logfile is not None: log_message("CodeCarbon SLURM Prolog Script Ended") logfile.close() - From 2b5146a5e3667e95728c32809de9e091c34f06fc Mon Sep 17 00:00:00 2001 From: root Date: Fri, 28 Nov 2025 11:23:49 +0100 Subject: [PATCH 07/15] Renamed missing _pid to _tracking_pids --- codecarbon/external/hardware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codecarbon/external/hardware.py b/codecarbon/external/hardware.py index ffdd50a7a..38dd9b0c4 100644 --- a/codecarbon/external/hardware.py +++ b/codecarbon/external/hardware.py @@ -289,7 +289,7 @@ def _get_power_from_cpu_load(self): cpu_load = cpu_load / self._cpu_count power = self._tdp * cpu_load / 100 logger.debug( - f"CPU load {self._tdp} W and {cpu_load * 100:.1f}% => estimation of {power} W for process {self._pid} (including children)." + f"CPU load {self._tdp} W and {cpu_load * 100:.1f}% => estimation of {power} W for processes {self._tracking_pids} (including children)." ) else: From 909df877ce6c5719866cd5256d2c3008edd7cb65 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 28 Nov 2025 15:10:31 +0100 Subject: [PATCH 08/15] Added Unittest --- docs/_sources/output.rst.txt | 8 ++-- docs/_sources/parameters.rst.txt | 3 ++ tests/test_pid_tracking.py | 79 ++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 tests/test_pid_tracking.py diff --git a/docs/_sources/output.rst.txt b/docs/_sources/output.rst.txt index 2a6a4a369..478a32aca 100644 --- a/docs/_sources/output.rst.txt +++ b/docs/_sources/output.rst.txt @@ -30,11 +30,11 @@ input parameter (defaults to the current directory), for each experiment tracked * - emissions_rate - emissions divided per duration, in Kg/s * - cpu_power - - CPU power (W) + - Mean CPU power (W) * - gpu_power - - GPU power (W) + - Mean GPU power (W) * - ram_power - - RAM power (W) + - Mean RAM power (W) * - cpu_energy - Energy used per CPU (kWh) * - gpu_energy @@ -130,7 +130,7 @@ Logfire Using CodeCarbon with logfire ````````````````````````````````` -`Logfire `_ is an observability platform. +`Logfire `_ is an observability platform. CodeCarbon exposes all its metrics with the suffix `codecarbon_`. diff --git a/docs/_sources/parameters.rst.txt b/docs/_sources/parameters.rst.txt index cf7884c88..26b17f896 100644 --- a/docs/_sources/parameters.rst.txt +++ b/docs/_sources/parameters.rst.txt @@ -24,6 +24,9 @@ Input Parameters * - tracking_mode - | ``machine`` measure the power consumptions of the entire machine (default) | ``process`` try and isolate the tracked processes in isolation + * - tracking_pids + - | List of PIDs to track when using ``process`` tracking mode, + | defaults to ``None``, which tracks the current process * - gpu_ids - | Comma-separated list of GPU ids to track, defaults to ``None`` | These can either be integer indexes of GPUs on the system, or prefixes diff --git a/tests/test_pid_tracking.py b/tests/test_pid_tracking.py new file mode 100644 index 000000000..5cc6574d2 --- /dev/null +++ b/tests/test_pid_tracking.py @@ -0,0 +1,79 @@ +import os +import subprocess as sp +import tempfile +import time +import unittest + +from codecarbon.emissions_tracker import OfflineEmissionsTracker + +python_load_code = """ +import math +i = 0 +erg = 0 +while True: + i += 1 + a = math.sqrt(64*64*64*64*64) + erg += a +print(erg) +""" + + +class TestPIDTracking(unittest.TestCase): + def setUp(self) -> None: + self.project_name = "project_TestPIDTracking" + self.emissions_file = "emissions-test-TestPIDTracking" + self.emissions_path = tempfile.gettempdir() + self.emissions_file_path = os.path.join( + self.emissions_path, self.emissions_file + ) + if os.path.isfile(self.emissions_file_path): + os.remove(self.emissions_file_path) + + self.pids = [] + self.process = [] + for _ in range(128): + self.process.append(sp.Popen(["python", "-c", python_load_code])) + self.pids.append(self.process[-1].pid) + self.pids.append(os.getpid()) + + def tearDown(self) -> None: + if os.path.isfile(self.emissions_file_path): + os.remove(self.emissions_file_path) + + for proc in self.process: + proc.terminate() + proc.wait() + + def test_carbon_pid_tracking_offline(self): + + # Subprocess PIDs are childs, therefore booth should be equal + tracker_pid = OfflineEmissionsTracker( + output_dir=self.emissions_path, + output_file=self.emissions_file + "_pid.csv", + tracking_mode="process", + tracking_pids=self.pids, + ) + tracker_self = OfflineEmissionsTracker( + output_dir=self.emissions_path, + output_file=self.emissions_file + "_global.csv", + tracking_mode="process", + tracking_pids=os.getpid(), + ) + + tracker_pid.start() + tracker_self.start() + + time.sleep(5) + + emissions_pid = tracker_pid.stop() + emissions_self = tracker_self.stop() + + print(f"Emissions (self): {emissions_self} kgCO2eq") + print(f"Emissions (pid): {emissions_pid} kgCO2eq") + + if not isinstance(emissions_pid, float): + print(emissions_pid) + assert isinstance(emissions_pid, float) + + self.assertNotEqual(emissions_pid, 0.0) + self.assertAlmostEqual(emissions_self, emissions_pid, 2) From 842cbd493337f9575940bd47b15a27c578222323 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 1 Dec 2025 06:24:42 +0100 Subject: [PATCH 09/15] Force reformat on the remaining 3 files --- .../carbonserver/api/infra/repositories/repository_runs.py | 4 ++-- .../versions/edcd10edf11d_add_metadata_in_run_table.py | 2 +- codecarbon/external/task.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/carbonserver/carbonserver/api/infra/repositories/repository_runs.py b/carbonserver/carbonserver/api/infra/repositories/repository_runs.py index db176010e..84d2202b6 100644 --- a/carbonserver/carbonserver/api/infra/repositories/repository_runs.py +++ b/carbonserver/carbonserver/api/infra/repositories/repository_runs.py @@ -47,7 +47,7 @@ def add_run(self, run: RunCreate) -> Run: region=run.region, provider=run.provider, ram_total_size=run.ram_total_size, - tracking_mode=run.tracking_mode, + tracking_mode=run.tracking_mode, ) session.add(db_run) session.commit() @@ -113,7 +113,7 @@ def map_sql_to_schema(run: SqlModelRun) -> Run: region=run.region, provider=run.provider, ram_total_size=run.ram_total_size, - tracking_mode=run.tracking_mode, + tracking_mode=run.tracking_mode, ) def get_experiment_detailed_sums_by_run( diff --git a/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py b/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py index 75cf02760..754a78430 100644 --- a/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py +++ b/carbonserver/carbonserver/database/alembic/versions/edcd10edf11d_add_metadata_in_run_table.py @@ -43,4 +43,4 @@ def downgrade(): op.drop_column("runs", "region") op.drop_column("runs", "provider") op.drop_column("runs", "ram_total_size") - op.drop_column("runs", "tracking_mode") \ No newline at end of file + op.drop_column("runs", "tracking_mode") diff --git a/codecarbon/external/task.py b/codecarbon/external/task.py index 4972cc0fe..e3d7ecbae 100644 --- a/codecarbon/external/task.py +++ b/codecarbon/external/task.py @@ -50,6 +50,6 @@ def out(self): longitude=self.emissions_data.longitude, latitude=self.emissions_data.latitude, ram_total_size=self.emissions_data.ram_total_size, - tracking_mode=self.emissions_data.tracking_mode, + tracking_mode=self.emissions_data.tracking_mode, on_cloud=self.emissions_data.on_cloud, ) From 574f983584a2e24051b89bdda6cc51bffe79a9db Mon Sep 17 00:00:00 2001 From: root Date: Fri, 5 Dec 2025 15:10:40 +0100 Subject: [PATCH 10/15] Added TotalConsumption to prometheus --- .../output_methods/metrics/metric_docs.py | 5 ++++ .../output_methods/metrics/prometheus.py | 27 ++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/codecarbon/output_methods/metrics/metric_docs.py b/codecarbon/output_methods/metrics/metric_docs.py index 864641ee8..963ba3df5 100644 --- a/codecarbon/output_methods/metrics/metric_docs.py +++ b/codecarbon/output_methods/metrics/metric_docs.py @@ -64,3 +64,8 @@ class MetricDocumentation: "codecarbon_energy_consumed", description="Sum of cpu_energy, gpu_energy and ram_energy (kW)", ) + +energy_consumed_total_doc = MetricDocumentation( + "codecarbon_energy_consumed_total", + description="CumSum of cpu_energy, gpu_energy and ram_energy (kW)", +) diff --git a/codecarbon/output_methods/metrics/prometheus.py b/codecarbon/output_methods/metrics/prometheus.py index a65b770cc..ff9cbe891 100644 --- a/codecarbon/output_methods/metrics/prometheus.py +++ b/codecarbon/output_methods/metrics/prometheus.py @@ -1,7 +1,12 @@ import dataclasses import os -from prometheus_client import CollectorRegistry, Gauge, push_to_gateway +from prometheus_client import ( + CollectorRegistry, + Gauge, + delete_from_gateway, + push_to_gateway, +) from prometheus_client.exposition import basic_auth_handler from codecarbon.external.logger import logger @@ -15,6 +20,7 @@ emissions_doc, emissions_rate_doc, energy_consumed_doc, + energy_consumed_total_doc, gpu_energy_doc, gpu_power_doc, ram_energy_doc, @@ -72,6 +78,7 @@ def generate_gauge(metric_doc: MetricDocumentation): gpu_energy_gauge = generate_gauge(gpu_energy_doc) ram_energy_gauge = generate_gauge(ram_energy_doc) energy_consumed_gauge = generate_gauge(energy_consumed_doc) +energy_consumed_total_gauge = generate_gauge(energy_consumed_total_doc) class PrometheusOutput(BaseOutput): @@ -83,9 +90,18 @@ def __init__(self, prometheus_url: str, jobname: str = "codecarbon"): self.prometheus_url = prometheus_url self.jobname = jobname + def __del__(self): + # Cleanup metrics from pushgateway on shutdown, prometheus should already have scraped them + try: + delete_from_gateway( + self.prometheus_url, job=self.jobname, registry=registry + ) + except Exception as e: + logger.error(e, exc_info=True) + def out(self, total: EmissionsData, delta: EmissionsData): try: - self.add_emission(dataclasses.asdict(delta)) + self.add_emission(dataclasses.asdict(delta), dataclasses.asdict(total)) except Exception as e: logger.error(e, exc_info=True) @@ -99,7 +115,7 @@ def _auth_handler(self, url, method, timeout, headers, data): url, method, timeout, headers, data, username, password ) - def add_emission(self, carbon_emission: dict): + def add_emission(self, carbon_emission: dict, total_emission: dict): """ Send emissions data to push gateway """ @@ -131,3 +147,8 @@ def add_emission(self, carbon_emission: dict): registry=registry, handler=self._auth_handler, ) + + # Now set the total energy consumed + energy_consumed_total_gauge.labels(**labels).set( + total_emission["energy_consumed"] + ) From 8dfea34593fc0f774cdb0234f0cbc14f73a0083c Mon Sep 17 00:00:00 2001 From: root Date: Fri, 5 Dec 2025 16:56:17 +0100 Subject: [PATCH 11/15] Reverted add_emission to avoid changing all interfaces --- codecarbon/output_methods/metrics/prometheus.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/codecarbon/output_methods/metrics/prometheus.py b/codecarbon/output_methods/metrics/prometheus.py index ff9cbe891..0bdcdcd21 100644 --- a/codecarbon/output_methods/metrics/prometheus.py +++ b/codecarbon/output_methods/metrics/prometheus.py @@ -101,7 +101,9 @@ def __del__(self): def out(self, total: EmissionsData, delta: EmissionsData): try: - self.add_emission(dataclasses.asdict(delta), dataclasses.asdict(total)) + delta_with_total = dataclasses.asdict(delta) + delta_with_total["energy_consumed_total"] = total.energy_consumed + self.add_emission(delta_with_total) except Exception as e: logger.error(e, exc_info=True) @@ -115,7 +117,7 @@ def _auth_handler(self, url, method, timeout, headers, data): url, method, timeout, headers, data, username, password ) - def add_emission(self, carbon_emission: dict, total_emission: dict): + def add_emission(self, carbon_emission: dict): """ Send emissions data to push gateway """ @@ -137,6 +139,7 @@ def add_emission(self, carbon_emission: dict, total_emission: dict): (gpu_energy_gauge, "gpu_energy"), (ram_energy_gauge, "ram_energy"), (energy_consumed_gauge, "energy_consumed"), + (energy_consumed_total_gauge, "energy_consumed_total"), ]: gauge.labels(**labels).set(carbon_emission[emission_name]) @@ -147,8 +150,3 @@ def add_emission(self, carbon_emission: dict, total_emission: dict): registry=registry, handler=self._auth_handler, ) - - # Now set the total energy consumed - energy_consumed_total_gauge.labels(**labels).set( - total_emission["energy_consumed"] - ) From 632db67b01e0a4dc32744cfcc907f366220a9106 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 5 Dec 2025 21:04:02 +0100 Subject: [PATCH 12/15] fixed bug in RAM, added process id printout, fixed prometheus cleanup --- codecarbon/emissions_tracker.py | 6 ++++++ codecarbon/external/ram.py | 13 +++++-------- codecarbon/output_methods/metrics/prometheus.py | 4 +--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index bf52e1df4..58bbd9ac8 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -387,6 +387,12 @@ def __init__( else: logger.info(f" GPU model: {self._conf.get('gpu_model')}") + if self._tracking_mode == "process": + logger.info(" Tracking mode: process") + logger.info(" Tracked PIDs: " + str(self._tracking_pids)) + else: + logger.info(" Tracking mode: machine") + # Run `self._measure_power_and_energy` every `measure_power_secs` seconds in a # background thread self._scheduler = PeriodicScheduler( diff --git a/codecarbon/external/ram.py b/codecarbon/external/ram.py index df2d73284..f59dc6918 100644 --- a/codecarbon/external/ram.py +++ b/codecarbon/external/ram.py @@ -1,8 +1,9 @@ import math import re import subprocess +import traceback from dataclasses import dataclass -from typing import Optional +from typing import List, Optional import psutil @@ -36,7 +37,7 @@ def __init__( self, children: bool = True, tracking_mode: str = "machine", - tracking_pids: int = None, + tracking_pids: Optional[List[int]] = None, force_ram_power: Optional[int] = None, ): """ @@ -59,14 +60,9 @@ def __init__( self._tracking_mode = tracking_mode if tracking_mode == "process" and tracking_pids is None: - self._tracking_pids = [psutil.Process().pid] else: - # Test if individual process or list of ids - if tracking_pids is not list: - self._tracking_pids = [tracking_pids] - else: - self._tracking_pids = tracking_pids + self._tracking_pids = tracking_pids self._force_ram_power = force_ram_power # Check if using ARM architecture @@ -371,6 +367,7 @@ def total_power(self) -> Power: ) except Exception as e: logger.warning(f"Could not measure RAM Power ({str(e)})") + logger.warning(traceback.format_exc()) ram_power = Power.from_watts(0) return ram_power diff --git a/codecarbon/output_methods/metrics/prometheus.py b/codecarbon/output_methods/metrics/prometheus.py index 0bdcdcd21..689b2b37d 100644 --- a/codecarbon/output_methods/metrics/prometheus.py +++ b/codecarbon/output_methods/metrics/prometheus.py @@ -93,9 +93,7 @@ def __init__(self, prometheus_url: str, jobname: str = "codecarbon"): def __del__(self): # Cleanup metrics from pushgateway on shutdown, prometheus should already have scraped them try: - delete_from_gateway( - self.prometheus_url, job=self.jobname, registry=registry - ) + delete_from_gateway(self.prometheus_url, job=self.jobname) except Exception as e: logger.error(e, exc_info=True) From 6c1dd4e10d74c638d261148895d6e9fbe09f5920 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 9 Dec 2025 12:40:48 +0100 Subject: [PATCH 13/15] Renamed metrics, fixed type, improved cleanup, Changed to Counter --- codecarbon/emissions_tracker.py | 4 +++ codecarbon/output_methods/base_output.py | 3 +++ .../output_methods/metrics/metric_docs.py | 12 ++++----- .../output_methods/metrics/prometheus.py | 25 ++++++++++++++----- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index 58bbd9ac8..7fb7c3c25 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -701,6 +701,10 @@ def stop(self) -> Optional[float]: self.final_emissions_data = emissions_data self.final_emissions = emissions_data.emissions + + for handler in self._output_handlers: + handler.exit() + return emissions_data.emissions def _persist_data( diff --git a/codecarbon/output_methods/base_output.py b/codecarbon/output_methods/base_output.py index 4b152c29b..ff6ea1778 100644 --- a/codecarbon/output_methods/base_output.py +++ b/codecarbon/output_methods/base_output.py @@ -22,3 +22,6 @@ def live_out(self, total: EmissionsData, delta: EmissionsData): def task_out(self, data: List[TaskEmissionsData], experiment_name: str): pass + + def exit(self): + pass diff --git a/codecarbon/output_methods/metrics/metric_docs.py b/codecarbon/output_methods/metrics/metric_docs.py index 963ba3df5..d62d48f71 100644 --- a/codecarbon/output_methods/metrics/metric_docs.py +++ b/codecarbon/output_methods/metrics/metric_docs.py @@ -50,22 +50,22 @@ class MetricDocumentation: ) cpu_energy_doc = MetricDocumentation( "codecarbon_cpu_energy", - description="Energy used per CPU (kWh)", + description="Energy used per CPU since last reading (kWh)", ) gpu_energy_doc = MetricDocumentation( "codecarbon_gpu_energy", - description="Energy used per GPU (kWh)", + description="Energy used per GPU since last reading (kWh)", ) ram_energy_doc = MetricDocumentation( "codecarbon_ram_energy", - description="Energy used per RAM (kWh)", + description="Energy used per RAM since last reading (kWh)", ) energy_consumed_doc = MetricDocumentation( "codecarbon_energy_consumed", - description="Sum of cpu_energy, gpu_energy and ram_energy (kW)", + description="Sum of cpu_energy, gpu_energy and ram_energy (kWh)", ) energy_consumed_total_doc = MetricDocumentation( - "codecarbon_energy_consumed_total", - description="CumSum of cpu_energy, gpu_energy and ram_energy (kW)", + "codecarbon_energy_total", + description="Accumulated cpu_energy, gpu_energy and ram_energy (kWh)", ) diff --git a/codecarbon/output_methods/metrics/prometheus.py b/codecarbon/output_methods/metrics/prometheus.py index 689b2b37d..b09d5f0bc 100644 --- a/codecarbon/output_methods/metrics/prometheus.py +++ b/codecarbon/output_methods/metrics/prometheus.py @@ -3,6 +3,7 @@ from prometheus_client import ( CollectorRegistry, + Counter, Gauge, delete_from_gateway, push_to_gateway, @@ -68,6 +69,15 @@ def generate_gauge(metric_doc: MetricDocumentation): ) +def generate_counter(metric_doc: MetricDocumentation): + return Counter( + metric_doc.name, + metric_doc.description, + labelnames, + registry=registry, + ) + + duration_gauge = generate_gauge(duration_doc) emissions_gauge = generate_gauge(emissions_doc) emissions_rate_gauge = generate_gauge(emissions_rate_doc) @@ -78,7 +88,7 @@ def generate_gauge(metric_doc: MetricDocumentation): gpu_energy_gauge = generate_gauge(gpu_energy_doc) ram_energy_gauge = generate_gauge(ram_energy_doc) energy_consumed_gauge = generate_gauge(energy_consumed_doc) -energy_consumed_total_gauge = generate_gauge(energy_consumed_total_doc) +energy_consumed_total = generate_counter(energy_consumed_total_doc) class PrometheusOutput(BaseOutput): @@ -90,18 +100,18 @@ def __init__(self, prometheus_url: str, jobname: str = "codecarbon"): self.prometheus_url = prometheus_url self.jobname = jobname - def __del__(self): + def exit(self): # Cleanup metrics from pushgateway on shutdown, prometheus should already have scraped them + # Otherwise they will persist with their last values try: + logger.info("Deleting metrics from Prometheus Pushgateway") delete_from_gateway(self.prometheus_url, job=self.jobname) except Exception as e: logger.error(e, exc_info=True) def out(self, total: EmissionsData, delta: EmissionsData): try: - delta_with_total = dataclasses.asdict(delta) - delta_with_total["energy_consumed_total"] = total.energy_consumed - self.add_emission(delta_with_total) + self.add_emission(dataclasses.asdict(delta)) except Exception as e: logger.error(e, exc_info=True) @@ -137,10 +147,13 @@ def add_emission(self, carbon_emission: dict): (gpu_energy_gauge, "gpu_energy"), (ram_energy_gauge, "ram_energy"), (energy_consumed_gauge, "energy_consumed"), - (energy_consumed_total_gauge, "energy_consumed_total"), ]: gauge.labels(**labels).set(carbon_emission[emission_name]) + # Update the total energy consumed counter + # + energy_consumed_total.labels(**labels).inc(carbon_emission["energy_consumed"]) + # Send the new metric values push_to_gateway( self.prometheus_url, From 6cac7845e5a191f36fad113e21bca3fee791198c Mon Sep 17 00:00:00 2001 From: root Date: Sat, 13 Dec 2025 13:32:00 +0100 Subject: [PATCH 14/15] Fixed spelling --- tests/test_pid_tracking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pid_tracking.py b/tests/test_pid_tracking.py index 5cc6574d2..f8c3bdf3d 100644 --- a/tests/test_pid_tracking.py +++ b/tests/test_pid_tracking.py @@ -46,7 +46,7 @@ def tearDown(self) -> None: def test_carbon_pid_tracking_offline(self): - # Subprocess PIDs are childs, therefore booth should be equal + # Subprocess PIDs are children, therefore both should be equal tracker_pid = OfflineEmissionsTracker( output_dir=self.emissions_path, output_file=self.emissions_file + "_pid.csv", From 72c1e2dbfc30b11198a0ebd206a372c31b117e2e Mon Sep 17 00:00:00 2001 From: root Date: Sat, 13 Dec 2025 13:43:43 +0100 Subject: [PATCH 15/15] Added more static types Corrected usage of int compared to [int] --- codecarbon/emissions_tracker.py | 4 ++-- codecarbon/external/ram.py | 2 +- tests/test_pid_tracking.py | 10 ++++++++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index 7fb7c3c25..79e3dc0f4 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -176,7 +176,7 @@ def __init__( str ] = _sentinel, # Deprecated, use electricitymaps_api_token tracking_mode: Optional[str] = _sentinel, - tracking_pids: Optional[int] = _sentinel, + tracking_pids: Optional[List[int]] = _sentinel, log_level: Optional[Union[int, str]] = _sentinel, on_csv_write: Optional[str] = _sentinel, logger_preamble: Optional[str] = _sentinel, @@ -1182,7 +1182,7 @@ def track_emissions( str ] = _sentinel, # Deprecated, use electricitymaps_api_token tracking_mode: Optional[str] = _sentinel, - tracking_pids: Optional[int] = _sentinel, + tracking_pids: Optional[List[int]] = _sentinel, log_level: Optional[Union[int, str]] = _sentinel, on_csv_write: Optional[str] = _sentinel, logger_preamble: Optional[str] = _sentinel, diff --git a/codecarbon/external/ram.py b/codecarbon/external/ram.py index f59dc6918..6a24758ea 100644 --- a/codecarbon/external/ram.py +++ b/codecarbon/external/ram.py @@ -50,7 +50,7 @@ def __init__( total RAM used. Defaults to True. tracking_mode (str, optional): Whether to track "machine" or "process" RAM. Defaults to "machine". - tracking_pids (int, optional): Process id to track RAM usage for "process" + tracking_pids ([int], optional): Process id to track RAM usage for "process" tracking_mode. Defaults to None. force_ram_power (int, optional): User-provided RAM power in watts. If provided, this value is used instead of estimating RAM power. diff --git a/tests/test_pid_tracking.py b/tests/test_pid_tracking.py index f8c3bdf3d..14fa204dd 100644 --- a/tests/test_pid_tracking.py +++ b/tests/test_pid_tracking.py @@ -57,7 +57,7 @@ def test_carbon_pid_tracking_offline(self): output_dir=self.emissions_path, output_file=self.emissions_file + "_global.csv", tracking_mode="process", - tracking_pids=os.getpid(), + tracking_pids=[os.getpid()], ) tracker_pid.start() @@ -76,4 +76,10 @@ def test_carbon_pid_tracking_offline(self): assert isinstance(emissions_pid, float) self.assertNotEqual(emissions_pid, 0.0) - self.assertAlmostEqual(emissions_self, emissions_pid, 2) + + # Compare emissions from both trackers, should be less than 10% difference + diff = abs(emissions_pid - emissions_self) + avg = (emissions_pid + emissions_self) / 2 + percent_diff = (diff / avg) * 100 + print(f"Percent difference: {percent_diff}%") + self.assertLessEqual(percent_diff, 10.0)