From 9c0838e6096154654f413c5226e3bd3c650aba40 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 10 Jan 2026 00:14:42 +0000 Subject: [PATCH 01/23] Improve time and metadata tracking, output file format has changed. --- mlir/utils/performance/tuningRunner.py | 382 +++++++++++++++++++------ 1 file changed, 300 insertions(+), 82 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index d4c077b18c91..6a8f4fca8115 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -13,10 +13,12 @@ import argparse import glob import os +import statistics import subprocess import sys import tempfile import threading +import time from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import nullcontext from dataclasses import dataclass, field @@ -40,9 +42,14 @@ PerfConfiguration, ) +# ============================================================================= +# Constants +# ============================================================================= + MLIR_N_REPEATS = 10 WARMUP_ITERATIONS = 1 SLEEP_US = 100 # 0.1 ms +MAX_FAILURES = 10 # ============================================================================= # Configuration & Results @@ -75,7 +82,8 @@ class TuningResult: """Result of tuning a single configuration.""" test_vector: str success: bool - gpu_id: Optional[int] = None + gpu_id: int + elapsed_seconds: float winning_config: Optional[str] = None max_tflops: Optional[float] = None entries: List[Dict] = field(default_factory=list) @@ -227,85 +235,201 @@ def get(self, test_vector: str) -> Optional[TuningResult]: """Get cached result for a test vector.""" return self._results.get(test_vector) + def get_all_results(self) -> List[TuningResult]: + """Get all cached tuning results.""" + return list(self._results.values()) + def count(self) -> int: """Return number of cached configurations.""" return len(self._results) @classmethod - def from_output_file(cls, - filepath: str, - tuning_space_kind: str, - quiet: bool = False) -> 'TunedConfigsCache': + def from_output_file(cls, options: Options) -> 'TunedConfigsCache': """Load previously tuned configurations from an output TSV file. - The output file has the following structure: - - Commit lines starting with '# commit: ' indicating the git commit hash of the tuning run - - Header lines starting with '# ' containing tuning space kind in parentheses - (e.g., '# arch\tnumCUs\ttestVector\tperfConfig (quick)\tTFlops') - - Multiple commit and header sections can exist in the same file from different tuning runs - - Data lines with tab-separated fields following each header - - Error lines starting with '### ' indicating errors during tuning - - Only data lines under headers matching options.tuning_space_kind are loaded. - For example, if options.tuning_space_kind='quick', only data under headers containing '(quick)' - will be loaded, ignoring '(full)' or other sections. + Supports both old and new file formats: + - Old format: header starts with '# '; tuning space embedded in column name (e.g., perfConfig (quick)) + - New format: proper tsv header (no #); metadata in ## comments before header + + Only data lines that match the current tuning space, arch, and numCUs are loaded. 
""" cache = cls() - if filepath == '-' or not os.path.exists(filepath): + if options.output == '-' or not os.path.exists(options.output): return cache current_commit = get_git_commit_hash() - file_commit = current_commit - matching_tuning_space = False + + # Pending metadata + file_commit: Optional[str] = None + file_tuning_space: Optional[str] = None + file_arch: Optional[str] = None + file_num_cu: Optional[int] = None + + # Active section state + matching_section = False + column_indices: Dict[str, int] = {} try: - with open(filepath, mode='r') as f: + with open(options.output, mode='r') as f: for line in f: line = line.strip() if not line: continue - # Track commit hash for warning about stale results - if line.startswith('# commit: '): - file_commit = line[len('# commit: '):].strip() + # Check for metadata line + if line.startswith('## '): + key_value = line[3:] + if ':' in key_value: + key, value = key_value.split(':', 1) + key = key.strip() + value = value.strip() + if key == 'commit': + file_commit = value + elif key == 'tuningSpace': + file_tuning_space = value + elif key == 'arch': + file_arch = value + elif key == 'numCUs': + try: + file_num_cu = int(value) + except ValueError: + pass continue - # Check if this section header matches our tuning space - if line.startswith('# '): - matching_tuning_space = f"({tuning_space_kind})" in line - if matching_tuning_space and file_commit != current_commit and not quiet: - print( - f"Warning: Loading tuned configs from different commit " - f"(file: {file_commit[:12]}, current: {current_commit[:12]})", - file=sys.stderr) + # Check for header line + if cls._is_header_line(line): + # Determine if this section matches based on metadata or old format + if file_tuning_space is not None: + # New format: use metadata + matching_section = (file_tuning_space == options.tuning_space_kind and + (file_arch is None or file_arch == options.arch) and + (file_num_cu is None or + file_num_cu == options.num_cu)) + elif f'({options.tuning_space_kind})' in line: + # Old format: tuning space embedded in header + matching_section = True + else: + matching_section = False + + if matching_section: + column_indices = cls._parse_header_line(line) + if file_commit and file_commit != current_commit and not options.quiet: + print( + f"Warning: Loading tuned configs from different commit " + f"(file: {file_commit[:8]}, current: {current_commit[:8]})", + file=sys.stderr) + + # Reset pending metadata for next section + file_commit = None + file_tuning_space = None + file_arch = None + file_num_cu = None continue - # Skip error lines and lines from non-matching sections - if line.startswith('### ') or not matching_tuning_space: + # Skip other comment lines + if line.startswith('#'): continue - # Parse data line - fields = line.split('\t') - if len(fields) < 4: + # Skip data lines from non-matching sections + if not matching_section or not column_indices: continue - test_vector = fields[2] - perf_config = fields[3] if fields[3] else None - tflops_value = float(fields[4]) if len(fields) > 4 and fields[4] else None + # Parse data line + result = cls._parse_data_line(line.split('\t'), column_indices, options.arch, + options.num_cu) + if result: + cache._results[result.test_vector] = result - if perf_config and perf_config != "None": - cache._results[test_vector] = TuningResult(test_vector=test_vector, - success=True, - winning_config=perf_config, - max_tflops=tflops_value) except Exception as e: - if not quiet: - print(f"Warning: Failed to load existing tuning results from 
{filepath}: {e}", + if not options.quiet: + print(f"Warning: Failed to load existing tuning results from {options.output}: {e}", file=sys.stderr) return cache + @staticmethod + def _is_header_line(line: str) -> bool: + """Check if line is a column header (old or new format).""" + # Old format: '# arch\t...' + if line.startswith('# '): + return line[2:].startswith('arch\t') + # New format: 'testVector\t...' + return line.startswith('testVector\t') + + @staticmethod + def _parse_header_line(line: str) -> Dict[str, int]: + """Parse column header and return name -> index mapping.""" + header_text = line[2:] if line.startswith('# ') else line + indices = {} + for i, col in enumerate(header_text.split('\t')): + if col: + indices[col.split()[0]] = i + return indices + + @staticmethod + def _parse_data_line(fields: List[str], column_indices: Dict[str, int], arch: str, + num_cu: int) -> Optional[TuningResult]: + """Parse a data line and return TuningResult if valid. + + A line is valid if: + - arch and numCUs match current system (if columns exist, for old format) + - testVector is present + - perfConfig is present and not 'None' + - TFlops is a valid finite number (if column exists) + """ + + def get_field(name: str) -> Optional[str]: + idx = column_indices.get(name) + if idx is not None and idx < len(fields) and fields[idx]: + return fields[idx] + return None + + # Old format: arch and numCUs are columns + if 'arch' in column_indices: + if get_field('arch') != arch: + return None + + if 'numCUs' in column_indices: + if get_field('numCUs') != str(num_cu): + return None + + test_vector = get_field('testVector') + if not test_vector: + return None + + perf_config = get_field('perfConfig') + if not perf_config or perf_config == 'None': + return None + + max_tflops = None + if 'TFlops' in column_indices: + tflops_str = get_field('TFlops') + if not tflops_str: + return None + try: + tflops_val = float(tflops_str) + if np.isnan(tflops_val) or np.isinf(tflops_val): + return None + max_tflops = tflops_val + except ValueError: + return None + + elapsed_seconds = 0.0 + elapsed_str = get_field('elapsedSeconds') + if elapsed_str: + try: + elapsed_seconds = float(elapsed_str) + except ValueError: + pass + + return TuningResult(test_vector=test_vector, + success=True, + gpu_id=-1, + elapsed_seconds=elapsed_seconds, + winning_config=perf_config, + max_tflops=max_tflops) + @dataclass class TuningContext: @@ -436,6 +560,66 @@ def _set_memory_policy(self, numa_node: int) -> None: pass # libnuma not available, rely on first-touch policy +@dataclass +class ETATracker: + """Track completion times for accurate ETA estimation using median of successful configs.""" + total_configs: int + num_workers: int + initial_times: List[float] = field(default_factory=list) + initial_ok_count: int = 0 + _success_times: List[float] = field(default_factory=list, init=False) + _processed: int = field(default=0, init=False) + _ok_count: int = field(default=0, init=False) + _fail_count: int = field(default=0, init=False) + + def __post_init__(self): + self._success_times = list(self.initial_times) + self._ok_count = self.initial_ok_count + + def record(self, result: TuningResult) -> None: + self._processed += 1 + if result.success: + self._ok_count += 1 + self._success_times.append(result.elapsed_seconds) + else: + self._fail_count += 1 + + def _format_rate(self, seconds: float) -> str: + if seconds < 60: + return f"{seconds:.1f}s/cfg" + elif seconds < 3600: + return f"{seconds / 60:.1f}m/cfg" + else: + return f"{seconds / 
3600:.1f}h/cfg" + + def _format_eta(self, seconds: float) -> str: + if seconds < 60: + return "<1m" + elif seconds < 3600: + return f"{int(seconds // 60)}m" + elif seconds < 86400: + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + return f"{hours}h{minutes}m" + else: + days = int(seconds // 86400) + hours = int((seconds % 86400) // 3600) + return f"{days}d{hours}h" + + def get_postfix_str(self) -> str: + remaining = self.total_configs - self._processed + + rate = "n/a" + eta = "n/a" + if len(self._success_times) >= 3: + median = statistics.median(self._success_times) + eta_seconds = (remaining / self.num_workers) * median + rate = self._format_rate(median) + eta = self._format_eta(eta_seconds) + + return f"ok={self._ok_count}, fail={self._fail_count}, rate={rate}, eta={eta}" + + # ============================================================================= # Output Writers # ============================================================================= @@ -449,11 +633,14 @@ def __init__(self, filepath: str, options: Options): self.options = options self.file = None self.header_written = False + self._is_appending = False def __enter__(self): if self.filepath == '-': self.file = sys.stdout else: + self._is_appending = os.path.exists(self.filepath) and os.path.getsize( + self.filepath) > 0 self.file = open(self.filepath, 'a') return self @@ -465,15 +652,23 @@ def _write_header(self): if self.header_written: return - commit_hash = get_git_commit_hash() - print(f"# commit: {commit_hash}", file=self.file) - columns = [ - 'arch', 'numCUs', 'numChiplets', 'testVector', - f'perfConfig ({self.options.tuning_space_kind})' - ] + # Add a blank line if appending + if self._is_appending: + print("", file=self.file) + + # Metadata comments + print(f"## commit: {get_git_commit_hash()}", file=self.file) + print(f"## tuningSpace: {self.options.tuning_space_kind}", file=self.file) + print(f'## arch: {self.options.arch}', file=self.file) + print(f'## numCUs: {self.options.num_cu}', file=self.file) + print(f'## numChiplets: {self.options.num_chiplets}', file=self.file) + + # TSV header + columns = ['testVector', 'perfConfig'] if self.options.tflops: columns.append('TFlops') - print("# " + "\t".join(columns), file=self.file) + columns.append('elapsedSeconds') + print("\t".join(columns), file=self.file) self.file.flush() self.header_written = True @@ -481,13 +676,10 @@ def _write_header(self): def write_result(self, result: TuningResult): self._write_header() - fields = [ - self.options.arch, - str(self.options.num_cu), - str(self.options.num_chiplets), result.test_vector, result.winning_config or "" - ] + fields = [result.test_vector, result.winning_config or ""] if self.options.tflops: fields.append(f"{result.max_tflops}" if result.max_tflops else "") + fields.append(f"{result.elapsed_seconds:.1f}") print("\t".join(fields), file=self.file) self.file.flush() @@ -564,6 +756,15 @@ def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, values) +def ensure_tsv_extension(filepath: str) -> str: + """Ensure filepath has .tsv extension, unless it's stdout.""" + if filepath == '-': + return filepath + if not filepath.endswith('.tsv'): + return filepath + '.tsv' + return filepath + + def get_git_commit_hash() -> str: """Get the current git commit hash.""" try: @@ -895,8 +1096,7 @@ def tune_configs(ctx: TuningContext) -> bool: # Load cached results unless retuning is forced cache = TunedConfigsCache() if not ctx.options.retune: - cache = 
TunedConfigsCache.from_output_file(ctx.options.output, - ctx.options.tuning_space_kind, ctx.options.quiet) + cache = TunedConfigsCache.from_output_file(ctx.options) if cache.count() > 0 and not ctx.options.quiet: print(f"Found {cache.count()} tuned config(s) in {ctx.options.output}", file=sys.stderr) @@ -916,30 +1116,36 @@ def tune_configs(ctx: TuningContext) -> bool: ctx.print_gpu_summary() def execute_tuning_task(test_vector: str) -> TuningResult: - try: - gpu_id = pool.acquire_gpu_for_thread() - compile_threads = ctx.get_compile_threads(gpu_id) - result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, - compile_threads) - return TuningResult(test_vector=test_vector, - success=result.get('success', False), - gpu_id=gpu_id, - winning_config=result.get('winning_config'), - max_tflops=result.get('max_tflops'), - entries=result.get('entries', []), - verify_tflops=result.get('verify_tflops'), - error=result.get('error')) - except Exception as e: - return TuningResult(test_vector=test_vector, success=False, error=str(e)) + gpu_id = pool.acquire_gpu_for_thread() + start_time = time.time() + compile_threads = ctx.get_compile_threads(gpu_id) + result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, + compile_threads) + return TuningResult(test_vector=test_vector, + success=result.get('success', False), + gpu_id=gpu_id, + elapsed_seconds=time.time() - start_time, + winning_config=result.get('winning_config'), + max_tflops=result.get('max_tflops'), + entries=result.get('entries', []), + verify_tflops=result.get('verify_tflops'), + error=result.get('error')) executor = None progress_bar = None - has_errors = False with OutputFileWriter(ctx.options.output, ctx.options) as results_writer: with DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext( ) as debug_writer: try: # No context manager for executor because we need to shutdown with wait=False + initial_times = [ + r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0 + ] + eta_tracker = ETATracker(total_configs=len(pending_configs), + num_workers=num_workers, + initial_times=initial_times, + initial_ok_count=cache.count()) + progress_bar = tqdm( total=len(ctx.configs), initial=skipped_count, @@ -947,7 +1153,10 @@ def execute_tuning_task(test_vector: str) -> TuningResult: file=sys.stderr, desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", unit="config", - leave=False) + leave=False, + bar_format= + '{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [t={elapsed}{postfix}]') + progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) executor = ThreadPoolExecutor(max_workers=num_workers) pending_futures = { @@ -955,19 +1164,22 @@ def execute_tuning_task(test_vector: str) -> TuningResult: for test_vector in pending_configs } + has_errors = False + consecutive_failures = 0 + for completed_future in as_completed(pending_futures): result = completed_future.result() if result.success: + consecutive_failures = 0 results_writer.write_result(result) if debug_writer: debug_writer.write_entries(result.entries) - progress_bar.update(1) else: has_errors = True + consecutive_failures += 1 error_text = result.error or "Unknown error" - gpu_prefix = f"[GPU {result.gpu_id}] " if result.gpu_id is not None else "" - formatted_error = f"{gpu_prefix}Error tuning {result.test_vector}\n" + '\n'.join( + formatted_error = f"[GPU {result.gpu_id}] Error tuning {result.test_vector}\n" + '\n'.join( f"\t{line}" for line in 
error_text.splitlines()) print(formatted_error, file=sys.stderr) results_writer.write_error(formatted_error) @@ -975,7 +1187,13 @@ def execute_tuning_task(test_vector: str) -> TuningResult: if ctx.options.abort_on_error: return False - progress_bar.refresh() + if consecutive_failures >= MAX_FAILURES: + print("Aborting due to too many consecutive failures", file=sys.stderr) + return False + + eta_tracker.record(result) + progress_bar.update(1) + progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) if has_errors: print("Encountered errors during tuning", file=sys.stderr) @@ -1272,7 +1490,7 @@ def main(args=None): verify_mode=parsed_args.verify_mode, verify_perfconfigs=parsed_args.verify_perf_configs, tflops=parsed_args.tflops, - output=parsed_args.output, + output=ensure_tsv_extension(parsed_args.output), abort_on_error=parsed_args.abort_on_error, retune=parsed_args.retune, gpu_ids=parsed_args.gpus, From 9f6c671780600a9ff61b15d5a39fb23a29cc8b68 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 10 Jan 2026 01:11:18 +0000 Subject: [PATCH 02/23] Add support for stdin. --- mlir/utils/performance/tuningRunner.py | 225 +++++++++++++------------ 1 file changed, 121 insertions(+), 104 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 6a8f4fca8115..9a95b6ca739d 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -431,6 +431,66 @@ def get_field(name: str) -> Optional[str]: max_tflops=max_tflops) +@dataclass +class ETATracker: + """Track completion times for accurate ETA estimation using median of successful configs.""" + total_configs: int + num_workers: int + initial_times: List[float] = field(default_factory=list) + initial_ok_count: int = 0 + _success_times: List[float] = field(default_factory=list, init=False) + _processed: int = field(default=0, init=False) + _ok_count: int = field(default=0, init=False) + _fail_count: int = field(default=0, init=False) + + def __post_init__(self): + self._success_times = list(self.initial_times) + self._ok_count = self.initial_ok_count + + def record(self, result: TuningResult) -> None: + self._processed += 1 + if result.success: + self._ok_count += 1 + self._success_times.append(result.elapsed_seconds) + else: + self._fail_count += 1 + + def _format_rate(self, seconds: float) -> str: + if seconds < 60: + return f"{seconds:.1f}s/cfg" + elif seconds < 3600: + return f"{seconds / 60:.1f}m/cfg" + else: + return f"{seconds / 3600:.1f}h/cfg" + + def _format_eta(self, seconds: float) -> str: + if seconds < 60: + return "<1m" + elif seconds < 3600: + return f"{int(seconds // 60)}m" + elif seconds < 86400: + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + return f"{hours}h{minutes}m" + else: + days = int(seconds // 86400) + hours = int((seconds % 86400) // 3600) + return f"{days}d{hours}h" + + def get_postfix_str(self) -> str: + remaining = self.total_configs - self._processed + + rate = "n/a" + eta = "n/a" + if len(self._success_times) >= 3: + median = statistics.median(self._success_times) + eta_seconds = (remaining / self.num_workers) * median + rate = self._format_rate(median) + eta = self._format_eta(eta_seconds) + + return f"ok={self._ok_count}, fail={self._fail_count}, rate={rate}, eta={eta}" + + @dataclass class TuningContext: """Encapsulates all state and configuration needed for tuning operations.""" @@ -560,66 +620,6 @@ def _set_memory_policy(self, numa_node: int) -> None: pass # libnuma not available, rely on 
first-touch policy -@dataclass -class ETATracker: - """Track completion times for accurate ETA estimation using median of successful configs.""" - total_configs: int - num_workers: int - initial_times: List[float] = field(default_factory=list) - initial_ok_count: int = 0 - _success_times: List[float] = field(default_factory=list, init=False) - _processed: int = field(default=0, init=False) - _ok_count: int = field(default=0, init=False) - _fail_count: int = field(default=0, init=False) - - def __post_init__(self): - self._success_times = list(self.initial_times) - self._ok_count = self.initial_ok_count - - def record(self, result: TuningResult) -> None: - self._processed += 1 - if result.success: - self._ok_count += 1 - self._success_times.append(result.elapsed_seconds) - else: - self._fail_count += 1 - - def _format_rate(self, seconds: float) -> str: - if seconds < 60: - return f"{seconds:.1f}s/cfg" - elif seconds < 3600: - return f"{seconds / 60:.1f}m/cfg" - else: - return f"{seconds / 3600:.1f}h/cfg" - - def _format_eta(self, seconds: float) -> str: - if seconds < 60: - return "<1m" - elif seconds < 3600: - return f"{int(seconds // 60)}m" - elif seconds < 86400: - hours = int(seconds // 3600) - minutes = int((seconds % 3600) // 60) - return f"{hours}h{minutes}m" - else: - days = int(seconds // 86400) - hours = int((seconds % 86400) // 3600) - return f"{days}d{hours}h" - - def get_postfix_str(self) -> str: - remaining = self.total_configs - self._processed - - rate = "n/a" - eta = "n/a" - if len(self._success_times) >= 3: - median = statistics.median(self._success_times) - eta_seconds = (remaining / self.num_workers) * median - rate = self._format_rate(median) - eta = self._format_eta(eta_seconds) - - return f"ok={self._ok_count}, fail={self._fail_count}, rate={rate}, eta={eta}" - - # ============================================================================= # Output Writers # ============================================================================= @@ -1218,8 +1218,10 @@ def resolve_paths(op_type: Operation, parsed_args) -> Paths: """Resolve paths based on operation type and arguments.""" if op_type == Operation.FUSION: configs_path = "./fusion_config_file" + elif parsed_args.config: + configs_path = None else: - configs_path = None if parsed_args.config else parsed_args.configs_file + configs_path = parsed_args.configs_file return perfRunner.create_paths(configs_path, parsed_args.mlir_build_dir) @@ -1277,6 +1279,15 @@ def get_config_class(op_type: Operation) -> type: return config_classes.get(op_type, PerfConfiguration) +def load_configs_from_stdin() -> str: + """Read configs from stdin and return path to a temporary file.""" + content = sys.stdin.read() + fd, path = tempfile.mkstemp(suffix='.txt', prefix='tuning_configs_') + with os.fdopen(fd, 'w') as f: + f.write(content) + return path + + def load_configs(op_type: Operation, parsed_args, paths: Paths) -> List[str]: """Load configurations based on operation type and arguments.""" if parsed_args.config: @@ -1468,58 +1479,64 @@ def main(args=None): parsed_args = parse_arguments(gpu_topology, available_gpus, args) - op_type = Operation.from_name(parsed_args.op) - paths = resolve_paths(op_type, parsed_args) - - if not paths.mlir_paths: - print("rocMLIR build dir was not provided/found", file=sys.stderr) - return 1 - - arch = perfRunner.get_arch() - chip = perfRunner.get_chip() - num_cu = perfRunner.get_num_cu(chip) - num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) - - options = Options(arch=arch, - num_cu=num_cu, - 
num_chiplets=num_chiplets, - debug=parsed_args.debug, - quiet=parsed_args.quiet, - tuning_space_kind=parsed_args.tuning_space, - rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, - verify_mode=parsed_args.verify_mode, - verify_perfconfigs=parsed_args.verify_perf_configs, - tflops=parsed_args.tflops, - output=ensure_tsv_extension(parsed_args.output), - abort_on_error=parsed_args.abort_on_error, - retune=parsed_args.retune, - gpu_ids=parsed_args.gpus, - num_cpus=parsed_args.num_cpus, - wait_for_compiles=parsed_args.wait_for_compiles) - - if op_type == Operation.FUSION: - op_type = extract_fusion_configs(parsed_args.test_dir, paths, options) - + stdin_temp_file = None try: + # Handle stdin for configs file + if parsed_args.configs_file == '-': + stdin_temp_file = load_configs_from_stdin() + parsed_args.configs_file = stdin_temp_file + + op_type = Operation.from_name(parsed_args.op) + paths = resolve_paths(op_type, parsed_args) + + if not paths.mlir_paths: + print("rocMLIR build dir was not provided/found", file=sys.stderr) + return 1 + + arch = perfRunner.get_arch() + chip = perfRunner.get_chip() + num_cu = perfRunner.get_num_cu(chip) + num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) + + options = Options(arch=arch, + num_cu=num_cu, + num_chiplets=num_chiplets, + debug=parsed_args.debug, + quiet=parsed_args.quiet, + tuning_space_kind=parsed_args.tuning_space, + rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, + verify_mode=parsed_args.verify_mode, + verify_perfconfigs=parsed_args.verify_perf_configs, + tflops=parsed_args.tflops, + output=ensure_tsv_extension(parsed_args.output), + abort_on_error=parsed_args.abort_on_error, + retune=parsed_args.retune, + gpu_ids=parsed_args.gpus, + num_cpus=parsed_args.num_cpus, + wait_for_compiles=parsed_args.wait_for_compiles) + + if op_type == Operation.FUSION: + op_type = extract_fusion_configs(parsed_args.test_dir, paths, options) + conf_class = get_config_class(op_type) configs = load_configs(op_type, parsed_args, paths) - except ValueError as e: - print(str(e), file=sys.stderr) - return 1 - ctx = TuningContext(configs=configs, - conf_class=conf_class, - paths=paths, - options=options, - gpu_topology=gpu_topology, - numa_topology=NumaTopology.discover()) + ctx = TuningContext(configs=configs, + conf_class=conf_class, + paths=paths, + options=options, + gpu_topology=gpu_topology, + numa_topology=NumaTopology.discover()) - try: tuning_succeeded = tune_configs(ctx) return 0 if tuning_succeeded else 1 + except KeyboardInterrupt: print("Tuning interrupted by user", file=sys.stderr) - return 1 + return 130 # 128 + SIGINT + finally: + if stdin_temp_file: + os.unlink(stdin_temp_file) if __name__ == '__main__': From 7eb9851b496a8f010d1202b332b8e0c922f641a4 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 10 Jan 2026 11:52:00 +0000 Subject: [PATCH 03/23] Revert to old output format. 
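
For reference, a tuned-results section written in this restored format
looks roughly like the sketch below (placeholder values, not real tuning
output). The '## commit:' metadata comment and the trailing
elapsedSeconds column from the previous patches are kept; only the
'# '-prefixed header and the column layout revert to the old style, with
the tuning space embedded in the perfConfig column name:

    ## commit: <git commit hash>
    # arch\tnumCUs\tnumChiplets\ttestVector\tperfConfig (quick)\tTFlops\telapsedSeconds
    gfx90a\t104\t1\t<testVector>\t<perfConfig>\t123.4\t57.9

(The TFlops column is only written when TFlops reporting is enabled.)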
--- mlir/utils/performance/perfRunner.py | 17 ++- mlir/utils/performance/tuningRunner.py | 142 ++++++++++--------------- 2 files changed, 63 insertions(+), 96 deletions(-) diff --git a/mlir/utils/performance/perfRunner.py b/mlir/utils/performance/perfRunner.py index 71fbd5051c56..f8ed4b7d78f4 100644 --- a/mlir/utils/performance/perfRunner.py +++ b/mlir/utils/performance/perfRunner.py @@ -293,25 +293,22 @@ def read_tuning_db(path: Optional[str]) -> MaybeTuningDb: with open(path, 'r') as db_file: for line in db_file: line = line.strip() - if line.startswith('#'): + if not line or line.startswith('#'): continue entries = line.split('\t') # note: legacy format has 3 entries if len(entries) == 3: arch, config, perfconfig = entries - ret[arch, config] = perfconfig - # note: new format has 4 entries - elif len(entries) == 4: - arch, _, config, perfconfig = entries - ret[arch, config] = perfconfig - # note: 5-entry form includes tflops at end - elif len(entries) == 5: - arch, _, config, perfconfig, _ = entries - ret[arch, config] = perfconfig + # note: new format has 4+ entries + elif len(entries) >= 4: + arch, _, config, perfconfig = entries[:4] else: print("Warning: Malformed tuning database entry:", line) continue + + ret[arch, config] = perfconfig + return ret except FileNotFoundError: if path: diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 9a95b6ca739d..c7c7b57612b5 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -247,11 +247,8 @@ def count(self) -> int: def from_output_file(cls, options: Options) -> 'TunedConfigsCache': """Load previously tuned configurations from an output TSV file. - Supports both old and new file formats: - - Old format: header starts with '# '; tuning space embedded in column name (e.g., perfConfig (quick)) - - New format: proper tsv header (no #); metadata in ## comments before header - - Only data lines that match the current tuning space, arch, and numCUs are loaded. + Format: # arch\tnumCUs\ttestVector\tperfConfig (tuning_space)\t[TFlops]\t[elapsedSeconds] + Only loads entries matching current arch, numCUs, and tuning space. 
""" cache = cls() @@ -260,13 +257,8 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': current_commit = get_git_commit_hash() - # Pending metadata - file_commit: Optional[str] = None - file_tuning_space: Optional[str] = None - file_arch: Optional[str] = None - file_num_cu: Optional[int] = None - # Active section state + metadata: Dict[str, Optional[Any]] = {} matching_section = False column_indices: Dict[str, int] = {} @@ -279,52 +271,31 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': # Check for metadata line if line.startswith('## '): - key_value = line[3:] - if ':' in key_value: - key, value = key_value.split(':', 1) - key = key.strip() - value = value.strip() - if key == 'commit': - file_commit = value - elif key == 'tuningSpace': - file_tuning_space = value - elif key == 'arch': - file_arch = value - elif key == 'numCUs': - try: - file_num_cu = int(value) - except ValueError: - pass + parts = line[3:].split(':', 1) + if len(parts) == 2: + key = parts[0].strip() + value = parts[1].strip() + metadata[key] = value continue # Check for header line if cls._is_header_line(line): - # Determine if this section matches based on metadata or old format - if file_tuning_space is not None: - # New format: use metadata - matching_section = (file_tuning_space == options.tuning_space_kind and - (file_arch is None or file_arch == options.arch) and - (file_num_cu is None or - file_num_cu == options.num_cu)) - elif f'({options.tuning_space_kind})' in line: - # Old format: tuning space embedded in header - matching_section = True - else: - matching_section = False + # Determine if this section matches based on tuning space + matching_section = f'({options.tuning_space_kind})' in line if matching_section: column_indices = cls._parse_header_line(line) - if file_commit and file_commit != current_commit and not options.quiet: + + # Warn if commit hashes differ + file_commit = metadata.get('commit', 'unknown') + if file_commit != current_commit: print( f"Warning: Loading tuned configs from different commit " f"(file: {file_commit[:8]}, current: {current_commit[:8]})", file=sys.stderr) - # Reset pending metadata for next section - file_commit = None - file_tuning_space = None - file_arch = None - file_num_cu = None + # Reset metadata for next section + metadata = {} continue # Skip other comment lines @@ -350,21 +321,20 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': @staticmethod def _is_header_line(line: str) -> bool: - """Check if line is a column header (old or new format).""" - # Old format: '# arch\t...' - if line.startswith('# '): - return line[2:].startswith('arch\t') - # New format: 'testVector\t...' 
- return line.startswith('testVector\t') + """Check if line is a column header.""" + return line.startswith('# arch\t') @staticmethod def _parse_header_line(line: str) -> Dict[str, int]: """Parse column header and return name -> index mapping.""" + # Strip leading '# ' if present header_text = line[2:] if line.startswith('# ') else line indices = {} for i, col in enumerate(header_text.split('\t')): if col: - indices[col.split()[0]] = i + # Exctract base column name (handles 'perfConfig (tuning_space)') + col_name = col.split()[0] + indices[col_name] = i return indices @staticmethod @@ -385,14 +355,10 @@ def get_field(name: str) -> Optional[str]: return fields[idx] return None - # Old format: arch and numCUs are columns - if 'arch' in column_indices: - if get_field('arch') != arch: - return None - - if 'numCUs' in column_indices: - if get_field('numCUs') != str(num_cu): - return None + if get_field('arch') != arch: + return None + if get_field('numCUs') != str(num_cu): + return None test_vector = get_field('testVector') if not test_vector: @@ -652,33 +618,37 @@ def _write_header(self): if self.header_written: return - # Add a blank line if appending if self._is_appending: - print("", file=self.file) + print("", file=self.file) # Blank line before new section # Metadata comments print(f"## commit: {get_git_commit_hash()}", file=self.file) - print(f"## tuningSpace: {self.options.tuning_space_kind}", file=self.file) - print(f'## arch: {self.options.arch}', file=self.file) - print(f'## numCUs: {self.options.num_cu}', file=self.file) - print(f'## numChiplets: {self.options.num_chiplets}', file=self.file) - # TSV header - columns = ['testVector', 'perfConfig'] + # TSV header with '# ' prefix + columns = [ + 'arch', 'numCUs', 'numChiplets', 'testVector', + f'perfConfig ({self.options.tuning_space_kind})' + ] if self.options.tflops: columns.append('TFlops') columns.append('elapsedSeconds') - print("\t".join(columns), file=self.file) + print("# " + "\t".join(columns), file=self.file) self.file.flush() self.header_written = True def write_result(self, result: TuningResult): + assert result.success and result.winning_config and result.max_tflops, "write_result called with failed result" + self._write_header() - fields = [result.test_vector, result.winning_config or ""] + fields = [ + self.options.arch, + str(self.options.num_cu), + str(self.options.num_chiplets), result.test_vector, result.winning_config + ] if self.options.tflops: - fields.append(f"{result.max_tflops}" if result.max_tflops else "") + fields.append(str(result.max_tflops)) fields.append(f"{result.elapsed_seconds:.1f}") print("\t".join(fields), file=self.file) @@ -1499,21 +1469,21 @@ def main(args=None): num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) options = Options(arch=arch, - num_cu=num_cu, - num_chiplets=num_chiplets, - debug=parsed_args.debug, - quiet=parsed_args.quiet, - tuning_space_kind=parsed_args.tuning_space, - rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, - verify_mode=parsed_args.verify_mode, - verify_perfconfigs=parsed_args.verify_perf_configs, - tflops=parsed_args.tflops, - output=ensure_tsv_extension(parsed_args.output), - abort_on_error=parsed_args.abort_on_error, - retune=parsed_args.retune, - gpu_ids=parsed_args.gpus, - num_cpus=parsed_args.num_cpus, - wait_for_compiles=parsed_args.wait_for_compiles) + num_cu=num_cu, + num_chiplets=num_chiplets, + debug=parsed_args.debug, + quiet=parsed_args.quiet, + tuning_space_kind=parsed_args.tuning_space, + rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, + 
verify_mode=parsed_args.verify_mode, + verify_perfconfigs=parsed_args.verify_perf_configs, + tflops=parsed_args.tflops, + output=ensure_tsv_extension(parsed_args.output), + abort_on_error=parsed_args.abort_on_error, + retune=parsed_args.retune, + gpu_ids=parsed_args.gpus, + num_cpus=parsed_args.num_cpus, + wait_for_compiles=parsed_args.wait_for_compiles) if op_type == Operation.FUSION: op_type = extract_fusion_configs(parsed_args.test_dir, paths, options) From 44ead00f147f3458c4d729d671ba2de9b6f8da38 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 10 Jan 2026 22:48:27 +0000 Subject: [PATCH 04/23] Add state file for crash and interrupt recovery. --- mlir/utils/performance/tuningRunner.py | 301 ++++++++++++++++++++++++- 1 file changed, 295 insertions(+), 6 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index c7c7b57612b5..d86024c6689f 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -22,6 +22,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import nullcontext from dataclasses import dataclass, field +from enum import Enum from typing import Any, Dict, List, Optional from collections import deque @@ -49,7 +50,7 @@ MLIR_N_REPEATS = 10 WARMUP_ITERATIONS = 1 SLEEP_US = 100 # 0.1 ms -MAX_FAILURES = 10 +MAX_FAILURES = 20 # ============================================================================= # Configuration & Results @@ -217,6 +218,253 @@ def _parse_cpu_list(cpu_list_str: str) -> List[int]: return cpus +# ============================================================================= +# State Management +# ============================================================================= + + +class ConfigState(Enum): + """Possible states for a tuning configuration in the state file. + + State transitions: + PENDING (implicit) -> RUNNING: Config starts tuning + RUNNING -> SUCCESS (implicit): Tuning completes successfully (removed from state, written to output) + RUNNING -> FAILED: Tuning completes with error + RUNNING -> INTERRUPTED: User interrupted (Ctrl+C) during tuning + RUNNING -> CRASHED: Detected on next startup (stale RUNNING state) + FAILED/CRASHED -> PENDING: User requests retry with --retry-failed + + Note: PENDING and SUCCESS are implicit states: + - PENDING: not in state file AND not in output file + - SUCCESS: in output file (not tracked in state file) + """ + RUNNING = "running" # Currently being tuned + FAILED = "failed" # Tuning completed with error + INTERRUPTED = "interrupted" # User interrupted during tuning (Ctrl+C) + CRASHED = "crashed" # Process crashed while tuning (detected on startup) + + +@dataclass +class TuningStateContext: + """Context that identifies a tuning run. 
State is invalidated if context changes.""" + arch: str + num_cu: int + tuning_space: str + + def matches(self, other: 'TuningStateContext') -> bool: + return (self.arch == other.arch and self.num_cu == other.num_cu and + self.tuning_space == other.tuning_space) + + +@dataclass +class TuningState: + """Persistent state for tuning runs, survives crashes and interrupts.""" + context: TuningStateContext + configs: Dict[str, ConfigState] = field(default_factory=dict) + + def set_running(self, test_vector: str) -> None: + """Mark a config as currently running.""" + self.configs[test_vector] = ConfigState.RUNNING + + def set_failed(self, test_vector: str) -> None: + """Mark a config as failed.""" + self.configs[test_vector] = ConfigState.FAILED + + def set_interrupted(self, test_vector: str) -> None: + """Mark a config as interrupted by user.""" + self.configs[test_vector] = ConfigState.INTERRUPTED + + def set_crashed(self, test_vector: str) -> None: + """Mark a config as crashed.""" + self.configs[test_vector] = ConfigState.CRASHED + + def remove(self, test_vector: str) -> None: + """Remove a config from state (e.g., on success).""" + self.configs.pop(test_vector, None) + + def should_skip(self, test_vector: str) -> bool: + """Check if a config should be skipped (failed or crashed).""" + return self.configs.get(test_vector) in (ConfigState.FAILED, ConfigState.CRASHED) + + def _count_by_state(self, *states: ConfigState) -> int: + """Count configs in any of the given states.""" + return sum(1 for s in self.configs.values() if s in states) + + def failed_count(self) -> int: + """Count of failed configs.""" + return self._count_by_state(ConfigState.FAILED) + + def crashed_count(self) -> int: + """Count of crashed configs.""" + return self._count_by_state(ConfigState.CRASHED) + + def skip_count(self) -> int: + """Count of configs that should be skipped (failed + crashed).""" + return self._count_by_state(ConfigState.FAILED, ConfigState.CRASHED) + + def promote_running_to_crashed(self) -> int: + """Move all RUNNING configs to CRASHED (crash recovery). Returns count.""" + count = 0 + for tv in self.configs: + if self.configs[tv] == ConfigState.RUNNING: + self.configs[tv] = ConfigState.CRASHED + count += 1 + return count + + def promote_running_to_interrupted(self) -> int: + """Move all RUNNING configs to INTERRUPTED (clean shutdown). Returns count.""" + count = 0 + for tv in self.configs: + if self.configs[tv] == ConfigState.RUNNING: + self.configs[tv] = ConfigState.INTERRUPTED + count += 1 + return count + + +class TuningStateFile: + """Manages reading and writing of tuning state to a JSON file. + + If filepath is None, all operations are no-ops (null object pattern). + """ + + def __init__(self, filepath: Optional[str]): + self.filepath = filepath + self._lock = threading.Lock() + self._state: Optional[TuningState] = None + + def load(self, expected_context: TuningStateContext, quiet: bool = False) -> 'TuningStateFile': + """Load state from file. Returns self for chaining. 
+ + On load: + - INTERRUPTED configs are demoted to PENDING (removed from state) + - RUNNING configs are promoted to CRASHED (indicates previous crash) + """ + if not self.filepath: + self._state = TuningState(context=expected_context) + return self + + if not os.path.exists(self.filepath): + self._state = TuningState(context=expected_context) + return self + + try: + with open(self.filepath, 'r') as f: + data = json.load(f) + + file_context = TuningStateContext(arch=data.get('arch', ''), + num_cu=data.get('numCUs', 0), + tuning_space=data.get('tuningSpace', '')) + + if not file_context.matches(expected_context): + if not quiet: + print("State file context mismatch, starting fresh", file=sys.stderr) + self._state = TuningState(context=expected_context) + return self + + configs = {} + for tv, state_str in data.get('configs', {}).items(): + try: + config_state = ConfigState(state_str) + # Demote INTERRUPTED to PENDING (don't add to configs) + if config_state == ConfigState.INTERRUPTED: + continue + # Promote RUNNING to CRASHED (stale running = crash) + if config_state == ConfigState.RUNNING: + config_state = ConfigState.CRASHED + configs[tv] = config_state + except ValueError: + pass # Skip invalid states + + self._state = TuningState(context=expected_context, configs=configs) + return self + + except (json.JSONDecodeError, KeyError, TypeError) as e: + if not quiet: + print(f"Warning: Failed to load state file: {e}", file=sys.stderr) + self._state = TuningState(context=expected_context) + return self + + @property + def state(self) -> TuningState: + """Get the current state. Must call load() first.""" + if self._state is None: + raise RuntimeError("State not loaded. Call load() first.") + return self._state + + def _save_locked(self) -> None: + """Save state to file atomically. Assumes lock is held.""" + if not self.filepath or not self._state: + return + + data = { + 'arch': self._state.context.arch, + 'numCUs': self._state.context.num_cu, + 'tuningSpace': self._state.context.tuning_space, + 'configs': { + tv: s.value for tv, s in self._state.configs.items() + } + } + + # Write to temp file then rename for atomicity + temp_path = self.filepath + '.tmp' + with open(temp_path, 'w') as f: + json.dump(data, f, indent=2) + os.replace(temp_path, self.filepath) + + def save(self) -> None: + """Save state to file atomically. No-op if filepath is None.""" + with self._lock: + self._save_locked() + + def delete(self) -> None: + """Delete the state file. No-op if filepath is None.""" + if not self.filepath: + return + + with self._lock: + if os.path.exists(self.filepath): + os.remove(self.filepath) + self._state = None + + def set_running(self, test_vector: str) -> None: + """Mark a config as running and save.""" + if self._state: + with self._lock: + self._state.set_running(test_vector) + self._save_locked() + + def set_failed(self, test_vector: str) -> None: + """Mark a config as failed and save.""" + if self._state: + with self._lock: + self._state.set_failed(test_vector) + self._save_locked() + + def set_success(self, test_vector: str) -> None: + """Remove a config from state (success) and save.""" + if self._state: + with self._lock: + self._state.remove(test_vector) + self._save_locked() + + def finalize_interrupted(self, quiet: bool = False) -> None: + """Mark any RUNNING configs as INTERRUPTED and save. 
Called on clean shutdown.""" + if self._state: + with self._lock: + interrupted_count = self._state.promote_running_to_interrupted() + if interrupted_count > 0 and not quiet: + print(f"Marked {interrupted_count} running config(s) as interrupted", + file=sys.stderr) + self._save_locked() + + +def get_state_filepath(output_filepath: str) -> Optional[str]: + """Get the state file path for a given output file.""" + if output_filepath == '-': + return None + return f"{output_filepath}.state" + + # ============================================================================= # Tuning Infrastructure # ============================================================================= @@ -669,6 +917,7 @@ def __init__(self, filepath: str): self.header_written = False def __enter__(self): + self.header_written = os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0 self.file = open(self.filepath, 'a') return self @@ -1070,12 +1319,44 @@ def tune_configs(ctx: TuningContext) -> bool: if cache.count() > 0 and not ctx.options.quiet: print(f"Found {cache.count()} tuned config(s) in {ctx.options.output}", file=sys.stderr) + # Load state file + state_context = TuningStateContext(arch=ctx.options.arch, + num_cu=ctx.options.num_cu, + tuning_space=ctx.options.tuning_space_kind) + state_file = TuningStateFile(get_state_filepath(ctx.options.output)) + + if ctx.options.retune: + state_file.delete() + + state_file.load(state_context, ctx.options.quiet) + state = state_file.state + + if not ctx.options.retune: + crashed_count = state.crashed_count() + if crashed_count > 0 and not ctx.options.quiet: + print(f"Detected {crashed_count} crashed config(s) from previous run", file=sys.stderr) + + if state.skip_count() > 0 and not ctx.options.quiet: + print(f"Found {state.skip_count()} failed/crashed config(s) in state file", + file=sys.stderr) + + state_file.save() + # Filter out already-tuned configs pending_configs = [c for c in ctx.configs if not cache.contains(c)] - skipped_count = len(ctx.configs) - len(pending_configs) - if skipped_count > 0 and not ctx.options.quiet: - print(f"Skipping {skipped_count} of {len(ctx.configs)} already tuned config(s)", - file=sys.stderr) + skipped_success = len(ctx.configs) - len(pending_configs) + + # Filter out failed/crashed configs from state file + before_filter = len(pending_configs) + pending_configs = [c for c in pending_configs if not state.should_skip(c)] + skipped_failed = before_filter - len(pending_configs) + + total_skipped = skipped_success + skipped_failed + + if skipped_success > 0 and not ctx.options.quiet: + print(f"Skipping {skipped_success} already tuned config(s)", file=sys.stderr) + if skipped_failed > 0 and not ctx.options.quiet: + print(f"Skipping {skipped_failed} failed/crashed config(s)", file=sys.stderr) if not pending_configs: print("All configurations already tuned", file=sys.stderr) @@ -1087,6 +1368,9 @@ def tune_configs(ctx: TuningContext) -> bool: def execute_tuning_task(test_vector: str) -> TuningResult: gpu_id = pool.acquire_gpu_for_thread() + + state_file.set_running(test_vector) + start_time = time.time() compile_threads = ctx.get_compile_threads(gpu_id) result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, @@ -1118,7 +1402,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: progress_bar = tqdm( total=len(ctx.configs), - initial=skipped_count, + initial=total_skipped, disable=ctx.options.quiet, file=sys.stderr, desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", @@ -1145,9 
+1429,12 @@ def execute_tuning_task(test_vector: str) -> TuningResult: results_writer.write_result(result) if debug_writer: debug_writer.write_entries(result.entries) + state_file.set_success(result.test_vector) else: has_errors = True consecutive_failures += 1 + state_file.set_failed(result.test_vector) + error_text = result.error or "Unknown error" formatted_error = f"[GPU {result.gpu_id}] Error tuning {result.test_vector}\n" + '\n'.join( f"\t{line}" for line in error_text.splitlines()) @@ -1178,6 +1465,8 @@ def execute_tuning_task(test_vector: str) -> TuningResult: if progress_bar: progress_bar.close() + state_file.finalize_interrupted(ctx.options.quiet) + # ============================================================================= # Configuration Loading From 8a9345641987a30e99971efb2bb8cc359f037967 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 10 Jan 2026 23:05:00 +0000 Subject: [PATCH 05/23] Add --retry-failed option. --- mlir/utils/performance/tuningRunner.py | 47 +++++++++++++++----------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index d86024c6689f..fb3402ec8d8b 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -73,6 +73,7 @@ class Options: output: str abort_on_error: bool retune: bool + retry_failed: bool gpu_ids: List[int] num_cpus: Optional[int] wait_for_compiles: bool @@ -652,6 +653,7 @@ class ETATracker: num_workers: int initial_times: List[float] = field(default_factory=list) initial_ok_count: int = 0 + initial_fail_count: int = 0 _success_times: List[float] = field(default_factory=list, init=False) _processed: int = field(default=0, init=False) _ok_count: int = field(default=0, init=False) @@ -660,6 +662,7 @@ class ETATracker: def __post_init__(self): self._success_times = list(self.initial_times) self._ok_count = self.initial_ok_count + self._fail_count = self.initial_fail_count def record(self, result: TuningResult) -> None: self._processed += 1 @@ -1324,32 +1327,31 @@ def tune_configs(ctx: TuningContext) -> bool: num_cu=ctx.options.num_cu, tuning_space=ctx.options.tuning_space_kind) state_file = TuningStateFile(get_state_filepath(ctx.options.output)) - - if ctx.options.retune: - state_file.delete() - state_file.load(state_context, ctx.options.quiet) state = state_file.state - if not ctx.options.retune: - crashed_count = state.crashed_count() - if crashed_count > 0 and not ctx.options.quiet: - print(f"Detected {crashed_count} crashed config(s) from previous run", file=sys.stderr) + crashed_count = state.crashed_count() + if crashed_count > 0 and not ctx.options.quiet: + print(f"Detected {crashed_count} crashed config(s) from previous run", file=sys.stderr) - if state.skip_count() > 0 and not ctx.options.quiet: - print(f"Found {state.skip_count()} failed/crashed config(s) in state file", - file=sys.stderr) + if state.skip_count() > 0 and not ctx.options.quiet: + print(f"Found {state.skip_count()} failed/crashed config(s) in state file", file=sys.stderr) state_file.save() - # Filter out already-tuned configs - pending_configs = [c for c in ctx.configs if not cache.contains(c)] - skipped_success = len(ctx.configs) - len(pending_configs) + # Filter out already-tuned configs (unless --retune) + pending_configs = ctx.configs + skipped_success = 0 + if not ctx.options.retune: + pending_configs = [c for c in pending_configs if not cache.contains(c)] + skipped_success = len(ctx.configs) - len(pending_configs) - # 
Filter out failed/crashed configs from state file - before_filter = len(pending_configs) - pending_configs = [c for c in pending_configs if not state.should_skip(c)] - skipped_failed = before_filter - len(pending_configs) + # Filter out failed/crashed configs (unless --retry-failed or --retune) + skipped_failed = 0 + if not ctx.options.retry_failed and not ctx.options.retune: + before_filter = len(pending_configs) + pending_configs = [c for c in pending_configs if not state.should_skip(c)] + skipped_failed = before_filter - len(pending_configs) total_skipped = skipped_success + skipped_failed @@ -1398,7 +1400,8 @@ def execute_tuning_task(test_vector: str) -> TuningResult: eta_tracker = ETATracker(total_configs=len(pending_configs), num_workers=num_workers, initial_times=initial_times, - initial_ok_count=cache.count()) + initial_ok_count=skipped_success, + initial_fail_count=skipped_failed) progress_bar = tqdm( total=len(ctx.configs), @@ -1703,6 +1706,11 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N default=False, help="Force retuning of all configs, ignoring existing results in the output file") + parser.add_argument("--retry-failed", + action='store_true', + default=False, + help="Retry previously failed/crashed configs instead of skipping them") + parser.add_argument("--gpus", type=int, nargs='+', @@ -1770,6 +1778,7 @@ def main(args=None): output=ensure_tsv_extension(parsed_args.output), abort_on_error=parsed_args.abort_on_error, retune=parsed_args.retune, + retry_failed=parsed_args.retry_failed, gpu_ids=parsed_args.gpus, num_cpus=parsed_args.num_cpus, wait_for_compiles=parsed_args.wait_for_compiles) From aeedb2a5bf52a9e39f5ebaa6165c8353ac82eadc Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sun, 11 Jan 2026 23:35:47 +0000 Subject: [PATCH 06/23] Use proper python logger. 
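
Plain stderr prints and the tqdm progress bar used to fight over the
terminal; this patch routes diagnostics through a logging.Logger whose
handler calls tqdm.write(), so messages appear above an active bar
instead of corrupting it. A condensed, self-contained sketch of the idea
follows (the real TqdmLoggingHandler and setup_logger() live in the diff
below; the demo loop is illustrative only):

    import logging
    import sys
    import time

    from tqdm import tqdm

    class TqdmLoggingHandler(logging.Handler):
        """Emit log records via tqdm.write() so an active progress bar
        is redrawn after the message rather than being clobbered."""

        def emit(self, record):
            try:
                tqdm.write(self.format(record), file=sys.stderr)
            except Exception:
                self.handleError(record)

    logger = logging.getLogger("demo")
    handler = TqdmLoggingHandler()
    handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    for i in tqdm(range(5), file=sys.stderr, desc="Tuning"):
        if i == 2:
            logger.warning("example warning emitted mid-run")
        time.sleep(0.1)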
--- mlir/utils/performance/tuningRunner.py | 249 +++++++++++++------------ 1 file changed, 129 insertions(+), 120 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index fb3402ec8d8b..7a587b759f13 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -12,6 +12,8 @@ import argparse import glob +import json +import logging import os import statistics import subprocess @@ -26,7 +28,6 @@ from typing import Any, Dict, List, Optional from collections import deque -import json import numpy as np import pandas as pd from tqdm import tqdm @@ -52,6 +53,46 @@ SLEEP_US = 100 # 0.1 ms MAX_FAILURES = 20 +# ============================================================================= +# Logging Setup +# ============================================================================= + + +class TqdmLoggingHandler(logging.Handler): + """Logging handler that uses tqdm.write() to avoid corrupting progress bars.""" + + def emit(self, record): + try: + msg = self.format(record) + tqdm.write(msg, file=sys.stderr) + except Exception: + self.handleError(record) + + +def setup_logger(verbose: bool = False) -> logging.Logger: + """Configure and return a logger for tuningRunner.""" + log = logging.getLogger("tuningRunner") + log.setLevel(logging.DEBUG if verbose else logging.INFO) + + # Clear any existing handlers + log.handlers.clear() + + # Use tqdm-aware handler + handler = TqdmLoggingHandler() + handler.setLevel(logging.DEBUG if verbose else logging.INFO) + + # Simple format: level and message + formatter = logging.Formatter('%(levelname)s: %(message)s') + handler.setFormatter(formatter) + + log.addHandler(handler) + + return log + + +# Module-level logger +logger: logging.Logger = setup_logger() + # ============================================================================= # Configuration & Results # ============================================================================= @@ -62,7 +103,7 @@ class Options: """Configuration options for the tuning process.""" debug: bool tuning_space_kind: str - quiet: bool + verbose: bool arch: str num_cu: int num_chiplets: int @@ -157,19 +198,19 @@ def discover() -> 'GpuTopology': gpus[gpu_id] = Gpu(gpu_id=gpu_id, sku=sku, numa_node=numa_node) if gpus: return GpuTopology(gpus=gpus) - print("Warning: rocm-smi returned no GPU cards", file=sys.stderr) + logger.warning("rocm-smi returned no GPU cards") except subprocess.CalledProcessError as e: - print(f"Warning: rocm-smi failed with return code {e.returncode}", file=sys.stderr) + logger.warning(f"rocm-smi failed with return code {e.returncode}") except subprocess.TimeoutExpired: - print("Warning: rocm-smi timed out", file=sys.stderr) + logger.warning("rocm-smi timed out") except FileNotFoundError: - print("Warning: rocm-smi not found in PATH", file=sys.stderr) + logger.warning("rocm-smi not found in PATH") except json.JSONDecodeError as e: - print(f"Warning: Failed to parse rocm-smi JSON output: {e}", file=sys.stderr) + logger.warning(f"Failed to parse rocm-smi JSON output: {e}") except (ValueError, KeyError) as e: - print(f"Warning: Failed to extract GPU info from rocm-smi output: {e}", file=sys.stderr) + logger.warning(f"Failed to extract GPU info from rocm-smi output: {e}") - print("Warning: Could not detect GPUs, defaulting to GPU 0", file=sys.stderr) + logger.warning("Could not detect GPUs, defaulting to GPU 0") return GpuTopology(gpus={0: Gpu(gpu_id=0, sku="unknown", numa_node=0)}) @@ -199,6 +240,8 @@ def discover() -> 
'NumaTopology': if os.path.exists(cpulist_path): with open(cpulist_path, 'r') as f: numa_to_cpus[node_id] = NumaTopology._parse_cpu_list(f.read()) + else: + logger.warning(f"Missing cpulist for NUMA node {node_id}") # Fallback: single node with all CPUs if not numa_to_cpus: @@ -303,15 +346,6 @@ def skip_count(self) -> int: """Count of configs that should be skipped (failed + crashed).""" return self._count_by_state(ConfigState.FAILED, ConfigState.CRASHED) - def promote_running_to_crashed(self) -> int: - """Move all RUNNING configs to CRASHED (crash recovery). Returns count.""" - count = 0 - for tv in self.configs: - if self.configs[tv] == ConfigState.RUNNING: - self.configs[tv] = ConfigState.CRASHED - count += 1 - return count - def promote_running_to_interrupted(self) -> int: """Move all RUNNING configs to INTERRUPTED (clean shutdown). Returns count.""" count = 0 @@ -333,7 +367,7 @@ def __init__(self, filepath: Optional[str]): self._lock = threading.Lock() self._state: Optional[TuningState] = None - def load(self, expected_context: TuningStateContext, quiet: bool = False) -> 'TuningStateFile': + def load(self, expected_context: TuningStateContext) -> 'TuningStateFile': """Load state from file. Returns self for chaining. On load: @@ -357,8 +391,7 @@ def load(self, expected_context: TuningStateContext, quiet: bool = False) -> 'Tu tuning_space=data.get('tuningSpace', '')) if not file_context.matches(expected_context): - if not quiet: - print("State file context mismatch, starting fresh", file=sys.stderr) + logger.info("State file context mismatch, starting fresh") self._state = TuningState(context=expected_context) return self @@ -380,8 +413,7 @@ def load(self, expected_context: TuningStateContext, quiet: bool = False) -> 'Tu return self except (json.JSONDecodeError, KeyError, TypeError) as e: - if not quiet: - print(f"Warning: Failed to load state file: {e}", file=sys.stderr) + logger.warning(f"Failed to load state file: {e}") self._state = TuningState(context=expected_context) return self @@ -448,14 +480,13 @@ def set_success(self, test_vector: str) -> None: self._state.remove(test_vector) self._save_locked() - def finalize_interrupted(self, quiet: bool = False) -> None: + def finalize_interrupted(self) -> None: """Mark any RUNNING configs as INTERRUPTED and save. 
Called on clean shutdown.""" if self._state: with self._lock: interrupted_count = self._state.promote_running_to_interrupted() - if interrupted_count > 0 and not quiet: - print(f"Marked {interrupted_count} running config(s) as interrupted", - file=sys.stderr) + if interrupted_count > 0: + logger.info(f"Marked {interrupted_count} running config(s) as interrupted") self._save_locked() @@ -538,10 +569,9 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': # Warn if commit hashes differ file_commit = metadata.get('commit', 'unknown') if file_commit != current_commit: - print( - f"Warning: Loading tuned configs from different commit " - f"(file: {file_commit[:8]}, current: {current_commit[:8]})", - file=sys.stderr) + logger.warning( + f"Loading tuned configs from different commit " + f"(file: {file_commit[:8]}, current: {current_commit[:8]})") # Reset metadata for next section metadata = {} @@ -562,9 +592,7 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': cache._results[result.test_vector] = result except Exception as e: - if not options.quiet: - print(f"Warning: Failed to load existing tuning results from {options.output}: {e}", - file=sys.stderr) + logger.warning(f"Failed to load existing tuning results from {options.output}: {e}") return cache @@ -747,11 +775,10 @@ def _compute_thread_allocation(self) -> Dict[int, int]: scale_factor = self.options.num_cpus / total_allocated for gpu_id in allocation: allocation[gpu_id] = max(1, int(allocation[gpu_id] * scale_factor)) - elif not self.options.quiet: - print( - f"Note: --num-cpus={self.options.num_cpus} exceeds optimal {total_allocated}, " - f"using optimal allocation", - file=sys.stderr) + else: + logger.info( + f"--num-cpus={self.options.num_cpus} exceeds optimal {total_allocated}, " + f"using optimal allocation") return allocation @@ -760,15 +787,13 @@ def get_compile_threads(self, gpu_id: int) -> int: return self._threads_per_gpu.get(gpu_id, 1) def print_gpu_summary(self): - """Print summary of GPU allocation to stderr.""" - if self.options.quiet: - return + """Print summary of GPU allocation.""" num_active = len(self.options.gpu_ids) - print(f"Using {num_active} GPU(s):", file=sys.stderr) + logger.info(f"Using {num_active} GPU(s):") for gpu_id in self.options.gpu_ids[:num_active]: node = self.gpu_topology.get_numa_node(gpu_id) threads = self._threads_per_gpu.get(gpu_id, 1) - print(f" GPU {gpu_id}: NUMA node {node}, {threads} compile threads", file=sys.stderr) + logger.info(f" GPU {gpu_id}: NUMA node {node}, {threads} compile threads") class GpuWorkerPool: @@ -811,8 +836,7 @@ def _apply_numa_affinity(self, gpu_id: int) -> None: try: os.sched_setaffinity(0, set(cpu_list)) except OSError: - if not self._ctx.options.quiet: - print(f"Warning: Could not set CPU affinity for GPU {gpu_id}", file=sys.stderr) + logger.warning(f"Could not set CPU affinity for GPU {gpu_id}") self._set_memory_policy(node) @@ -849,7 +873,7 @@ def __init__(self, filepath: str, options: Options): self.filepath = filepath self.options = options self.file = None - self.header_written = False + self._header_written = False self._is_appending = False def __enter__(self): @@ -866,7 +890,7 @@ def __exit__(self, exc_type, exc_value, traceback): self.file.close() def _write_header(self): - if self.header_written: + if self._header_written: return if self._is_appending: @@ -886,7 +910,7 @@ def _write_header(self): print("# " + "\t".join(columns), file=self.file) self.file.flush() - self.header_written = True + self._header_written = True def 
write_result(self, result: TuningResult): assert result.success and result.winning_config and result.max_tflops, "write_result called with failed result" @@ -905,11 +929,6 @@ def write_result(self, result: TuningResult): self.file.flush() - def write_error(self, content: str): - self._write_header() - print('\n'.join(f"### {line}" for line in content.splitlines()), file=self.file) - self.file.flush() - class DebugFileWriter: """Context manager for writing debug entries to TSV file.""" @@ -917,10 +936,10 @@ class DebugFileWriter: def __init__(self, filepath: str): self.filepath = filepath self.file = None - self.header_written = False + self._header_written = False def __enter__(self): - self.header_written = os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0 + self._header_written = os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0 self.file = open(self.filepath, 'a') return self @@ -935,11 +954,11 @@ def write_entries(self, entries: List[Dict]): pd.DataFrame(entries).to_csv(self.file, sep='\t', mode='a', - header=not self.header_written, + header=not self._header_written, index=False) self.file.flush() - self.header_written = True + self._header_written = True # ============================================================================= @@ -978,15 +997,6 @@ def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, values) -def ensure_tsv_extension(filepath: str) -> str: - """Ensure filepath has .tsv extension, unless it's stdout.""" - if filepath == '-': - return filepath - if not filepath.endswith('.tsv'): - return filepath + '.tsv' - return filepath - - def get_git_commit_hash() -> str: """Get the current git commit hash.""" try: @@ -1032,9 +1042,9 @@ def kill_process(proc) -> None: proc.kill() proc.wait(timeout=10) except subprocess.TimeoutExpired: - print(f"Warning: Process {proc.pid} did not terminate in time after kill", file=sys.stderr) + logger.warning(f"Process {proc.pid} did not terminate in time after kill") except Exception as e: - print(f"Warning: Failed to kill process {proc.pid}: {e}", file=sys.stderr) + logger.warning(f"Failed to kill process {proc.pid}: {e}") # ============================================================================= @@ -1073,9 +1083,7 @@ def verify_perfconfig(perfconfig, config, paths: Paths, options: Options, gpu_id ]) debug_info = f"[GPU {gpu_id}] Verification pipeline:\n" + verification_pipeline - - if not options.quiet and options.debug: - print(debug_info, file=sys.stderr) + logger.debug(debug_info) with tempfile.TemporaryDirectory() as tmpdir: p1 = None @@ -1260,9 +1268,7 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: tuning_pipeline = ' '.join(tuning_driver_command) debug_info = f"[GPU {gpu_id}] Tuning '{test_vector}':\n" + tuning_pipeline - - if not options.quiet and options.debug: - print(debug_info, file=sys.stderr) + logger.debug(debug_info) # Note: communicate waits for process to terminate which might cause CI timeouts if tuning takes too long tuning_stdout, tuning_stderr = tuning_driver.communicate() @@ -1319,23 +1325,24 @@ def tune_configs(ctx: TuningContext) -> bool: cache = TunedConfigsCache() if not ctx.options.retune: cache = TunedConfigsCache.from_output_file(ctx.options) - if cache.count() > 0 and not ctx.options.quiet: - print(f"Found {cache.count()} tuned config(s) in {ctx.options.output}", file=sys.stderr) + if cache.count() > 0: + logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") # 
Load state file state_context = TuningStateContext(arch=ctx.options.arch, num_cu=ctx.options.num_cu, tuning_space=ctx.options.tuning_space_kind) state_file = TuningStateFile(get_state_filepath(ctx.options.output)) - state_file.load(state_context, ctx.options.quiet) + state_file.load(state_context) state = state_file.state crashed_count = state.crashed_count() - if crashed_count > 0 and not ctx.options.quiet: - print(f"Detected {crashed_count} crashed config(s) from previous run", file=sys.stderr) + if crashed_count > 0: + logger.warning(f"Detected {crashed_count} crashed config(s) from previous run") - if state.skip_count() > 0 and not ctx.options.quiet: - print(f"Found {state.skip_count()} failed/crashed config(s) in state file", file=sys.stderr) + failed_count = state.failed_count() + if failed_count > 0: + logger.info(f"Found {failed_count} failed config(s) in state file") state_file.save() @@ -1355,13 +1362,13 @@ def tune_configs(ctx: TuningContext) -> bool: total_skipped = skipped_success + skipped_failed - if skipped_success > 0 and not ctx.options.quiet: - print(f"Skipping {skipped_success} already tuned config(s)", file=sys.stderr) - if skipped_failed > 0 and not ctx.options.quiet: - print(f"Skipping {skipped_failed} failed/crashed config(s)", file=sys.stderr) + if skipped_success > 0: + logger.info(f"Skipping {skipped_success} already tuned config(s)") + if skipped_failed > 0: + logger.info(f"Skipping {skipped_failed} failed/crashed config(s)") if not pending_configs: - print("All configurations already tuned", file=sys.stderr) + logger.info("No configurations to tune") return True pool = GpuWorkerPool(ctx) @@ -1390,6 +1397,8 @@ def execute_tuning_task(test_vector: str) -> TuningResult: executor = None progress_bar = None + has_errors = False + with OutputFileWriter(ctx.options.output, ctx.options) as results_writer: with DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext( ) as debug_writer: @@ -1406,7 +1415,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: progress_bar = tqdm( total=len(ctx.configs), initial=total_skipped, - disable=ctx.options.quiet, + disable=not sys.stderr.isatty(), file=sys.stderr, desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", unit="config", @@ -1421,7 +1430,6 @@ def execute_tuning_task(test_vector: str) -> TuningResult: for test_vector in pending_configs } - has_errors = False consecutive_failures = 0 for completed_future in as_completed(pending_futures): @@ -1441,34 +1449,36 @@ def execute_tuning_task(test_vector: str) -> TuningResult: error_text = result.error or "Unknown error" formatted_error = f"[GPU {result.gpu_id}] Error tuning {result.test_vector}\n" + '\n'.join( f"\t{line}" for line in error_text.splitlines()) - print(formatted_error, file=sys.stderr) - results_writer.write_error(formatted_error) + logger.error(formatted_error) if ctx.options.abort_on_error: return False if consecutive_failures >= MAX_FAILURES: - print("Aborting due to too many consecutive failures", file=sys.stderr) + logger.error("Aborting due to too many consecutive failures") return False eta_tracker.record(result) progress_bar.update(1) progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) - if has_errors: - print("Encountered errors during tuning", file=sys.stderr) - else: - print("Tuning completed successfully", file=sys.stderr) - - return not has_errors - + except KeyboardInterrupt: + logger.info("Tuning interrupted by user") + raise finally: if executor: executor.shutdown(wait=False, 
cancel_futures=True) if progress_bar: progress_bar.close() - state_file.finalize_interrupted(ctx.options.quiet) + state_file.finalize_interrupted() + + if has_errors: + logger.warning("Encountered errors during tuning") + else: + logger.info("Tuning completed successfully") + + return not has_errors # ============================================================================= @@ -1487,13 +1497,12 @@ def resolve_paths(op_type: Operation, parsed_args) -> Paths: return perfRunner.create_paths(configs_path, parsed_args.mlir_build_dir) -def extract_fusion_configs(test_dir, paths: Paths, options: Options) -> Operation: +def extract_fusion_configs(test_dir, paths: Paths) -> Operation: """Extract tuning configurations from fusion E2E test files.""" all_configs = [] op_type = Operation.FUSION for filename in glob.glob(test_dir + '/*mlir'): - if not options.quiet: - print("Extract from:", filename, file=sys.stderr) + logger.info(f"Extract from: {filename}") test_entry = perfRunner.get_fusion_test_info(filename, paths) if not test_entry: continue @@ -1501,23 +1510,20 @@ def extract_fusion_configs(test_dir, paths: Paths, options: Options) -> Operatio if not test_vector: continue if test_vector in all_configs: - if not options.quiet: - print("An entry already exists in the tuning DB", file=sys.stderr) + logger.info("An entry already exists in the tuning DB") continue command_line = test_vector.split(sep=' ') if command_line[0].startswith('conv'): if op_type == Operation.FUSION: op_type = Operation.CONV elif op_type != Operation.CONV: - if not options.quiet: - print("Invalid config op: ", test_vector, file=sys.stderr) + logger.warning(f"Invalid config op: {test_vector}") continue else: if op_type == Operation.FUSION: op_type = Operation.GEMM elif op_type != Operation.GEMM: - if not options.quiet: - print("Invalid config op: ", test_vector, file=sys.stderr) + logger.warning(f"Invalid config op: {test_vector}") continue all_configs.append(test_vector) @@ -1645,12 +1651,11 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N choices=["quick", "full", "greedy", "exhaustive"], help="Tuning space kind to use") - parser.add_argument( - "-q", - "--quiet", - action='store_true', - default=False, - help="Suppress progress bars and informational messages, showing only errors") + parser.add_argument("-v", + "--verbose", + action='store_true', + default=False, + help="Enable verbose output, including commands being executed") parser.add_argument("--verify-mode", default="gpu", @@ -1737,6 +1742,8 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N def main(args=None): + global logger + gpu_topology = GpuTopology.discover() available_gpus = sorted(gpu_topology.gpus.keys()) @@ -1746,6 +1753,9 @@ def main(args=None): parsed_args = parse_arguments(gpu_topology, available_gpus, args) + if parsed_args.verbose: + logger = setup_logger(verbose=parsed_args.verbose) + stdin_temp_file = None try: # Handle stdin for configs file @@ -1757,7 +1767,7 @@ def main(args=None): paths = resolve_paths(op_type, parsed_args) if not paths.mlir_paths: - print("rocMLIR build dir was not provided/found", file=sys.stderr) + logger.error("rocMLIR build dir was not provided/found") return 1 arch = perfRunner.get_arch() @@ -1769,13 +1779,13 @@ def main(args=None): num_cu=num_cu, num_chiplets=num_chiplets, debug=parsed_args.debug, - quiet=parsed_args.quiet, + verbose=parsed_args.verbose, tuning_space_kind=parsed_args.tuning_space, rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, 
verify_mode=parsed_args.verify_mode, verify_perfconfigs=parsed_args.verify_perf_configs, tflops=parsed_args.tflops, - output=ensure_tsv_extension(parsed_args.output), + output=parsed_args.output, abort_on_error=parsed_args.abort_on_error, retune=parsed_args.retune, retry_failed=parsed_args.retry_failed, @@ -1784,7 +1794,7 @@ def main(args=None): wait_for_compiles=parsed_args.wait_for_compiles) if op_type == Operation.FUSION: - op_type = extract_fusion_configs(parsed_args.test_dir, paths, options) + op_type = extract_fusion_configs(parsed_args.test_dir, paths) conf_class = get_config_class(op_type) configs = load_configs(op_type, parsed_args, paths) @@ -1800,7 +1810,6 @@ def main(args=None): return 0 if tuning_succeeded else 1 except KeyboardInterrupt: - print("Tuning interrupted by user", file=sys.stderr) return 130 # 128 + SIGINT finally: if stdin_temp_file: From d54ab9e348a4e956cf0bd9ec481524cbd80e8f20 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 12 Jan 2026 02:28:16 +0000 Subject: [PATCH 07/23] Improve readability of output. --- mlir/utils/performance/tuningRunner.py | 313 ++++++++++++++++--------- 1 file changed, 198 insertions(+), 115 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 7a587b759f13..644243bedf50 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -4,10 +4,23 @@ This script tunes MLIR kernels by running them with different performance configurations and selecting the best one based on execution time. Usage examples: - python3 tuningRunner.py --op gemm --configs-file=../mlir/utils/performance/configs/tier1-gemm-configs --output=tuning_db.tsv - python3 tuningRunner.py --op gemm --config="-g 3 -m 1024 -k 769 -n 512 -t f32 -transA 0 -transB 0" - python3 tuningRunner.py --op conv --tuning-space=quick --config="conv -F 1 -f NCHW -I NCHW -O NCHW -n 256 -c 1024 -H 14 -W 14 -k 2048 -y 1 -x 1 -p 0 -q 0 -u 2 -v 2 -l 1 -j 1 -m conv -g 1 -t 1" - python3 tuningRunner.py --op fusion --test-dir=../mlir/test/fusion/resnet50-e2e --output=tuning_db.tsv + # Tune GEMM configs from a file + python3 tuningRunner.py --op gemm -c configs/tier1-gemm-configs -o tuning_db.tsv + + # Tune a single GEMM config + python3 tuningRunner.py --op gemm --config "-g 3 -m 1024 -k 769 -n 512 -t f32 -transA 0 -transB 0" + + # Quick-tune CONV configs from a file + python3 tuningRunner.py --op conv -c configs/tier1-conv-configs --tuning-space quick + + # Use a subset of available GPUs + python3 tuningRunner.py --op gemm -c configs/tier1-gemm-configs --gpus 2 3 + + # Tune fusion ops from E2E test directory + python3 tuningRunner.py --op fusion --test-dir ../mlir/test/fusion/resnet50-e2e + + # Pipe configs from stdin + cat configs/tier1-gemm-configs | python3 tuningRunner.py --op gemm -c - -o tuning_db.tsv """ import argparse @@ -57,14 +70,44 @@ # Logging Setup # ============================================================================= +# ANSI color codes +_LOG_COLORS = { + logging.DEBUG: '\033[36m', # Cyan + logging.INFO: '\033[34m', # Blue + logging.WARNING: '\033[33m', # Yellow + logging.ERROR: '\033[91m', # Red + logging.CRITICAL: '\033[91m', # Red +} +_COLOR_RESET = '\033[0m' + class TqdmLoggingHandler(logging.Handler): """Logging handler that uses tqdm.write() to avoid corrupting progress bars.""" + def __init__(self, use_color: bool = False): + super().__init__() + self.use_color = use_color + def emit(self, record): try: - msg = self.format(record) - tqdm.write(msg, file=sys.stderr) 
+ msg = record.getMessage() + levelname = record.levelname + + if self.use_color: + color = _LOG_COLORS.get(record.levelno, '') + prefix = f"{color}{levelname}{_COLOR_RESET}: " + else: + prefix = f"{levelname}: " + + indent = ' ' * 4 + lines = msg.splitlines() + if len(lines) == 1: + formatted = prefix + lines[0] + else: + formatted = prefix + lines[0] + '\n' + '\n'.join( + indent + line for line in lines[1:]) + + tqdm.write(formatted, file=sys.stderr) except Exception: self.handleError(record) @@ -74,17 +117,12 @@ def setup_logger(verbose: bool = False) -> logging.Logger: log = logging.getLogger("tuningRunner") log.setLevel(logging.DEBUG if verbose else logging.INFO) - # Clear any existing handlers log.handlers.clear() - # Use tqdm-aware handler - handler = TqdmLoggingHandler() + use_color = sys.stderr.isatty() + handler = TqdmLoggingHandler(use_color=use_color) handler.setLevel(logging.DEBUG if verbose else logging.INFO) - # Simple format: level and message - formatter = logging.Formatter('%(levelname)s: %(message)s') - handler.setFormatter(formatter) - log.addHandler(handler) return log @@ -179,8 +217,7 @@ def validate_homogeneity(self, gpu_ids: List[int]) -> bool: def discover() -> 'GpuTopology': """Query GPU topology using rocm-smi. - rocm-smi reports physical device IDs regardless of environment variables - (e.g., ROCR_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES). + rocm-smi reports physical device IDs regardless of environment variables (e.g., ROCR_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES). """ try: output = subprocess.check_output( @@ -570,8 +607,8 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': file_commit = metadata.get('commit', 'unknown') if file_commit != current_commit: logger.warning( - f"Loading tuned configs from different commit " - f"(file: {file_commit[:8]}, current: {current_commit[:8]})") + f"Loading tuned configs from different commit (file: {file_commit[:8]}, current: {current_commit[:8]})" + ) # Reset metadata for next section metadata = {} @@ -777,8 +814,8 @@ def _compute_thread_allocation(self) -> Dict[int, int]: allocation[gpu_id] = max(1, int(allocation[gpu_id] * scale_factor)) else: logger.info( - f"--num-cpus={self.options.num_cpus} exceeds optimal {total_allocated}, " - f"using optimal allocation") + f"--num-cpus={self.options.num_cpus} exceeds optimal {total_allocated}, using optimal allocation" + ) return allocation @@ -789,11 +826,12 @@ def get_compile_threads(self, gpu_id: int) -> int: def print_gpu_summary(self): """Print summary of GPU allocation.""" num_active = len(self.options.gpu_ids) - logger.info(f"Using {num_active} GPU(s):") + lines = [f"Using {num_active} GPU(s)"] for gpu_id in self.options.gpu_ids[:num_active]: node = self.gpu_topology.get_numa_node(gpu_id) threads = self._threads_per_gpu.get(gpu_id, 1) - logger.info(f" GPU {gpu_id}: NUMA node {node}, {threads} compile threads") + lines.append(f"GPU {gpu_id}: NUMA node {node}, {threads} compile threads") + logger.info("\n".join(lines)) class GpuWorkerPool: @@ -814,7 +852,6 @@ def acquire_gpu_for_thread(self) -> int: """Assign a GPU to the calling thread if not already assigned. Also pins the thread to CPUs on the GPU's NUMA node for better memory locality. - Returns the assigned GPU ID. 
""" if hasattr(self._worker_state, 'assigned_gpu'): return self._worker_state.assigned_gpu @@ -1047,6 +1084,47 @@ def kill_process(proc) -> None: logger.warning(f"Failed to kill process {proc.pid}: {e}") +def format_error(context: str, + command: str = None, + stdout: str = None, + stderr: str = None, + exit_code: int = None, + gpu_id: int = None, + max_lines: int = 10) -> str: + """Format an error message with optional details.""" + + def truncate(text: str) -> str: + if not text or not text.strip(): + return None + lines = text.strip().splitlines() + if len(lines) <= max_lines: + return text.strip() + half = max_lines // 2 + return '\n'.join(lines[:half] + [f'... ({len(lines) - max_lines} lines omitted) ...'] + + lines[-half:]) + + parts = [context] + + if exit_code is not None: + parts.append(f"Exit code: {exit_code}") + + if command: + if gpu_id is not None: + parts.append(f"Reproduce: ROCR_VISIBLE_DEVICES={gpu_id} {command}") + else: + parts.append(f"Reproduce: {command}") + + truncated_stdout = truncate(stdout) + if truncated_stdout: + parts.append("stdout:\n" + truncated_stdout) + + truncated_stderr = truncate(stderr) + if truncated_stderr: + parts.append("stderr:\n" + truncated_stderr) + + return '\n'.join(parts) + + # ============================================================================= # Core Tuning Logic # ============================================================================= @@ -1081,9 +1159,7 @@ def verify_perfconfig(perfconfig, config, paths: Paths, options: Options, gpu_id verification_pipeline = " | ".join([ ' '.join(rocmlir_gen_command), ' '.join(rocmlir_driver_command), ' '.join(rocprof_command) ]) - - debug_info = f"[GPU {gpu_id}] Verification pipeline:\n" + verification_pipeline - logger.debug(debug_info) + logger.debug(f"[GPU {gpu_id}] Verifying perfconfig '{perfconfig}'\n{verification_pipeline}") with tempfile.TemporaryDirectory() as tmpdir: p1 = None @@ -1115,22 +1191,23 @@ def verify_perfconfig(perfconfig, config, paths: Paths, options: Options, gpu_id outs, errs = p3.communicate(timeout=600) outs = outs.decode('utf-8') if p3.returncode != 0 or not CORRECT_RESULT_RE.search(outs): - raise TuningError(f"""Verification failed -{debug_info} -stdout: -{outs} -stderr: -{errs.decode('utf-8')}""") + raise TuningError( + format_error(f"Verification failed for perfconfig '{perfconfig}'", + command=verification_pipeline, + stdout=outs, + stderr=errs.decode('utf-8'), + exit_code=p3.returncode, + gpu_id=gpu_id)) except subprocess.TimeoutExpired: kill_process(p3) outs, errs = p3.communicate() - raise TuningError(f"""Verification timed out -{debug_info} -stdout: -{outs.decode('utf-8')} -stderr: -{errs.decode('utf-8')}""") + raise TuningError( + format_error(f"Verification timed out for perfconfig '{perfconfig}'", + command=verification_pipeline, + stdout=outs.decode('utf-8'), + stderr=errs.decode('utf-8'), + gpu_id=gpu_id)) stats_file = os.path.join( tmpdir, @@ -1183,13 +1260,9 @@ def find_best_perfconfig(tuning_output, config, paths: Paths, options: Options, these_tflops = entry['TFlops'] if options.verify_perfconfigs and not np.isnan(nano_seconds): - try: - verify_ns = verify_perfconfig(perfconfig, config, paths, options, gpu_id) - except TuningError as e: - raise TuningError( - f"Error during verification of perf config {perfconfig}\n{str(e)}") + verify_ns = verify_perfconfig(perfconfig, config, paths, options, gpu_id) if np.isnan(verify_ns): - raise TuningError(f"Verification failed for perf config {perfconfig}") + raise TuningError(f"Verification returned NaN 
for perfconfig '{perfconfig}'") if not np.isnan(these_tflops) and these_tflops > max_tflops: max_tflops = these_tflops @@ -1247,11 +1320,17 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - output, _ = tuning_key.communicate() + output, err = tuning_key.communicate() if tuning_key.returncode != 0: return { - 'success': False, - 'error': f"rocmlir-gen failed with return code {tuning_key.returncode}" + 'success': + False, + 'error': + format_error("Failed to generate tuning key", + command=' '.join(rocmlir_gen_command), + stderr=err.decode('utf-8'), + exit_code=tuning_key.returncode, + gpu_id=gpu_id) } result = output.decode('utf-8').strip().split('\t') command_line = result[2].split(sep=' ') @@ -1267,18 +1346,22 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: env=env) tuning_pipeline = ' '.join(tuning_driver_command) - debug_info = f"[GPU {gpu_id}] Tuning '{test_vector}':\n" + tuning_pipeline - logger.debug(debug_info) + logger.debug(f"[GPU {gpu_id}] Tuning '{test_vector}'\n{tuning_pipeline}") # Note: communicate waits for process to terminate which might cause CI timeouts if tuning takes too long tuning_stdout, tuning_stderr = tuning_driver.communicate() if tuning_driver.returncode != 0: - error_msg = f"rocmlir-tuning-driver failed with return code {tuning_driver.returncode}" - stderr_content = tuning_stderr.decode('utf-8').strip() - if stderr_content: - error_msg += f"\nstderr:\n{stderr_content}" - return {'success': False, 'error': error_msg} + return { + 'success': + False, + 'error': + format_error("Tuning failed", + command=tuning_pipeline, + stderr=tuning_stderr.decode('utf-8'), + exit_code=tuning_driver.returncode, + gpu_id=gpu_id) + } tuning_output = tuning_stdout.decode('utf-8').splitlines() winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, @@ -1297,15 +1380,12 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: try: verify_ns = verify_perfconfig(winning_config, config, paths, options, gpu_id) except TuningError as e: - return { - 'success': False, - 'error': f"Error during verification of winning config {winning_config}\n{str(e)}" - } + return {'success': False, 'error': str(e)} if np.isnan(verify_ns): return { 'success': False, - 'error': f"Verification failed for winning config {winning_config}" + 'error': f"Verification returned NaN for winning perfconfig '{winning_config}'" } verify_tflops = config.compute_tflops(verify_ns) @@ -1375,6 +1455,14 @@ def tune_configs(ctx: TuningContext) -> bool: num_workers = min(pool.worker_count, len(ctx.configs)) ctx.print_gpu_summary() + # Prepare ETA tracker with historical data + initial_times = [r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0] + eta_tracker = ETATracker(total_configs=len(pending_configs), + num_workers=num_workers, + initial_times=initial_times, + initial_ok_count=skipped_success, + initial_fail_count=skipped_failed) + def execute_tuning_task(test_vector: str) -> TuningResult: gpu_id = pool.acquire_gpu_for_thread() @@ -1394,24 +1482,14 @@ def execute_tuning_task(test_vector: str) -> TuningResult: verify_tflops=result.get('verify_tflops'), error=result.get('error')) - executor = None - progress_bar = None - - has_errors = False - with OutputFileWriter(ctx.options.output, ctx.options) as results_writer: with DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext( ) as 
debug_writer: - try: # No context manager for executor because we need to shutdown with wait=False - initial_times = [ - r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0 - ] - eta_tracker = ETATracker(total_configs=len(pending_configs), - num_workers=num_workers, - initial_times=initial_times, - initial_ok_count=skipped_success, - initial_fail_count=skipped_failed) + executor = None + progress_bar = None + + try: # No context manager for executor because we need to shutdown with wait=False progress_bar = tqdm( total=len(ctx.configs), initial=total_skipped, @@ -1430,6 +1508,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: for test_vector in pending_configs } + has_errors = False consecutive_failures = 0 for completed_future in as_completed(pending_futures): @@ -1446,10 +1525,10 @@ def execute_tuning_task(test_vector: str) -> TuningResult: consecutive_failures += 1 state_file.set_failed(result.test_vector) - error_text = result.error or "Unknown error" - formatted_error = f"[GPU {result.gpu_id}] Error tuning {result.test_vector}\n" + '\n'.join( - f"\t{line}" for line in error_text.splitlines()) - logger.error(formatted_error) + error_msg = f"[GPU {result.gpu_id}] Tuning failed for '{result.test_vector}'" + if result.error: + error_msg += "\n" + result.error + logger.error(error_msg) if ctx.options.abort_on_error: return False @@ -1732,11 +1811,13 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N metavar='N', help="Maximum CPU threads for compilation (default: auto-detect based on NUMA topology)") - parser.add_argument("--wait-for-compiles", - action='store_true', - default=False, - help="Wait for all compilation tasks to complete before starting tuning. " - "Useful for systems with shared CPU/GPU memory (e.g., APUs).") + parser.add_argument( + "--wait-for-compiles", + action='store_true', + default=False, + help= + "Wait for all compilation tasks to complete before starting tuning. Useful for systems with shared CPU/GPU memory (e.g., APUs)." 
+ ) return parser.parse_args(args) @@ -1770,50 +1851,52 @@ def main(args=None): logger.error("rocMLIR build dir was not provided/found") return 1 - arch = perfRunner.get_arch() - chip = perfRunner.get_chip() - num_cu = perfRunner.get_num_cu(chip) - num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) - - options = Options(arch=arch, - num_cu=num_cu, - num_chiplets=num_chiplets, - debug=parsed_args.debug, - verbose=parsed_args.verbose, - tuning_space_kind=parsed_args.tuning_space, - rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, - verify_mode=parsed_args.verify_mode, - verify_perfconfigs=parsed_args.verify_perf_configs, - tflops=parsed_args.tflops, - output=parsed_args.output, - abort_on_error=parsed_args.abort_on_error, - retune=parsed_args.retune, - retry_failed=parsed_args.retry_failed, - gpu_ids=parsed_args.gpus, - num_cpus=parsed_args.num_cpus, - wait_for_compiles=parsed_args.wait_for_compiles) - if op_type == Operation.FUSION: op_type = extract_fusion_configs(parsed_args.test_dir, paths) conf_class = get_config_class(op_type) configs = load_configs(op_type, parsed_args, paths) - ctx = TuningContext(configs=configs, - conf_class=conf_class, - paths=paths, - options=options, - gpu_topology=gpu_topology, - numa_topology=NumaTopology.discover()) + finally: + if stdin_temp_file: + os.unlink(stdin_temp_file) + + arch = perfRunner.get_arch() + chip = perfRunner.get_chip() + num_cu = perfRunner.get_num_cu(chip) + num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) + + options = Options(arch=arch, + num_cu=num_cu, + num_chiplets=num_chiplets, + debug=parsed_args.debug, + verbose=parsed_args.verbose, + tuning_space_kind=parsed_args.tuning_space, + rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, + verify_mode=parsed_args.verify_mode, + verify_perfconfigs=parsed_args.verify_perf_configs, + tflops=parsed_args.tflops, + output=parsed_args.output, + abort_on_error=parsed_args.abort_on_error, + retune=parsed_args.retune, + retry_failed=parsed_args.retry_failed, + gpu_ids=parsed_args.gpus, + num_cpus=parsed_args.num_cpus, + wait_for_compiles=parsed_args.wait_for_compiles) + + ctx = TuningContext(configs=configs, + conf_class=conf_class, + paths=paths, + options=options, + gpu_topology=gpu_topology, + numa_topology=NumaTopology.discover()) + try: tuning_succeeded = tune_configs(ctx) - return 0 if tuning_succeeded else 1 - except KeyboardInterrupt: return 130 # 128 + SIGINT - finally: - if stdin_temp_file: - os.unlink(stdin_temp_file) + + return 0 if tuning_succeeded else 1 if __name__ == '__main__': From a2c338e4e3c8a4c167ca12f128ab711d8f7cde0d Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 12 Jan 2026 02:56:03 +0000 Subject: [PATCH 08/23] Log warnings from tuning driver. 
--- mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp | 1 - mlir/utils/jenkins/Jenkinsfile | 4 ++-- mlir/utils/jenkins/Jenkinsfile.downstream | 4 ++-- mlir/utils/performance/tuningRunner.py | 7 ++++++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp index 2a735302a3ea..78513dc5b16f 100644 --- a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp +++ b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp @@ -985,7 +985,6 @@ createTunableParamSpace(ModuleOp mod, TuningParamSetKind kind, // greedy is not implemented for non-accel if (!archInfo.isAccel(op) && kind == TuningParamSetKind::Greedy) { kind = TuningParamSetKind::Exhaustive; - // TODO: tuningRunner hides this warning llvm::errs() << "Greedy tuning not implemented for non-accel, using " "Exhaustive instead\n"; } diff --git a/mlir/utils/jenkins/Jenkinsfile b/mlir/utils/jenkins/Jenkinsfile index ffaf6e6baf1d..413bc0be1593 100644 --- a/mlir/utils/jenkins/Jenkinsfile +++ b/mlir/utils/jenkins/Jenkinsfile @@ -1179,10 +1179,10 @@ PY stage("Tune Fusion") { dir('build') { // Tune resnet50 - sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error --op fusion --test-dir ../mlir/test/fusion/resnet50-e2e/ -o tuning_fusion_${CHIP}.tsv""" + sh """python3 ./bin/tuningRunner.py --abort-on-error --op fusion --test-dir ../mlir/test/fusion/resnet50-e2e/ -o tuning_fusion_${CHIP}.tsv""" // Tune bert - sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error --op fusion --test-dir ../mlir/test/xmir/bert-torch-tosa-e2e/ -o tuning_fusion_${CHIP}.tsv""" + sh """python3 ./bin/tuningRunner.py --abort-on-error --op fusion --test-dir ../mlir/test/xmir/bert-torch-tosa-e2e/ -o tuning_fusion_${CHIP}.tsv""" } sh 'rm -f build/CMakeCache.txt' } diff --git a/mlir/utils/jenkins/Jenkinsfile.downstream b/mlir/utils/jenkins/Jenkinsfile.downstream index dd400380e7e9..b2f3d1e6dea6 100644 --- a/mlir/utils/jenkins/Jenkinsfile.downstream +++ b/mlir/utils/jenkins/Jenkinsfile.downstream @@ -150,12 +150,12 @@ pipeline { dir('build') { timeout(time: 60, activity: true, unit: 'MINUTES') { // Tune gemms, fail if the DB is not created - sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error \ + sh """python3 ./bin/tuningRunner.py --abort-on-error \ --operation gemm \ --configs-file=../mlir/utils/jenkins/ci-configs/selected-gemm-configs \ --output=tuning_gemm.tsv [ -f tuning_gemm.tsv ]""" - sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error \ + sh """python3 ./bin/tuningRunner.py --abort-on-error \ --operation conv \ --configs-file=../mlir/utils/jenkins/ci-configs/selected-conv-configs \ --output=tuning_conv.tsv diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 644243bedf50..5dc14cca61ab 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -1356,12 +1356,17 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: 'success': False, 'error': - format_error("Tuning failed", + format_error("Tuning pipeline failed", command=tuning_pipeline, stderr=tuning_stderr.decode('utf-8'), exit_code=tuning_driver.returncode, gpu_id=gpu_id) } + else: + # Log any stderr output from tuning driver because it may contain warnings + tuning_stderr_str = tuning_stderr.decode('utf-8').strip() + if tuning_stderr_str: + logger.debug(f"[GPU {gpu_id}] rocmlir-tuning-driver stderr:\n{tuning_stderr_str}") tuning_output = tuning_stdout.decode('utf-8').splitlines() 
winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, From 58bada89e3b9ee596a27641fa0814082fb1925e5 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 17 Jan 2026 00:39:15 +0000 Subject: [PATCH 09/23] Reintroduce --quiet flag. --- mlir/utils/performance/tuningRunner.py | 27 +++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 5dc14cca61ab..eb71dec76fc2 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -112,10 +112,16 @@ def emit(self, record): self.handleError(record) -def setup_logger(verbose: bool = False) -> logging.Logger: +def setup_logger(quiet: bool = False, verbose: bool = False) -> logging.Logger: """Configure and return a logger for tuningRunner.""" log = logging.getLogger("tuningRunner") - log.setLevel(logging.DEBUG if verbose else logging.INFO) + + if quiet: + log.setLevel(logging.ERROR) + elif verbose: + log.setLevel(logging.DEBUG) + else: + log.setLevel(logging.INFO) log.handlers.clear() @@ -141,6 +147,7 @@ class Options: """Configuration options for the tuning process.""" debug: bool tuning_space_kind: str + quiet: bool verbose: bool arch: str num_cu: int @@ -428,7 +435,7 @@ def load(self, expected_context: TuningStateContext) -> 'TuningStateFile': tuning_space=data.get('tuningSpace', '')) if not file_context.matches(expected_context): - logger.info("State file context mismatch, starting fresh") + logger.warning("State file context mismatch, starting fresh") self._state = TuningState(context=expected_context) return self @@ -1498,7 +1505,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: progress_bar = tqdm( total=len(ctx.configs), initial=total_skipped, - disable=not sys.stderr.isatty(), + disable=ctx.options.quiet or not sys.stderr.isatty(), file=sys.stderr, desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", unit="config", @@ -1558,7 +1565,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: state_file.finalize_interrupted() if has_errors: - logger.warning("Encountered errors during tuning") + logger.error("Encountered errors during tuning") else: logger.info("Tuning completed successfully") @@ -1735,6 +1742,12 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N choices=["quick", "full", "greedy", "exhaustive"], help="Tuning space kind to use") + parser.add_argument("-q", + "--quiet", + action='store_true', + default=False, + help="Suppress non-error output") + parser.add_argument("-v", "--verbose", action='store_true', @@ -1839,8 +1852,7 @@ def main(args=None): parsed_args = parse_arguments(gpu_topology, available_gpus, args) - if parsed_args.verbose: - logger = setup_logger(verbose=parsed_args.verbose) + logger = setup_logger(quiet=parsed_args.quiet, verbose=parsed_args.verbose) stdin_temp_file = None try: @@ -1875,6 +1887,7 @@ def main(args=None): num_cu=num_cu, num_chiplets=num_chiplets, debug=parsed_args.debug, + quiet=parsed_args.quiet, verbose=parsed_args.verbose, tuning_space_kind=parsed_args.tuning_space, rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, From 1cf2e36c0d13100a5dffe4d5aac47933f56a02ae Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sun, 18 Jan 2026 00:58:36 +0000 Subject: [PATCH 10/23] Let important exceptions propagate and clean up code. 
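Stop hiding unexpected failures behind broad try/except blocks and defensive
fallbacks: GPU and NUMA discovery now raise instead of silently defaulting,
the output-file cache no longer catches arbitrary exceptions, the topology and
result dataclasses become frozen, and tune_config returns a TuningResult
directly. A condensed illustration of the new failure mode for GPU discovery
(parse_cards is a hypothetical stand-in for the card-parsing loop in this patch):

    output = subprocess.check_output(
        ["rocm-smi", "--showproductname", "--showtoponuma", "--json"],
        text=True, timeout=10)
    gpus = parse_cards(json.loads(output))  # hypothetical helper, see the loop below
    if not gpus:
        raise RuntimeError("rocm-smi returned no GPU cards")
    # CalledProcessError, TimeoutExpired, and JSONDecodeError now propagate to the caller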
--- mlir/utils/performance/tuningRunner.py | 677 ++++++++++++------------- 1 file changed, 326 insertions(+), 351 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index eb71dec76fc2..eeec94be3d58 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -114,28 +114,21 @@ def emit(self, record): def setup_logger(quiet: bool = False, verbose: bool = False) -> logging.Logger: """Configure and return a logger for tuningRunner.""" - log = logging.getLogger("tuningRunner") + assert not (quiet and verbose), "quiet and verbose are mutually exclusive" if quiet: - log.setLevel(logging.ERROR) + logger.setLevel(logging.ERROR) elif verbose: - log.setLevel(logging.DEBUG) + logger.setLevel(logging.DEBUG) else: - log.setLevel(logging.INFO) + logger.setLevel(logging.INFO) - log.handlers.clear() - - use_color = sys.stderr.isatty() - handler = TqdmLoggingHandler(use_color=use_color) - handler.setLevel(logging.DEBUG if verbose else logging.INFO) - - log.addHandler(handler) - - return log + logger.handlers.clear() + logger.addHandler(TqdmLoggingHandler(use_color=sys.stderr.isatty())) # Module-level logger -logger: logging.Logger = setup_logger() +logger: logging.Logger = logging.getLogger("tuningRunner") # ============================================================================= # Configuration & Results @@ -170,8 +163,8 @@ class TuningResult: """Result of tuning a single configuration.""" test_vector: str success: bool - gpu_id: int - elapsed_seconds: float + gpu_id: int = -1 + elapsed_seconds: float = 0.0 winning_config: Optional[str] = None max_tflops: Optional[float] = None entries: List[Dict] = field(default_factory=list) @@ -194,7 +187,7 @@ class TuningError(Exception): # ============================================================================= -@dataclass +@dataclass(frozen=True) class Gpu: """Information about a GPU.""" gpu_id: int @@ -202,22 +195,21 @@ class Gpu: numa_node: int -@dataclass +@dataclass(frozen=True) class GpuTopology: """System GPU topology with NUMA mappings.""" gpus: Dict[int, Gpu] # GPU ID -> Gpu def get_numa_node(self, gpu_id: int) -> int: - """Get NUMA node for a GPU, defaults to 0 if unknown.""" - if gpu_id in self.gpus: - return self.gpus[gpu_id].numa_node - return 0 + """Get NUMA node for a GPU.""" + return self.gpus[gpu_id].numa_node def validate_homogeneity(self, gpu_ids: List[int]) -> bool: """Validate that all selected GPUs are of the same model.""" if len(gpu_ids) <= 1: return True - skus = {self.gpus[gpu_id].sku for gpu_id in gpu_ids if gpu_id in self.gpus} + + skus = {self.gpus[gpu_id].sku for gpu_id in gpu_ids} return len(skus) == 1 @staticmethod @@ -226,46 +218,36 @@ def discover() -> 'GpuTopology': rocm-smi reports physical device IDs regardless of environment variables (e.g., ROCR_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES). 
""" - try: - output = subprocess.check_output( - ["rocm-smi", "--showproductname", "--showtoponuma", "--json"], - text=True, - timeout=10) - data = json.loads(output) - gpus = {} - for key, value in data.items(): - if key.startswith("card"): - gpu_id = int(key.replace("card", "")) - sku = value.get("Card SKU", "unknown") - numa_node_str = value.get("(Topology) Numa Node") - numa_node = int(numa_node_str) if numa_node_str is not None else 0 - gpus[gpu_id] = Gpu(gpu_id=gpu_id, sku=sku, numa_node=numa_node) - if gpus: - return GpuTopology(gpus=gpus) - logger.warning("rocm-smi returned no GPU cards") - except subprocess.CalledProcessError as e: - logger.warning(f"rocm-smi failed with return code {e.returncode}") - except subprocess.TimeoutExpired: - logger.warning("rocm-smi timed out") - except FileNotFoundError: - logger.warning("rocm-smi not found in PATH") - except json.JSONDecodeError as e: - logger.warning(f"Failed to parse rocm-smi JSON output: {e}") - except (ValueError, KeyError) as e: - logger.warning(f"Failed to extract GPU info from rocm-smi output: {e}") - - logger.warning("Could not detect GPUs, defaulting to GPU 0") - return GpuTopology(gpus={0: Gpu(gpu_id=0, sku="unknown", numa_node=0)}) + output = subprocess.check_output( + ["rocm-smi", "--showproductname", "--showtoponuma", "--json"], text=True, timeout=10) + data = json.loads(output) + gpus = {} + for key, value in data.items(): + if key.startswith("card"): + gpu_id = int(key.replace("card", "")) -@dataclass + sku = value["Card SKU"] + + numa_node_str = value.get("(Topology) Numa Node") + numa_node = int(numa_node_str) if numa_node_str is not None else 0 + + gpus[gpu_id] = Gpu(gpu_id=gpu_id, sku=sku, numa_node=numa_node) + + if not gpus: + raise RuntimeError("rocm-smi returned no GPU cards") + + return GpuTopology(gpus=gpus) + + +@dataclass(frozen=True) class NumaTopology: """System NUMA topology with CPU mappings.""" numa_to_cpus: Dict[int, List[int]] # NUMA node -> list of CPU IDs def get_cpus_for_numa_node(self, numa_node: int) -> List[int]: """Get CPUs belonging to a NUMA node.""" - return self.numa_to_cpus.get(numa_node, []) + return self.numa_to_cpus[numa_node] @staticmethod def discover() -> 'NumaTopology': @@ -281,11 +263,8 @@ def discover() -> 'NumaTopology': if entry.startswith("node") and entry[4:].isdigit(): node_id = int(entry[4:]) cpulist_path = os.path.join(numa_base, entry, "cpulist") - if os.path.exists(cpulist_path): - with open(cpulist_path, 'r') as f: - numa_to_cpus[node_id] = NumaTopology._parse_cpu_list(f.read()) - else: - logger.warning(f"Missing cpulist for NUMA node {node_id}") + with open(cpulist_path, 'r') as f: + numa_to_cpus[node_id] = NumaTopology._parse_cpu_list(f.read()) # Fallback: single node with all CPUs if not numa_to_cpus: @@ -332,7 +311,7 @@ class ConfigState(Enum): CRASHED = "crashed" # Process crashed while tuning (detected on startup) -@dataclass +@dataclass(frozen=True) class TuningStateContext: """Context that identifies a tuning run. 
State is invalidated if context changes.""" arch: str @@ -546,7 +525,7 @@ def get_state_filepath(output_filepath: str) -> Optional[str]: # ============================================================================= -@dataclass +@dataclass(frozen=True) class TunedConfigsCache: """Cache for previously tuned configurations loaded from output file.""" _results: Dict[str, TuningResult] = field(default_factory=dict) @@ -571,13 +550,13 @@ def count(self) -> int: def from_output_file(cls, options: Options) -> 'TunedConfigsCache': """Load previously tuned configurations from an output TSV file. - Format: # arch\tnumCUs\ttestVector\tperfConfig (tuning_space)\t[TFlops]\t[elapsedSeconds] - Only loads entries matching current arch, numCUs, and tuning space. + Format: # arch\tnumCUs\tnumChiplets\ttestVector\tperfConfig (tuning_space)\t[TFlops]\telapsedSeconds + Only loads entries matching current arch and tuning space. """ - cache = cls() - if options.output == '-' or not os.path.exists(options.output): - return cache + return cls() + + results: Dict[str, TuningResult] = {} current_commit = get_git_commit_hash() @@ -586,59 +565,50 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': matching_section = False column_indices: Dict[str, int] = {} - try: - with open(options.output, mode='r') as f: - for line in f: - line = line.strip() - if not line: - continue - - # Check for metadata line - if line.startswith('## '): - parts = line[3:].split(':', 1) - if len(parts) == 2: - key = parts[0].strip() - value = parts[1].strip() - metadata[key] = value - continue - - # Check for header line - if cls._is_header_line(line): - # Determine if this section matches based on tuning space - matching_section = f'({options.tuning_space_kind})' in line - - if matching_section: - column_indices = cls._parse_header_line(line) - - # Warn if commit hashes differ - file_commit = metadata.get('commit', 'unknown') - if file_commit != current_commit: - logger.warning( - f"Loading tuned configs from different commit (file: {file_commit[:8]}, current: {current_commit[:8]})" - ) - - # Reset metadata for next section - metadata = {} - continue - - # Skip other comment lines - if line.startswith('#'): - continue - - # Skip data lines from non-matching sections - if not matching_section or not column_indices: - continue - - # Parse data line - result = cls._parse_data_line(line.split('\t'), column_indices, options.arch, - options.num_cu) - if result: - cache._results[result.test_vector] = result - - except Exception as e: - logger.warning(f"Failed to load existing tuning results from {options.output}: {e}") - - return cache + with open(options.output, mode='r') as f: + for line in f: + line = line.strip() + if not line: + continue + + # Check for metadata line + if line.startswith('## '): + parts = line[3:].split(':') + if len(parts) == 2: + metadata[parts[0].strip()] = parts[1].strip() + continue + + # Check for header line + if cls._is_header_line(line): + # Determine if this section matches based on tuning space + matching_section = f'({options.tuning_space_kind})' in line + if matching_section: + column_indices = cls._parse_header_line(line) + # Warn if commit hashes differ + file_commit = metadata.get('commit', 'unknown') + if file_commit != current_commit: + logger.warning( + f"Loading tuned configs from different commit (file: {file_commit[:8]}, current: {current_commit[:8]})" + ) + + # Reset metadata for next section + metadata = {} + continue + + # Skip other comment lines + if line.startswith('#'): + 
continue + + # Skip data lines from non-matching sections + if not matching_section or not column_indices: + continue + + # Parse data line + result = cls._parse_data_line(line.split('\t'), column_indices, options.arch) + if result: + results[result.test_vector] = result + + return cls(_results=results) @staticmethod def _is_header_line(line: str) -> bool: @@ -650,24 +620,26 @@ def _parse_header_line(line: str) -> Dict[str, int]: """Parse column header and return name -> index mapping.""" # Strip leading '# ' if present header_text = line[2:] if line.startswith('# ') else line + indices = {} for i, col in enumerate(header_text.split('\t')): - if col: - # Exctract base column name (handles 'perfConfig (tuning_space)') - col_name = col.split()[0] - indices[col_name] = i + if not col: + continue + # Exctract base column name (handles 'perfConfig (tuning_space)') + col_name = col.split()[0] + indices[col_name] = i + return indices @staticmethod - def _parse_data_line(fields: List[str], column_indices: Dict[str, int], arch: str, - num_cu: int) -> Optional[TuningResult]: + def _parse_data_line(fields: List[str], column_indices: Dict[str, int], + arch: str) -> Optional[TuningResult]: """Parse a data line and return TuningResult if valid. A line is valid if: - - arch and numCUs match current system (if columns exist, for old format) + - arch matches current system - testVector is present - perfConfig is present and not 'None' - - TFlops is a valid finite number (if column exists) """ def get_field(name: str) -> Optional[str]: @@ -678,8 +650,6 @@ def get_field(name: str) -> Optional[str]: if get_field('arch') != arch: return None - if get_field('numCUs') != str(num_cu): - return None test_vector = get_field('testVector') if not test_vector: @@ -690,17 +660,14 @@ def get_field(name: str) -> Optional[str]: return None max_tflops = None - if 'TFlops' in column_indices: - tflops_str = get_field('TFlops') - if not tflops_str: - return None + tflops_str = get_field('TFlops') + if tflops_str: try: tflops_val = float(tflops_str) - if np.isnan(tflops_val) or np.isinf(tflops_val): - return None - max_tflops = tflops_val + if np.isfinite(tflops_val): + max_tflops = tflops_val except ValueError: - return None + pass elapsed_seconds = 0.0 elapsed_str = get_field('elapsedSeconds') @@ -723,26 +690,18 @@ class ETATracker: """Track completion times for accurate ETA estimation using median of successful configs.""" total_configs: int num_workers: int - initial_times: List[float] = field(default_factory=list) - initial_ok_count: int = 0 - initial_fail_count: int = 0 - _success_times: List[float] = field(default_factory=list, init=False) + success_times: List[float] = field(default_factory=list) + ok_count: int = 0 + fail_count: int = 0 _processed: int = field(default=0, init=False) - _ok_count: int = field(default=0, init=False) - _fail_count: int = field(default=0, init=False) - - def __post_init__(self): - self._success_times = list(self.initial_times) - self._ok_count = self.initial_ok_count - self._fail_count = self.initial_fail_count def record(self, result: TuningResult) -> None: self._processed += 1 if result.success: - self._ok_count += 1 - self._success_times.append(result.elapsed_seconds) + self.ok_count += 1 + self.success_times.append(result.elapsed_seconds) else: - self._fail_count += 1 + self.fail_count += 1 def _format_rate(self, seconds: float) -> str: if seconds < 60: @@ -771,13 +730,13 @@ def get_postfix_str(self) -> str: rate = "n/a" eta = "n/a" - if len(self._success_times) >= 3: - median = 
statistics.median(self._success_times) + if len(self.success_times) >= 3: + median = statistics.median(self.success_times) eta_seconds = (remaining / self.num_workers) * median rate = self._format_rate(median) eta = self._format_eta(eta_seconds) - return f"ok={self._ok_count}, fail={self._fail_count}, rate={rate}, eta={eta}" + return f"ok={self.ok_count}, fail={self.fail_count}, rate={rate}, eta={eta}" @dataclass @@ -828,7 +787,7 @@ def _compute_thread_allocation(self) -> Dict[int, int]: def get_compile_threads(self, gpu_id: int) -> int: """Get the number of compile threads allocated to a GPU.""" - return self._threads_per_gpu.get(gpu_id, 1) + return self._threads_per_gpu[gpu_id] def print_gpu_summary(self): """Print summary of GPU allocation.""" @@ -836,7 +795,7 @@ def print_gpu_summary(self): lines = [f"Using {num_active} GPU(s)"] for gpu_id in self.options.gpu_ids[:num_active]: node = self.gpu_topology.get_numa_node(gpu_id) - threads = self._threads_per_gpu.get(gpu_id, 1) + threads = self._threads_per_gpu[gpu_id] lines.append(f"GPU {gpu_id}: NUMA node {node}, {threads} compile threads") logger.info("\n".join(lines)) @@ -876,11 +835,7 @@ def _apply_numa_affinity(self, gpu_id: int) -> None: node = self._ctx.gpu_topology.get_numa_node(gpu_id) cpu_list = self._ctx.numa_topology.get_cpus_for_numa_node(node) - if cpu_list: - try: - os.sched_setaffinity(0, set(cpu_list)) - except OSError: - logger.warning(f"Could not set CPU affinity for GPU {gpu_id}") + os.sched_setaffinity(0, set(cpu_list)) self._set_memory_policy(node) @@ -902,7 +857,7 @@ def _set_memory_policy(self, numa_node: int) -> None: ctypes.byref(ctypes.c_ulong(nodemask)), maxnode=64) except (OSError, AttributeError): - pass # libnuma not available, rely on first-touch policy + logger.debug("libnuma not available, skipping memory policy setup") # ============================================================================= @@ -957,7 +912,7 @@ def _write_header(self): self._header_written = True def write_result(self, result: TuningResult): - assert result.success and result.winning_config and result.max_tflops, "write_result called with failed result" + assert result.success and result.winning_config and result.max_tflops, "write_result called with invalid result" self._write_header() @@ -991,15 +946,13 @@ def __exit__(self, exc_type, exc_value, traceback): if self.file: self.file.close() - def write_entries(self, entries: List[Dict]): - if not entries: - return + def write_result(self, result: TuningResult): + assert result.success and result.entries, "write_result called with invalid result" - pd.DataFrame(entries).to_csv(self.file, - sep='\t', - mode='a', - header=not self._header_written, - index=False) + pd.DataFrame(result.entries).to_csv(self.file, + sep='\t', + header=not self._header_written, + index=False) self.file.flush() self._header_written = True @@ -1013,7 +966,7 @@ def write_entries(self, entries: List[Dict]): class TuningArgumentParser(argparse.ArgumentParser): """ArgumentParser with custom validation for tuning arguments.""" - def __init__(self, *args, gpu_topology: GpuTopology = None, **kwargs): + def __init__(self, *args, gpu_topology: Optional[GpuTopology] = None, **kwargs): super().__init__(*args, **kwargs) self._gpu_topology = gpu_topology @@ -1044,10 +997,10 @@ def __call__(self, parser, namespace, values, option_string=None): def get_git_commit_hash() -> str: """Get the current git commit hash.""" try: - commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD'], - 
stderr=subprocess.DEVNULL).decode().strip() - return commit_hash - except Exception: + return subprocess.check_output(['git', 'rev-parse', 'HEAD'], + stderr=subprocess.DEVNULL).decode().strip() + except (subprocess.CalledProcessError, FileNotFoundError, OSError) as e: + logger.debug(f"Failed to get git commit hash: {e}") return "unknown" @@ -1075,10 +1028,10 @@ def verify_mode_flags(verify_mode: str) -> str: return "-pv" if verify_mode == "gpu": return "-pv_with_gpu --verifier-keep-perf-config=false" - raise ValueError("Unknown verification mode", verify_mode) + raise ValueError(f"Unknown verification mode: {verify_mode}") -def kill_process(proc) -> None: +def kill_process(proc: Optional[subprocess.Popen]) -> None: """Terminate a subprocess and wait for cleanup.""" if proc is None: return @@ -1092,11 +1045,11 @@ def kill_process(proc) -> None: def format_error(context: str, - command: str = None, - stdout: str = None, - stderr: str = None, - exit_code: int = None, - gpu_id: int = None, + command: Optional[str] = None, + stdout: Optional[str] = None, + stderr: Optional[str] = None, + exit_code: Optional[int] = None, + gpu_id: Optional[int] = None, max_lines: int = 10) -> str: """Format an error message with optional details.""" @@ -1123,11 +1076,11 @@ def truncate(text: str) -> str: truncated_stdout = truncate(stdout) if truncated_stdout: - parts.append("stdout:\n" + truncated_stdout) + parts.append("STDOUT:\n" + truncated_stdout) truncated_stderr = truncate(stderr) if truncated_stderr: - parts.append("stderr:\n" + truncated_stderr) + parts.append("STDERR:\n" + truncated_stderr) return '\n'.join(parts) @@ -1137,10 +1090,11 @@ def truncate(text: str) -> str: # ============================================================================= -def verify_perfconfig(perfconfig, config, paths: Paths, options: Options, gpu_id: int) -> float: +def verify_perfconfig(perfconfig: str, config: PerfConfiguration, paths: Paths, options: Options, + gpu_id: int) -> float: """Verify a performance config by running with profiling. - Returns the execution time in nanoseconds, or NaN if verification fails. + Returns the execution time in nanoseconds, or raises TuningError on failure. """ config.set_perfconfig(perfconfig) @@ -1230,57 +1184,62 @@ def verify_perfconfig(perfconfig, config, paths: Paths, options: Options, gpu_id return nano_seconds -def find_best_perfconfig(tuning_output, config, paths: Paths, options: Options, - gpu_id: int) -> tuple[str, float, List[Dict]]: +def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, paths: Paths, + options: Options, + gpu_id: int) -> tuple[Optional[str], Optional[float], List[Dict]]: """Parse tuning driver output and find the best performing perfconfig. Returns the winning config, its TFLOPS, and all entries. 
""" - max_tflops = -np.inf - winning_config = "None" + max_tflops: Optional[float] = None + winning_config: Optional[str] = None entries = [] for line in tuning_output: result = line.strip() if not result: continue + + parts = result.split('\t') + if len(parts) < 2: + logger.debug(f"Skipping malformed tuning output line: '{result}'") + continue + + perfconfig = parts[0] + time = parts[-1] try: - parts = result.split('\t') - if len(parts) < 2: - continue # Skip silently - can happen during normal shutdown - perfconfig = parts[0] - time = parts[-1] if time == "N/A": nano_seconds = np.nan measurements = None else: nano_seconds = float(time) measurements = json.loads(parts[1]) if len(parts) == 3 else None - except ValueError: - continue # Skip silently - can happen during normal shutdown + except (ValueError, json.JSONDecodeError): + logger.debug(f"Skipping malformed tuning output line: '{result}'") + continue config.set_perfconfig(perfconfig) entry = config.table_entry(nano_seconds) if options.debug: entry["Measurements"] = measurements entries.append(entry) - these_tflops = entry['TFlops'] if options.verify_perfconfigs and not np.isnan(nano_seconds): verify_ns = verify_perfconfig(perfconfig, config, paths, options, gpu_id) if np.isnan(verify_ns): raise TuningError(f"Verification returned NaN for perfconfig '{perfconfig}'") - if not np.isnan(these_tflops) and these_tflops > max_tflops: + these_tflops = entry['TFlops'] + if not np.isnan(these_tflops) and (max_tflops is None or these_tflops > max_tflops): max_tflops = these_tflops winning_config = perfconfig return winning_config, max_tflops, entries -def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: int, - num_compile_threads: int) -> Dict[str, Any]: - """Tune a single configuration and return the results.""" +def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Options, gpu_id: int, + num_compile_threads: int) -> TuningResult: + """Tune a single configuration and return the result.""" tuning_driver_args = [ f"--tuning-space={options.tuning_space_kind}", f"--num-iterations={MLIR_N_REPEATS}", f"--warmup-iterations={WARMUP_ITERATIONS}", "--use-median", f"--sleep-us={SLEEP_US}", @@ -1295,14 +1254,17 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: try: rocmlir_gen_command = [paths.mlir_paths.rocmlir_gen_path] tuning_driver_command = [paths.mlir_paths.rocmlir_tuning_driver_path] + tuning_driver_args + if not test_vector.endswith(".mlir"): command_line = test_vector.split(sep=' ') try: config = conf_class.from_command_line(command_line, options.arch, options.num_cu, options.num_chiplets) except ValueError as e: - return {'success': False, 'error': str(e)} - test_vector = config.to_command_line() + return TuningResult(test_vector=test_vector, + success=False, + gpu_id=gpu_id, + error=str(e)) command_line_options = config.generate_mlir_driver_commandline( options.rocmlir_gen_flags, kernel_repeats=None) # Note, we don't need the -ph, this goes to the tuning driver. 
@@ -1329,23 +1291,25 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: env=env) output, err = tuning_key.communicate() if tuning_key.returncode != 0: - return { - 'success': - False, - 'error': - format_error("Failed to generate tuning key", + error = format_error("Failed to generate tuning key", command=' '.join(rocmlir_gen_command), stderr=err.decode('utf-8'), exit_code=tuning_key.returncode, gpu_id=gpu_id) - } + return TuningResult(test_vector=test_vector, + success=False, + gpu_id=gpu_id, + error=error) result = output.decode('utf-8').strip().split('\t') command_line = result[2].split(sep=' ') try: config = conf_class.from_command_line(command_line, options.arch, options.num_cu, options.num_chiplets) except ValueError as e: - return {'success': False, 'error': str(e)} + return TuningResult(test_vector=test_vector, + success=False, + gpu_id=gpu_id, + error=str(e)) tuning_driver_command += [test_vector] tuning_driver = subprocess.Popen(tuning_driver_command, stdout=subprocess.PIPE, @@ -1359,16 +1323,12 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: tuning_stdout, tuning_stderr = tuning_driver.communicate() if tuning_driver.returncode != 0: - return { - 'success': - False, - 'error': - format_error("Tuning pipeline failed", + error = format_error("Tuning pipeline failed", command=tuning_pipeline, stderr=tuning_stderr.decode('utf-8'), exit_code=tuning_driver.returncode, gpu_id=gpu_id) - } + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=error) else: # Log any stderr output from tuning driver because it may contain warnings tuning_stderr_str = tuning_stderr.decode('utf-8').strip() @@ -1379,43 +1339,48 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, options, gpu_id) except TuningError as e: - return {'success': False, 'error': str(e)} + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=str(e)) finally: kill_process(rocmlir_gen) kill_process(tuning_driver) - if winning_config == "None": - return {'success': False, 'error': "No valid perf config found"} + if winning_config is None: + return TuningResult(test_vector=test_vector, + success=False, + gpu_id=gpu_id, + error="No valid perf config found") verify_tflops = None if options.verify_mode != "none": try: verify_ns = verify_perfconfig(winning_config, config, paths, options, gpu_id) except TuningError as e: - return {'success': False, 'error': str(e)} + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=str(e)) if np.isnan(verify_ns): - return { - 'success': False, - 'error': f"Verification returned NaN for winning perfconfig '{winning_config}'" - } + return TuningResult( + test_vector=test_vector, + success=False, + gpu_id=gpu_id, + error=f"Verification returned NaN for winning perfconfig '{winning_config}'") verify_tflops = config.compute_tflops(verify_ns) - return { - 'success': True, - 'winning_config': winning_config, - 'max_tflops': max_tflops, - 'entries': entries, - 'verify_tflops': verify_tflops - } + return TuningResult(test_vector=test_vector, + success=True, + gpu_id=gpu_id, + winning_config=winning_config, + max_tflops=max_tflops, + entries=entries, + verify_tflops=verify_tflops) def tune_configs(ctx: TuningContext) -> bool: """Tune multiple configurations in parallel across available GPUs.""" # Load cached results unless retuning is forced - 
cache = TunedConfigsCache() - if not ctx.options.retune: + if ctx.options.retune: + cache = TunedConfigsCache() + else: cache = TunedConfigsCache.from_output_file(ctx.options) if cache.count() > 0: logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") @@ -1430,11 +1395,11 @@ def tune_configs(ctx: TuningContext) -> bool: crashed_count = state.crashed_count() if crashed_count > 0: - logger.warning(f"Detected {crashed_count} crashed config(s) from previous run") + logger.warning(f"Found {crashed_count} crashed config(s) in state file") failed_count = state.failed_count() if failed_count > 0: - logger.info(f"Found {failed_count} failed config(s) in state file") + logger.warning(f"Found {failed_count} failed config(s) in state file") state_file.save() @@ -1457,23 +1422,24 @@ def tune_configs(ctx: TuningContext) -> bool: if skipped_success > 0: logger.info(f"Skipping {skipped_success} already tuned config(s)") if skipped_failed > 0: - logger.info(f"Skipping {skipped_failed} failed/crashed config(s)") + logger.info( + f"Skipping {skipped_failed} failed/crashed config(s) - use '--retry-failed' to retune") if not pending_configs: logger.info("No configurations to tune") return True pool = GpuWorkerPool(ctx) - num_workers = min(pool.worker_count, len(ctx.configs)) + num_workers = min(pool.worker_count, len(pending_configs)) ctx.print_gpu_summary() # Prepare ETA tracker with historical data initial_times = [r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0] eta_tracker = ETATracker(total_configs=len(pending_configs), num_workers=num_workers, - initial_times=initial_times, - initial_ok_count=skipped_success, - initial_fail_count=skipped_failed) + success_times=initial_times, + ok_count=skipped_success, + fail_count=skipped_failed) def execute_tuning_task(test_vector: str) -> TuningResult: gpu_id = pool.acquire_gpu_for_thread() @@ -1484,85 +1450,78 @@ def execute_tuning_task(test_vector: str) -> TuningResult: compile_threads = ctx.get_compile_threads(gpu_id) result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, compile_threads) - return TuningResult(test_vector=test_vector, - success=result.get('success', False), - gpu_id=gpu_id, - elapsed_seconds=time.time() - start_time, - winning_config=result.get('winning_config'), - max_tflops=result.get('max_tflops'), - entries=result.get('entries', []), - verify_tflops=result.get('verify_tflops'), - error=result.get('error')) - - with OutputFileWriter(ctx.options.output, ctx.options) as results_writer: - with DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext( - ) as debug_writer: - - executor = None - progress_bar = None - - try: # No context manager for executor because we need to shutdown with wait=False - progress_bar = tqdm( - total=len(ctx.configs), - initial=total_skipped, - disable=ctx.options.quiet or not sys.stderr.isatty(), - file=sys.stderr, - desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", - unit="config", - leave=False, - bar_format= - '{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [t={elapsed}{postfix}]') + result.elapsed_seconds = time.time() - start_time + + return result + + has_errors = False + consecutive_failures = 0 + + with (OutputFileWriter(ctx.options.output, ctx.options) as results_writer, + DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext() as + debug_writer): + executor = None + progress_bar = None + + try: # No context manager for executor because we 
need to shutdown with wait=False + progress_bar = tqdm( + total=len(ctx.configs), + initial=total_skipped, + disable=ctx.options.quiet or not sys.stderr.isatty(), + file=sys.stderr, + desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", + unit="config", + leave=True, + bar_format= + '{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [t={elapsed}{postfix}]') + progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) + + executor = ThreadPoolExecutor(max_workers=num_workers) + pending_futures = { + executor.submit(execute_tuning_task, test_vector): test_vector + for test_vector in pending_configs + } + + for completed_future in as_completed(pending_futures): + result = completed_future.result() + + if result.success: + consecutive_failures = 0 + results_writer.write_result(result) + if debug_writer: + debug_writer.write_result(result) + state_file.set_success(result.test_vector) + else: + has_errors = True + consecutive_failures += 1 + state_file.set_failed(result.test_vector) + + error_msg = f"[GPU {result.gpu_id}] Tuning failed for '{result.test_vector}'" + if result.error: + error_msg += "\n" + result.error + logger.error(error_msg) + + eta_tracker.record(result) + progress_bar.update(1) progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) - executor = ThreadPoolExecutor(max_workers=num_workers) - pending_futures = { - executor.submit(execute_tuning_task, test_vector): test_vector - for test_vector in pending_configs - } - - has_errors = False - consecutive_failures = 0 - - for completed_future in as_completed(pending_futures): - result = completed_future.result() - - if result.success: - consecutive_failures = 0 - results_writer.write_result(result) - if debug_writer: - debug_writer.write_entries(result.entries) - state_file.set_success(result.test_vector) - else: - has_errors = True - consecutive_failures += 1 - state_file.set_failed(result.test_vector) - - error_msg = f"[GPU {result.gpu_id}] Tuning failed for '{result.test_vector}'" - if result.error: - error_msg += "\n" + result.error - logger.error(error_msg) - - if ctx.options.abort_on_error: - return False - - if consecutive_failures >= MAX_FAILURES: - logger.error("Aborting due to too many consecutive failures") - return False - - eta_tracker.record(result) - progress_bar.update(1) - progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) - - except KeyboardInterrupt: - logger.info("Tuning interrupted by user") - raise - finally: - if executor: - executor.shutdown(wait=False, cancel_futures=True) - if progress_bar: - progress_bar.close() - - state_file.finalize_interrupted() + if has_errors and ctx.options.abort_on_error: + return False + + if consecutive_failures >= MAX_FAILURES: + logger.error("Aborting due to too many consecutive failures") + return False + + except KeyboardInterrupt: + logger.info("Tuning interrupted by user") + raise + finally: + if executor: + executor.shutdown(wait=False, cancel_futures=True) + if progress_bar: + progress_bar.close() + + state_file.finalize_interrupted() if has_errors: logger.error("Encountered errors during tuning") @@ -1577,7 +1536,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: # ============================================================================= -def resolve_paths(op_type: Operation, parsed_args) -> Paths: +def resolve_paths(op_type: Operation, parsed_args: argparse.Namespace) -> Paths: """Resolve paths based on operation type and arguments.""" if op_type == Operation.FUSION: configs_path = "./fusion_config_file" @@ 
-1588,34 +1547,42 @@ def resolve_paths(op_type: Operation, parsed_args) -> Paths: return perfRunner.create_paths(configs_path, parsed_args.mlir_build_dir) -def extract_fusion_configs(test_dir, paths: Paths) -> Operation: - """Extract tuning configurations from fusion E2E test files.""" +def extract_fusion_configs(test_dir: str, paths: Paths) -> Operation: + """Extract tuning configurations from fusion E2E test files. + + Writes extracted configs to paths.configuration_file_path and returns the detected operation type. + """ all_configs = [] op_type = Operation.FUSION + for filename in glob.glob(test_dir + '/*mlir'): logger.info(f"Extract from: {filename}") test_entry = perfRunner.get_fusion_test_info(filename, paths) if not test_entry: continue + test_vector = test_entry['testVector'] if not test_vector: continue + if test_vector in all_configs: - logger.info("An entry already exists in the tuning DB") + logger.debug("Duplicate entry skipped") continue + command_line = test_vector.split(sep=' ') if command_line[0].startswith('conv'): if op_type == Operation.FUSION: op_type = Operation.CONV elif op_type != Operation.CONV: - logger.warning(f"Invalid config op: {test_vector}") + logger.warning(f"Mixed operation types, skipping: {test_vector}") continue else: if op_type == Operation.FUSION: op_type = Operation.GEMM elif op_type != Operation.GEMM: - logger.warning(f"Invalid config op: {test_vector}") + logger.warning(f"Mixed operation types, skipping: {test_vector}") continue + all_configs.append(test_vector) with open(paths.configuration_file_path, 'w') as outfile: @@ -1635,7 +1602,8 @@ def get_config_class(op_type: Operation) -> type: Operation.CONV_GEMM: ConvGemmConfiguration, } - return config_classes.get(op_type, PerfConfiguration) + assert op_type in config_classes, f"No config class for operation: {op_type}" + return config_classes[op_type] def load_configs_from_stdin() -> str: @@ -1647,10 +1615,10 @@ def load_configs_from_stdin() -> str: return path -def load_configs(op_type: Operation, parsed_args, paths: Paths) -> List[str]: +def load_configs(op_type: Operation, parsed_args: argparse.Namespace, paths: Paths) -> List[str]: """Load configurations based on operation type and arguments.""" if parsed_args.config: - return parsed_args.config + return [parsed_args.config] loaders = { Operation.CONV: @@ -1667,11 +1635,8 @@ def load_configs(op_type: Operation, parsed_args, paths: Paths) -> List[str]: lambda: perfRunner.get_conv_gemm_configurations(paths.configuration_file_path), } - loader = loaders.get(op_type) - if loader: - return loader() - - raise ValueError(f"Unsupported operation type: {op_type}") + assert op_type in loaders, f"No config loader for operation: {op_type}" + return loaders[op_type]() # ============================================================================= @@ -1679,7 +1644,9 @@ def load_configs(op_type: Operation, parsed_args, paths: Paths) -> List[str]: # ============================================================================= -def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=None): +def parse_arguments(gpu_topology: GpuTopology, + available_gpus: List[int], + args=None) -> argparse.Namespace: """Parse and validate command-line arguments.""" parser = TuningArgumentParser( prog="tuningRunner.py", @@ -1694,11 +1661,12 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "--configs-file", "--configs_file", # for backward compatibility type=str, - help="Path to file containing list of configurations to 
tune") + metavar='FILE', + help="Path to file containing list of configurations to tune. Use '-' for stdin.") config_group.add_argument("--config", type=str, - nargs='*', + metavar='CONFIG', help="Specific config to tune. Format depends on --op type.") parser.add_argument("--op", @@ -1712,6 +1680,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "--output", type=str, default="tuning_results_local.tsv", + metavar='FILE', help= "Output file path for tuning results in TSV format. Results will be appended if file exists. Use '-' for stdout." ) @@ -1720,6 +1689,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "--mlir-build-dir", type=str, default=perfRunner.find_mlir_build_dir(), + metavar='DIR', help= "Path to rocMLIR build directory containing rocmlir-gen, rocmlir-driver, rocmlir-tuning-driver, and other build artifacts", ) @@ -1729,6 +1699,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "--rocmlir_gen_flags", # for backward compatibility type=str, default="", + metavar='FLAGS', help="Additional flags to pass to rocmlir-gen") parser.add_argument("-d", @@ -1742,17 +1713,19 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N choices=["quick", "full", "greedy", "exhaustive"], help="Tuning space kind to use") - parser.add_argument("-q", - "--quiet", - action='store_true', - default=False, - help="Suppress non-error output") + logging_group = parser.add_mutually_exclusive_group() - parser.add_argument("-v", - "--verbose", - action='store_true', - default=False, - help="Enable verbose output, including commands being executed") + logging_group.add_argument("-q", + "--quiet", + action='store_true', + default=False, + help="Suppress non-error output") + + logging_group.add_argument("-v", + "--verbose", + action='store_true', + default=False, + help="Enable verbose output, including commands being executed") parser.add_argument("--verify-mode", default="gpu", @@ -1772,6 +1745,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "--test_dir", # for backward compatibility default="../mlir/test/fusion/resnet50-e2e", type=str, + metavar='DIR', help= "Directory containing fusion E2E tests to extract configs from. Only used when --op=fusion." ) @@ -1783,6 +1757,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "fp8_fp8", "f4E2M1FN" ], default=["f32", "f16", "i8"], + metavar='TYPE', help="Force a set of data types for gemm tuning. Only used when --op=gemm.") parser.add_argument( @@ -1790,6 +1765,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N nargs='+', choices=["f32", "f8E8M0FNU"], default=None, + metavar='TYPE', help="Force a set of scale types for gemm tuning. 
Only used when --op=gemm.") parser.add_argument("--tflops", @@ -1841,8 +1817,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N def main(args=None): - global logger - + numa_topology = NumaTopology.discover() gpu_topology = GpuTopology.discover() available_gpus = sorted(gpu_topology.gpus.keys()) @@ -1852,7 +1827,7 @@ def main(args=None): parsed_args = parse_arguments(gpu_topology, available_gpus, args) - logger = setup_logger(quiet=parsed_args.quiet, verbose=parsed_args.verbose) + setup_logger(quiet=parsed_args.quiet, verbose=parsed_args.verbose) stdin_temp_file = None try: @@ -1907,7 +1882,7 @@ def main(args=None): paths=paths, options=options, gpu_topology=gpu_topology, - numa_topology=NumaTopology.discover()) + numa_topology=numa_topology) try: tuning_succeeded = tune_configs(ctx) From 110dccbacd1c13a9837bef429aff3835708481f5 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sun, 18 Jan 2026 12:11:14 +0000 Subject: [PATCH 11/23] Simplify state file and support multiple contexts. --- mlir/utils/performance/tuningRunner.py | 242 +++++++++---------------- 1 file changed, 82 insertions(+), 160 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index eeec94be3d58..c7521b313221 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -64,7 +64,6 @@ MLIR_N_REPEATS = 10 WARMUP_ITERATIONS = 1 SLEEP_US = 100 # 0.1 ms -MAX_FAILURES = 20 # ============================================================================= # Logging Setup @@ -311,66 +310,36 @@ class ConfigState(Enum): CRASHED = "crashed" # Process crashed while tuning (detected on startup) -@dataclass(frozen=True) -class TuningStateContext: - """Context that identifies a tuning run. 
State is invalidated if context changes.""" - arch: str - num_cu: int - tuning_space: str - - def matches(self, other: 'TuningStateContext') -> bool: - return (self.arch == other.arch and self.num_cu == other.num_cu and - self.tuning_space == other.tuning_space) - - @dataclass class TuningState: - """Persistent state for tuning runs, survives crashes and interrupts.""" - context: TuningStateContext + """State tracking for configs within a single context.""" configs: Dict[str, ConfigState] = field(default_factory=dict) def set_running(self, test_vector: str) -> None: - """Mark a config as currently running.""" self.configs[test_vector] = ConfigState.RUNNING def set_failed(self, test_vector: str) -> None: - """Mark a config as failed.""" self.configs[test_vector] = ConfigState.FAILED def set_interrupted(self, test_vector: str) -> None: - """Mark a config as interrupted by user.""" self.configs[test_vector] = ConfigState.INTERRUPTED - def set_crashed(self, test_vector: str) -> None: - """Mark a config as crashed.""" - self.configs[test_vector] = ConfigState.CRASHED - def remove(self, test_vector: str) -> None: - """Remove a config from state (e.g., on success).""" self.configs.pop(test_vector, None) def should_skip(self, test_vector: str) -> bool: - """Check if a config should be skipped (failed or crashed).""" return self.configs.get(test_vector) in (ConfigState.FAILED, ConfigState.CRASHED) - def _count_by_state(self, *states: ConfigState) -> int: - """Count configs in any of the given states.""" - return sum(1 for s in self.configs.values() if s in states) + def is_empty(self) -> bool: + return not self.configs def failed_count(self) -> int: - """Count of failed configs.""" - return self._count_by_state(ConfigState.FAILED) + return sum(1 for s in self.configs.values() if s == ConfigState.FAILED) def crashed_count(self) -> int: - """Count of crashed configs.""" - return self._count_by_state(ConfigState.CRASHED) - - def skip_count(self) -> int: - """Count of configs that should be skipped (failed + crashed).""" - return self._count_by_state(ConfigState.FAILED, ConfigState.CRASHED) + return sum(1 for s in self.configs.values() if s == ConfigState.CRASHED) def promote_running_to_interrupted(self) -> int: - """Move all RUNNING configs to INTERRUPTED (clean shutdown). Returns count.""" count = 0 for tv in self.configs: if self.configs[tv] == ConfigState.RUNNING: @@ -380,137 +349,114 @@ def promote_running_to_interrupted(self) -> int: class TuningStateFile: - """Manages reading and writing of tuning state to a JSON file. + """Manages multi-context tuning state in a JSON file. + + File format: + { + "contexts": { + "/": { + "test_vector_1": "failed", + "test_vector_2": "crashed" + } + } + } - If filepath is None, all operations are no-ops (null object pattern). + If filepath is None, all operations are no-ops. """ - def __init__(self, filepath: Optional[str]): + def __init__(self, filepath: Optional[str], arch: str, tuning_space: str): self.filepath = filepath + self.context_key = f"{arch}/{tuning_space}" self._lock = threading.Lock() - self._state: Optional[TuningState] = None + self._all_contexts: Dict[str, Dict[str, str]] = {} # context_key -> {tv -> state_str} + self._state = TuningState() - def load(self, expected_context: TuningStateContext) -> 'TuningStateFile': - """Load state from file. Returns self for chaining. 
+ self._load() + self._save_locked() # Persist any state transitions from load - On load: - - INTERRUPTED configs are demoted to PENDING (removed from state) - - RUNNING configs are promoted to CRASHED (indicates previous crash) - """ - if not self.filepath: - self._state = TuningState(context=expected_context) - return self + def _load(self) -> None: + """Load state from file. - if not os.path.exists(self.filepath): - self._state = TuningState(context=expected_context) - return self + For the active context only: + - INTERRUPTED configs are removed (will be retried) + - RUNNING configs become CRASHED (stale = crash) + """ + if not self.filepath or not os.path.exists(self.filepath): + return try: with open(self.filepath, 'r') as f: data = json.load(f) + self._all_contexts = data.get('contexts', {}) + except (json.JSONDecodeError, TypeError, OSError) as e: + logger.warning(f"Failed to load state file, starting fresh: {e}") + return - file_context = TuningStateContext(arch=data.get('arch', ''), - num_cu=data.get('numCUs', 0), - tuning_space=data.get('tuningSpace', '')) - - if not file_context.matches(expected_context): - logger.warning("State file context mismatch, starting fresh") - self._state = TuningState(context=expected_context) - return self - - configs = {} - for tv, state_str in data.get('configs', {}).items(): + # Process configs for active context with state transitions + if self.context_key in self._all_contexts: + for tv, state_str in self._all_contexts[self.context_key].items(): try: - config_state = ConfigState(state_str) - # Demote INTERRUPTED to PENDING (don't add to configs) - if config_state == ConfigState.INTERRUPTED: - continue - # Promote RUNNING to CRASHED (stale running = crash) - if config_state == ConfigState.RUNNING: - config_state = ConfigState.CRASHED - configs[tv] = config_state + state = ConfigState(state_str) + if state == ConfigState.INTERRUPTED: + continue # Remove - will retry + if state == ConfigState.RUNNING: + state = ConfigState.CRASHED # Stale running = crashed + self._state.configs[tv] = state except ValueError: - pass # Skip invalid states - - self._state = TuningState(context=expected_context, configs=configs) - return self - - except (json.JSONDecodeError, KeyError, TypeError) as e: - logger.warning(f"Failed to load state file: {e}") - self._state = TuningState(context=expected_context) - return self + logger.warning(f"Unknown state '{state_str}' for config '{tv}' in state file") @property def state(self) -> TuningState: - """Get the current state. Must call load() first.""" - if self._state is None: - raise RuntimeError("State not loaded. Call load() first.") return self._state def _save_locked(self) -> None: - """Save state to file atomically. 
Assumes lock is held.""" - if not self.filepath or not self._state: + if not self.filepath: return - data = { - 'arch': self._state.context.arch, - 'numCUs': self._state.context.num_cu, - 'tuningSpace': self._state.context.tuning_space, - 'configs': { + # Update active context in all_contexts + if not self._state.is_empty(): + self._all_contexts[self.context_key] = { tv: s.value for tv, s in self._state.configs.items() } - } + else: + self._all_contexts.pop(self.context_key, None) + + # Remove empty contexts + self._all_contexts = {k: v for k, v in self._all_contexts.items() if v} + + # Delete file if nothing left, otherwise save + if not self._all_contexts: + if os.path.exists(self.filepath): + os.remove(self.filepath) + return - # Write to temp file then rename for atomicity temp_path = self.filepath + '.tmp' with open(temp_path, 'w') as f: - json.dump(data, f, indent=2) + json.dump({'contexts': self._all_contexts}, f, indent=2) os.replace(temp_path, self.filepath) - def save(self) -> None: - """Save state to file atomically. No-op if filepath is None.""" + def set_running(self, test_vector: str) -> None: with self._lock: + self._state.set_running(test_vector) self._save_locked() - def delete(self) -> None: - """Delete the state file. No-op if filepath is None.""" - if not self.filepath: - return - - with self._lock: - if os.path.exists(self.filepath): - os.remove(self.filepath) - self._state = None - - def set_running(self, test_vector: str) -> None: - """Mark a config as running and save.""" - if self._state: - with self._lock: - self._state.set_running(test_vector) - self._save_locked() - def set_failed(self, test_vector: str) -> None: - """Mark a config as failed and save.""" - if self._state: - with self._lock: - self._state.set_failed(test_vector) - self._save_locked() + with self._lock: + self._state.set_failed(test_vector) + self._save_locked() def set_success(self, test_vector: str) -> None: - """Remove a config from state (success) and save.""" - if self._state: - with self._lock: - self._state.remove(test_vector) - self._save_locked() + with self._lock: + self._state.remove(test_vector) + self._save_locked() def finalize_interrupted(self) -> None: - """Mark any RUNNING configs as INTERRUPTED and save. Called on clean shutdown.""" - if self._state: - with self._lock: - interrupted_count = self._state.promote_running_to_interrupted() - if interrupted_count > 0: - logger.info(f"Marked {interrupted_count} running config(s) as interrupted") - self._save_locked() + """Mark RUNNING configs as INTERRUPTED on clean shutdown.""" + with self._lock: + count = self._state.promote_running_to_interrupted() + if count > 0: + logger.info(f"Marked {count} running config(s) as interrupted") + self._save_locked() def get_state_filepath(output_filepath: str) -> Optional[str]: @@ -1257,14 +1203,8 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio if not test_vector.endswith(".mlir"): command_line = test_vector.split(sep=' ') - try: - config = conf_class.from_command_line(command_line, options.arch, options.num_cu, - options.num_chiplets) - except ValueError as e: - return TuningResult(test_vector=test_vector, - success=False, - gpu_id=gpu_id, - error=str(e)) + config = conf_class.from_command_line(command_line, options.arch, options.num_cu, + options.num_chiplets) command_line_options = config.generate_mlir_driver_commandline( options.rocmlir_gen_flags, kernel_repeats=None) # Note, we don't need the -ph, this goes to the tuning driver. 
@@ -1302,14 +1242,8 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio error=error) result = output.decode('utf-8').strip().split('\t') command_line = result[2].split(sep=' ') - try: - config = conf_class.from_command_line(command_line, options.arch, options.num_cu, - options.num_chiplets) - except ValueError as e: - return TuningResult(test_vector=test_vector, - success=False, - gpu_id=gpu_id, - error=str(e)) + config = conf_class.from_command_line(command_line, options.arch, options.num_cu, + options.num_chiplets) tuning_driver_command += [test_vector] tuning_driver = subprocess.Popen(tuning_driver_command, stdout=subprocess.PIPE, @@ -1386,11 +1320,8 @@ def tune_configs(ctx: TuningContext) -> bool: logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") # Load state file - state_context = TuningStateContext(arch=ctx.options.arch, - num_cu=ctx.options.num_cu, - tuning_space=ctx.options.tuning_space_kind) - state_file = TuningStateFile(get_state_filepath(ctx.options.output)) - state_file.load(state_context) + state_file = TuningStateFile(get_state_filepath(ctx.options.output), ctx.options.arch, + ctx.options.tuning_space_kind) state = state_file.state crashed_count = state.crashed_count() @@ -1401,8 +1332,6 @@ def tune_configs(ctx: TuningContext) -> bool: if failed_count > 0: logger.warning(f"Found {failed_count} failed config(s) in state file") - state_file.save() - # Filter out already-tuned configs (unless --retune) pending_configs = ctx.configs skipped_success = 0 @@ -1455,7 +1384,6 @@ def execute_tuning_task(test_vector: str) -> TuningResult: return result has_errors = False - consecutive_failures = 0 with (OutputFileWriter(ctx.options.output, ctx.options) as results_writer, DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext() as @@ -1471,7 +1399,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: file=sys.stderr, desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", unit="config", - leave=True, + leave=False, bar_format= '{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [t={elapsed}{postfix}]') progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) @@ -1486,14 +1414,12 @@ def execute_tuning_task(test_vector: str) -> TuningResult: result = completed_future.result() if result.success: - consecutive_failures = 0 results_writer.write_result(result) if debug_writer: debug_writer.write_result(result) state_file.set_success(result.test_vector) else: has_errors = True - consecutive_failures += 1 state_file.set_failed(result.test_vector) error_msg = f"[GPU {result.gpu_id}] Tuning failed for '{result.test_vector}'" @@ -1508,10 +1434,6 @@ def execute_tuning_task(test_vector: str) -> TuningResult: if has_errors and ctx.options.abort_on_error: return False - if consecutive_failures >= MAX_FAILURES: - logger.error("Aborting due to too many consecutive failures") - return False - except KeyboardInterrupt: logger.info("Tuning interrupted by user") raise From 90132b2de8394cdd2f9d625f44096fa92d518c04 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sun, 18 Jan 2026 12:25:47 +0000 Subject: [PATCH 12/23] Address copilot comments. 
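
The review fixes are small: the "Exctract" typo in the header parser is corrected, truncate() is annotated as taking and returning Optional[str], and the lazy TSV header emission moves its guard to the call site, so _write_header() itself becomes unconditional. A minimal sketch of that write path, reduced to the two methods involved (the class name _SketchWriter and the fixed column list below are illustrative only, not part of the patch):

    class _SketchWriter:
        def __init__(self, file):
            self.file = file
            self._header_written = False

        def _write_header(self):
            # Unconditional now; callers decide when a header is needed.
            print("# " + "\t".join(["arch", "testVector", "perfConfig"]), file=self.file)
            self._header_written = True

        def write_result(self, fields):
            if not self._header_written:
                self._write_header()
            print("\t".join(fields), file=self.file)
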
--- mlir/utils/performance/tuningRunner.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index c7521b313221..ed7a0de0bdaf 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -571,7 +571,7 @@ def _parse_header_line(line: str) -> Dict[str, int]: for i, col in enumerate(header_text.split('\t')): if not col: continue - # Exctract base column name (handles 'perfConfig (tuning_space)') + # Extract base column name (handles 'perfConfig (tuning_space)') col_name = col.split()[0] indices[col_name] = i @@ -835,9 +835,6 @@ def __exit__(self, exc_type, exc_value, traceback): self.file.close() def _write_header(self): - if self._header_written: - return - if self._is_appending: print("", file=self.file) # Blank line before new section @@ -852,15 +849,17 @@ def _write_header(self): if self.options.tflops: columns.append('TFlops') columns.append('elapsedSeconds') - print("# " + "\t".join(columns), file=self.file) + print("# " + "\t".join(columns), file=self.file) self.file.flush() + self._header_written = True def write_result(self, result: TuningResult): assert result.success and result.winning_config and result.max_tflops, "write_result called with invalid result" - self._write_header() + if not self._header_written: + self._write_header() fields = [ self.options.arch, @@ -870,8 +869,8 @@ def write_result(self, result: TuningResult): if self.options.tflops: fields.append(str(result.max_tflops)) fields.append(f"{result.elapsed_seconds:.1f}") - print("\t".join(fields), file=self.file) + print("\t".join(fields), file=self.file) self.file.flush() @@ -899,8 +898,8 @@ def write_result(self, result: TuningResult): sep='\t', header=not self._header_written, index=False) - self.file.flush() + self._header_written = True @@ -999,7 +998,7 @@ def format_error(context: str, max_lines: int = 10) -> str: """Format an error message with optional details.""" - def truncate(text: str) -> str: + def truncate(text: Optional[str]) -> Optional[str]: if not text or not text.strip(): return None lines = text.strip().splitlines() From b781b40c3eb1958a33b4cf22a3f1c6dde1a9ac97 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 19 Jan 2026 18:11:54 +0000 Subject: [PATCH 13/23] Show tuning-driver output during failures. 
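
When the tuning pipeline exits non-zero, the error report now carries the tail of the driver's stdout next to its stderr, and non-empty stderr from successful runs is logged at warning level rather than debug. The intent is roughly the helper below; stdout_tail is an illustrative name rather than a function added here, and since format_error() expects a plain string the tail is shown joined into one:

    def stdout_tail(stdout_text: str, n: int = 10) -> str:
        # Keep only the last n lines so failure logs stay readable.
        return "\n".join(stdout_text.splitlines()[-n:])

    # For example:
    #   format_error("Tuning pipeline failed",
    #                command=tuning_pipeline,
    #                stdout=stdout_tail(tuning_stdout.decode("utf-8")),
    #                stderr=tuning_errors,
    #                exit_code=tuning_driver.returncode, gpu_id=gpu_id)
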
--- mlir/utils/performance/tuningRunner.py | 79 +++++++++++++------------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index ed7a0de0bdaf..7b81fb2f3660 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -658,7 +658,9 @@ def _format_rate(self, seconds: float) -> str: return f"{seconds / 3600:.1f}h/cfg" def _format_eta(self, seconds: float) -> str: - if seconds < 60: + if seconds == 0: + return "0s" + elif seconds < 60: return "<1m" elif seconds < 3600: return f"{int(seconds // 60)}m" @@ -678,7 +680,7 @@ def get_postfix_str(self) -> str: eta = "n/a" if len(self.success_times) >= 3: median = statistics.median(self.success_times) - eta_seconds = (remaining / self.num_workers) * median + eta_seconds = (remaining / self.num_workers) * median if self.num_workers > 0 else 0 rate = self._format_rate(median) eta = self._format_eta(eta_seconds) @@ -735,9 +737,9 @@ def get_compile_threads(self, gpu_id: int) -> int: """Get the number of compile threads allocated to a GPU.""" return self._threads_per_gpu[gpu_id] - def print_gpu_summary(self): + def print_gpu_summary(self, num_workers: Optional[int] = None) -> None: """Print summary of GPU allocation.""" - num_active = len(self.options.gpu_ids) + num_active = num_workers or len(self.options.gpu_ids) lines = [f"Using {num_active} GPU(s)"] for gpu_id in self.options.gpu_ids[:num_active]: node = self.gpu_topology.get_numa_node(gpu_id) @@ -1255,20 +1257,23 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio # Note: communicate waits for process to terminate which might cause CI timeouts if tuning takes too long tuning_stdout, tuning_stderr = tuning_driver.communicate() + tuning_output = tuning_stdout.decode('utf-8').splitlines() + tuning_errors = tuning_stderr.decode('utf-8') + if tuning_driver.returncode != 0: - error = format_error("Tuning pipeline failed", - command=tuning_pipeline, - stderr=tuning_stderr.decode('utf-8'), - exit_code=tuning_driver.returncode, - gpu_id=gpu_id) + error = format_error( + "Tuning pipeline failed", + command=tuning_pipeline, + stdout=tuning_output[-10:], # Last 10 lines of stdout + stderr=tuning_errors, + exit_code=tuning_driver.returncode, + gpu_id=gpu_id) return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=error) else: # Log any stderr output from tuning driver because it may contain warnings - tuning_stderr_str = tuning_stderr.decode('utf-8').strip() - if tuning_stderr_str: - logger.debug(f"[GPU {gpu_id}] rocmlir-tuning-driver stderr:\n{tuning_stderr_str}") + if tuning_errors.strip(): + logger.warning(f"[GPU {gpu_id}] rocmlir-tuning-driver stderr:\n{tuning_errors}") - tuning_output = tuning_stdout.decode('utf-8').splitlines() winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, options, gpu_id) except TuningError as e: @@ -1310,29 +1315,27 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio def tune_configs(ctx: TuningContext) -> bool: """Tune multiple configurations in parallel across available GPUs.""" - # Load cached results unless retuning is forced + # Load tuned configs from output file (unless --retune) if ctx.options.retune: cache = TunedConfigsCache() else: cache = TunedConfigsCache.from_output_file(ctx.options) - if cache.count() > 0: - logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") # Load state file state_file 
= TuningStateFile(get_state_filepath(ctx.options.output), ctx.options.arch, ctx.options.tuning_space_kind) state = state_file.state - crashed_count = state.crashed_count() - if crashed_count > 0: - logger.warning(f"Found {crashed_count} crashed config(s) in state file") + if cache.count() > 0: + logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") + if state.crashed_count() > 0: + logger.warning(f"Found {state.crashed_count()} crashed config(s) in state file") + if state.failed_count() > 0: + logger.warning(f"Found {state.failed_count()} failed config(s) in state file") - failed_count = state.failed_count() - if failed_count > 0: - logger.warning(f"Found {failed_count} failed config(s) in state file") + pending_configs = ctx.configs # Filter out already-tuned configs (unless --retune) - pending_configs = ctx.configs skipped_success = 0 if not ctx.options.retune: pending_configs = [c for c in pending_configs if not cache.contains(c)] @@ -1359,7 +1362,7 @@ def tune_configs(ctx: TuningContext) -> bool: pool = GpuWorkerPool(ctx) num_workers = min(pool.worker_count, len(pending_configs)) - ctx.print_gpu_summary() + ctx.print_gpu_summary(num_workers=num_workers) # Prepare ETA tracker with historical data initial_times = [r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0] @@ -1369,27 +1372,14 @@ def tune_configs(ctx: TuningContext) -> bool: ok_count=skipped_success, fail_count=skipped_failed) - def execute_tuning_task(test_vector: str) -> TuningResult: - gpu_id = pool.acquire_gpu_for_thread() - - state_file.set_running(test_vector) - - start_time = time.time() - compile_threads = ctx.get_compile_threads(gpu_id) - result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, - compile_threads) - result.elapsed_seconds = time.time() - start_time - - return result - has_errors = False with (OutputFileWriter(ctx.options.output, ctx.options) as results_writer, DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext() as debug_writer): + executor = None progress_bar = None - try: # No context manager for executor because we need to shutdown with wait=False progress_bar = tqdm( total=len(ctx.configs), @@ -1403,6 +1393,19 @@ def execute_tuning_task(test_vector: str) -> TuningResult: '{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [t={elapsed}{postfix}]') progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) + def execute_tuning_task(test_vector: str) -> TuningResult: + gpu_id = pool.acquire_gpu_for_thread() + + state_file.set_running(test_vector) + + start_time = time.time() + compile_threads = ctx.get_compile_threads(gpu_id) + result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, + compile_threads) + result.elapsed_seconds = time.time() - start_time + + return result + executor = ThreadPoolExecutor(max_workers=num_workers) pending_futures = { executor.submit(execute_tuning_task, test_vector): test_vector From 056b12c3fb90d3987ea9a0b781e40b835812e151 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 19 Jan 2026 20:58:26 +0000 Subject: [PATCH 14/23] Add --status option. 
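
The new -s/--status flag performs the usual cache and state-file bookkeeping, reports how many configurations are still pending for the selected output file and tuning space, and returns without tuning anything. Because main() takes an argument list, the same check can be driven from Python as well; a rough sketch, assuming tuningRunner.py is importable and that configs.txt and a rocMLIR build directory exist (both are placeholders, not part of this patch):

    import tuningRunner

    exit_code = tuningRunner.main(args=[
        "--op", "gemm",
        "--configs-file", "configs.txt",            # placeholder config list
        "--output", "tuning_results_local.tsv",
        "--status",                                 # report pending work only
    ])
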
--- mlir/utils/performance/tuningRunner.py | 45 +++++++++++++++----------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 7b81fb2f3660..7505b18d55b1 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -1313,7 +1313,7 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio verify_tflops=verify_tflops) -def tune_configs(ctx: TuningContext) -> bool: +def tune_configs(ctx: TuningContext, status_only: bool) -> bool: """Tune multiple configurations in parallel across available GPUs.""" # Load tuned configs from output file (unless --retune) if ctx.options.retune: @@ -1356,6 +1356,10 @@ def tune_configs(ctx: TuningContext) -> bool: logger.info( f"Skipping {skipped_failed} failed/crashed config(s) - use '--retry-failed' to retune") + if status_only: + logger.info(f"{len(pending_configs)}/{len(ctx.configs)} config(s) pending tuning") + return True + if not pending_configs: logger.info("No configurations to tune") return True @@ -1480,7 +1484,7 @@ def extract_fusion_configs(test_dir: str, paths: Paths) -> Operation: op_type = Operation.FUSION for filename in glob.glob(test_dir + '/*mlir'): - logger.info(f"Extract from: {filename}") + logger.info(f"Extracting fusion configs from: {filename}") test_entry = perfRunner.get_fusion_test_info(filename, paths) if not test_entry: continue @@ -1588,10 +1592,11 @@ def parse_arguments(gpu_topology: GpuTopology, metavar='FILE', help="Path to file containing list of configurations to tune. Use '-' for stdin.") - config_group.add_argument("--config", - type=str, - metavar='CONFIG', - help="Specific config to tune. Format depends on --op type.") + config_group.add_argument( + "--config", + type=str, + metavar='CONFIG', + help="Specific config to tune. Can be a config string or path to an .mlir file.") parser.add_argument("--op", "--operation", @@ -1737,11 +1742,16 @@ def parse_arguments(gpu_topology: GpuTopology, "Wait for all compilation tasks to complete before starting tuning. Useful for systems with shared CPU/GPU memory (e.g., APUs)." 
) + parser.add_argument("-s", + "--status", + action='store_true', + default=False, + help="Only show tuning status without performing any tuning") + return parser.parse_args(args) def main(args=None): - numa_topology = NumaTopology.discover() gpu_topology = GpuTopology.discover() available_gpus = sorted(gpu_topology.gpus.keys()) @@ -1753,16 +1763,15 @@ def main(args=None): setup_logger(quiet=parsed_args.quiet, verbose=parsed_args.verbose) + op_type = Operation.from_name(parsed_args.op) + + # Handle stdin for configs file stdin_temp_file = None - try: - # Handle stdin for configs file - if parsed_args.configs_file == '-': - stdin_temp_file = load_configs_from_stdin() - parsed_args.configs_file = stdin_temp_file + if parsed_args.configs_file == '-': + parsed_args.configs_file = load_configs_from_stdin() - op_type = Operation.from_name(parsed_args.op) + try: paths = resolve_paths(op_type, parsed_args) - if not paths.mlir_paths: logger.error("rocMLIR build dir was not provided/found") return 1 @@ -1770,9 +1779,7 @@ def main(args=None): if op_type == Operation.FUSION: op_type = extract_fusion_configs(parsed_args.test_dir, paths) - conf_class = get_config_class(op_type) configs = load_configs(op_type, parsed_args, paths) - finally: if stdin_temp_file: os.unlink(stdin_temp_file) @@ -1802,14 +1809,14 @@ def main(args=None): wait_for_compiles=parsed_args.wait_for_compiles) ctx = TuningContext(configs=configs, - conf_class=conf_class, + conf_class=get_config_class(op_type), paths=paths, options=options, gpu_topology=gpu_topology, - numa_topology=numa_topology) + numa_topology=NumaTopology.discover()) try: - tuning_succeeded = tune_configs(ctx) + tuning_succeeded = tune_configs(ctx, status_only=parsed_args.status) except KeyboardInterrupt: return 130 # 128 + SIGINT From 7041c6455a87df88df2b7e8ab04d4fb080ecf329 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Wed, 21 Jan 2026 14:08:28 +0000 Subject: [PATCH 15/23] Improve order of logs for easier tracking. 
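
Failures are now logged from inside tune_config() and verify_perfconfig() as they happen, prefixed with the GPU they ran on, instead of being carried back in TuningResult.error and printed afterwards; child processes killed by SIGINT/SIGTERM/SIGHUP/SIGQUIT are re-raised as KeyboardInterrupt so interrupts are not counted as tuning failures. The GPU prefix comes from a logging.LoggerAdapter; the self-contained snippet below shows the same pattern on its own (the logger name and GPU id are arbitrary examples):

    import logging

    class GpuLoggerAdapter(logging.LoggerAdapter):
        """Prefix every message with the GPU id carried in 'extra'."""

        def process(self, msg, kwargs):
            gpu_id = self.extra.get('gpu_id')
            if gpu_id is not None:
                return f"[GPU {gpu_id}] {msg}", kwargs
            return msg, kwargs

    logging.basicConfig(level=logging.INFO)
    log = GpuLoggerAdapter(logging.getLogger("tuningRunner"), {'gpu_id': 3})
    log.warning("rocmlir-tuning-driver stderr: ...")  # message becomes "[GPU 3] rocmlir-tuning-driver stderr: ..."
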
--- mlir/utils/performance/tuningRunner.py | 118 ++++++++++++++++--------- 1 file changed, 76 insertions(+), 42 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 850b2c77caab..80e2a56879d1 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -28,6 +28,7 @@ import json import logging import os +import signal import statistics import subprocess import sys @@ -111,6 +112,16 @@ def emit(self, record): self.handleError(record) +class GpuLoggerAdapter(logging.LoggerAdapter): + """Logger adapter that prefixes messages with GPU ID.""" + + def process(self, msg, kwargs): + gpu_id = self.extra.get('gpu_id') + if gpu_id is not None: + return f"[GPU {gpu_id}] {msg}", kwargs + return msg, kwargs + + def setup_logger(quiet: bool = False, verbose: bool = False) -> logging.Logger: """Configure and return a logger for tuningRunner.""" assert not (quiet and verbose), "quiet and verbose are mutually exclusive" @@ -126,6 +137,11 @@ def setup_logger(quiet: bool = False, verbose: bool = False) -> logging.Logger: logger.addHandler(TqdmLoggingHandler(use_color=sys.stderr.isatty())) +def get_gpu_logger(gpu_id: int) -> logging.LoggerAdapter: + """Get a logger adapter for a specific GPU.""" + return GpuLoggerAdapter(logger, {'gpu_id': gpu_id}) + + # Module-level logger logger: logging.Logger = logging.getLogger("tuningRunner") @@ -168,7 +184,6 @@ class TuningResult: max_tflops: Optional[float] = None entries: List[Dict] = field(default_factory=list) verify_tflops: Optional[float] = None - error: Optional[str] = None # ============================================================================= @@ -909,6 +924,20 @@ def write_result(self, result: TuningResult): # Utilities # ============================================================================= +# Signals that indicate user/system requested termination (should not be logged as failures) +TERMINATION_SIGNALS = frozenset({ + signal.SIGINT, # Ctrl+C + signal.SIGTERM, # Graceful termination request + signal.SIGHUP, # Terminal hangup + signal.SIGQUIT, # Quit from keyboard +}) + + +def raise_if_terminated(returncode: int) -> None: + """Raise KeyboardInterrupt if returncode indicates termination.""" + if -returncode in TERMINATION_SIGNALS: + raise KeyboardInterrupt() + class TuningArgumentParser(argparse.ArgumentParser): """ArgumentParser with custom validation for tuning arguments.""" @@ -1051,6 +1080,8 @@ def verify_perfconfig(perfconfig: str, config: PerfConfiguration, paths: Paths, Returns the execution time in nanoseconds, or raises TuningError on failure. 
""" + gpu_logger = get_gpu_logger(gpu_id) + config.set_perfconfig(perfconfig) command_line_options = config.generate_mlir_driver_commandline(options.rocmlir_gen_flags, @@ -1075,7 +1106,7 @@ def verify_perfconfig(perfconfig: str, config: PerfConfiguration, paths: Paths, verification_pipeline = " | ".join([ ' '.join(rocmlir_gen_command), ' '.join(rocmlir_driver_command), ' '.join(rocprof_command) ]) - logger.debug(f"[GPU {gpu_id}] Verifying perfconfig '{perfconfig}'\n{verification_pipeline}") + gpu_logger.debug(f"Verifying perfconfig '{perfconfig}'\nCommand: {verification_pipeline}") with tempfile.TemporaryDirectory() as tmpdir: p1 = None @@ -1105,6 +1136,7 @@ def verify_perfconfig(perfconfig: str, config: PerfConfiguration, paths: Paths, try: outs, errs = p3.communicate(timeout=600) + raise_if_terminated(p3.returncode) outs = outs.decode('utf-8') if p3.returncode != 0 or not CORRECT_RESULT_RE.search(outs): raise TuningError( @@ -1146,6 +1178,8 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa Returns the winning config, its TFLOPS, and all entries. """ + gpu_logger = get_gpu_logger(gpu_id) + max_tflops: Optional[float] = None winning_config: Optional[str] = None entries = [] @@ -1157,7 +1191,7 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa parts = result.split('\t') if len(parts) < 2: - logger.debug(f"Skipping malformed tuning output line: '{result}'") + gpu_logger.debug(f"Skipping malformed tuning output line: '{result}'") continue perfconfig = parts[0] @@ -1170,7 +1204,7 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa nano_seconds = float(time) measurements = json.loads(parts[1]) if len(parts) == 3 else None except (ValueError, json.JSONDecodeError): - logger.debug(f"Skipping malformed tuning output line: '{result}'") + gpu_logger.debug(f"Skipping malformed tuning output line: '{result}'") continue config.set_perfconfig(perfconfig) @@ -1195,6 +1229,8 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Options, gpu_id: int, num_compile_threads: int) -> TuningResult: """Tune a single configuration and return the result.""" + gpu_logger = get_gpu_logger(gpu_id) + tuning_driver_args = [ f"--tuning-space={options.tuning_space_kind}", f"--num-iterations={MLIR_N_REPEATS}", f"--warmup-iterations={WARMUP_ITERATIONS}", "--use-median", f"--sleep-us={SLEEP_US}", @@ -1239,16 +1275,15 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio stderr=subprocess.PIPE, env=env) output, err = tuning_key.communicate() + raise_if_terminated(tuning_key.returncode) if tuning_key.returncode != 0: - error = format_error("Failed to generate tuning key", - command=' '.join(rocmlir_gen_command), - stderr=err.decode('utf-8'), - exit_code=tuning_key.returncode, - gpu_id=gpu_id) - return TuningResult(test_vector=test_vector, - success=False, - gpu_id=gpu_id, - error=error) + gpu_logger.error( + format_error("Failed to generate tuning key", + command=' '.join(rocmlir_gen_command), + stderr=err.decode('utf-8'), + exit_code=tuning_key.returncode, + gpu_id=gpu_id)) + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) result = output.decode('utf-8').strip().split('\t') command_line = result[2].split(sep=' ') config = conf_class.from_command_line(command_line, options.arch, options.num_cu, @@ -1260,55 +1295,55 @@ def tune_config(test_vector: str, conf_class: type, 
paths: Paths, options: Optio env=env) tuning_pipeline = ' '.join(tuning_driver_command) - logger.debug(f"[GPU {gpu_id}] Tuning '{test_vector}'\n{tuning_pipeline}") + gpu_logger.debug(f"Tuning '{test_vector}'\nCommand: {tuning_pipeline}") # Note: communicate waits for process to terminate which might cause CI timeouts if tuning takes too long tuning_stdout, tuning_stderr = tuning_driver.communicate() + raise_if_terminated(tuning_driver.returncode) + tuning_output = tuning_stdout.decode('utf-8').splitlines() tuning_errors = tuning_stderr.decode('utf-8') if tuning_driver.returncode != 0: - error = format_error( - "Tuning pipeline failed", - command=tuning_pipeline, - stdout=tuning_output[-10:], # Last 10 lines of stdout - stderr=tuning_errors, - exit_code=tuning_driver.returncode, - gpu_id=gpu_id) - return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=error) + gpu_logger.error( + format_error( + "Tuning pipeline failed", + command=tuning_pipeline, + stdout=tuning_output[-10:], # Last 10 lines of stdout + stderr=tuning_errors, + exit_code=tuning_driver.returncode, + gpu_id=gpu_id)) + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) else: # Log any stderr output from tuning driver because it may contain warnings if tuning_errors.strip(): - logger.warning(f"[GPU {gpu_id}] rocmlir-tuning-driver stderr:\n{tuning_errors}") + gpu_logger.warning(f"rocmlir-tuning-driver stderr:\n{tuning_errors}") winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, options, gpu_id) except TuningError as e: - return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=str(e)) + gpu_logger.error(str(e)) + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) finally: kill_process(rocmlir_gen) kill_process(tuning_driver) if winning_config is None: - return TuningResult(test_vector=test_vector, - success=False, - gpu_id=gpu_id, - error="No valid perf config found") + gpu_logger.error("No valid perf config found") + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) verify_tflops = None if options.verify_mode != "none": try: verify_ns = verify_perfconfig(winning_config, config, paths, options, gpu_id) except TuningError as e: - return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=str(e)) + gpu_logger.error(str(e)) + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) if np.isnan(verify_ns): - return TuningResult( - test_vector=test_vector, - success=False, - gpu_id=gpu_id, - error=f"Verification returned NaN for winning perfconfig '{winning_config}'") + gpu_logger.error(f"Verification returned NaN for winning perfconfig '{winning_config}'") + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) verify_tflops = config.compute_tflops(verify_ns) @@ -1416,6 +1451,11 @@ def execute_tuning_task(test_vector: str) -> TuningResult: compile_threads) result.elapsed_seconds = time.time() - start_time + if result.success: + state_file.set_success(result.test_vector) + else: + state_file.set_failed(result.test_vector) + return result executor = ThreadPoolExecutor(max_workers=num_workers) @@ -1431,15 +1471,9 @@ def execute_tuning_task(test_vector: str) -> TuningResult: results_writer.write_result(result) if debug_writer: debug_writer.write_result(result) - state_file.set_success(result.test_vector) else: has_errors = True - state_file.set_failed(result.test_vector) - - error_msg = f"[GPU {result.gpu_id}] Tuning 
failed for '{result.test_vector}'" - if result.error: - error_msg += "\n" + result.error - logger.error(error_msg) + logger.error(f"Tuning failed for '{result.test_vector}' on GPU {result.gpu_id}") eta_tracker.record(result) progress_bar.update(1) @@ -1825,7 +1859,7 @@ def main(args=None): try: tuning_succeeded = tune_configs(ctx, status_only=parsed_args.status) except KeyboardInterrupt: - return 130 # 128 + SIGINT + return 128 + signal.SIGINT return 0 if tuning_succeeded else 1 From 8f4259e9a25e2d7f4b3ed66dc3c4b1713c6948f3 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Wed, 21 Jan 2026 14:32:09 +0000 Subject: [PATCH 16/23] Update github ci python version. --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08a622077d13..8e0a3d5a82af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: py-checks: runs-on: ubuntu-latest container: - image: python:3.8 + image: python:3.10 options: --user root steps: - uses: actions/checkout@v4 From aeb430c1e6a241421f9806d1be41530a80799e06 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 2 Feb 2026 20:39:18 +0000 Subject: [PATCH 17/23] Fix state transitions. --- mlir/utils/performance/tuningRunner.py | 76 ++++++++++++++++---------- 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 80e2a56879d1..8ffa4662ea76 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -167,7 +167,7 @@ class Options: output: str abort_on_error: bool retune: bool - retry_failed: bool + retry_states: frozenset gpu_ids: List[int] num_cpus: Optional[int] wait_for_compiles: bool @@ -309,15 +309,15 @@ class ConfigState(Enum): State transitions: PENDING (implicit) -> RUNNING: Config starts tuning - RUNNING -> SUCCESS (implicit): Tuning completes successfully (removed from state, written to output) + RUNNING -> SUCCEEDED (implicit): Tuning completes successfully (removed from state, written to output) RUNNING -> FAILED: Tuning completes with error RUNNING -> INTERRUPTED: User interrupted (Ctrl+C) during tuning RUNNING -> CRASHED: Detected on next startup (stale RUNNING state) - FAILED/CRASHED -> PENDING: User requests retry with --retry-failed + -> PENDING: User requests retry with --retry - Note: PENDING and SUCCESS are implicit states: + Note: PENDING and SUCCEEDED are implicit states: - PENDING: not in state file AND not in output file - - SUCCESS: in output file (not tracked in state file) + - SUCCEEDED: in output file (not tracked in state file) """ RUNNING = "running" # Currently being tuned FAILED = "failed" # Tuning completed with error @@ -325,25 +325,36 @@ class ConfigState(Enum): CRASHED = "crashed" # Process crashed while tuning (detected on startup) +# States representing unsuccessful tuning outcomes that are skipped by default +UNSUCCESSFUL_STATES = frozenset({ConfigState.FAILED, ConfigState.CRASHED}) + + @dataclass class TuningState: """State tracking for configs within a single context.""" configs: Dict[str, ConfigState] = field(default_factory=dict) + _pre_running_states: Dict[str, ConfigState] = field(default_factory=dict) def set_running(self, test_vector: str) -> None: + if test_vector in self.configs: + self._pre_running_states[test_vector] = self.configs[test_vector] self.configs[test_vector] = ConfigState.RUNNING def set_failed(self, test_vector: str) -> None: 
self.configs[test_vector] = ConfigState.FAILED + self._pre_running_states.pop(test_vector, None) def set_interrupted(self, test_vector: str) -> None: self.configs[test_vector] = ConfigState.INTERRUPTED + self._pre_running_states.pop(test_vector, None) def remove(self, test_vector: str) -> None: self.configs.pop(test_vector, None) + self._pre_running_states.pop(test_vector, None) - def should_skip(self, test_vector: str) -> bool: - return self.configs.get(test_vector) in (ConfigState.FAILED, ConfigState.CRASHED) + def should_skip(self, test_vector: str, retry_states: frozenset = frozenset()) -> bool: + state = self.configs.get(test_vector) + return state in UNSUCCESSFUL_STATES and state not in retry_states def is_empty(self) -> bool: return not self.configs @@ -358,7 +369,8 @@ def promote_running_to_interrupted(self) -> int: count = 0 for tv in self.configs: if self.configs[tv] == ConfigState.RUNNING: - self.configs[tv] = ConfigState.INTERRUPTED + prev_state = self._pre_running_states.pop(tv, None) + self.configs[tv] = prev_state or ConfigState.INTERRUPTED count += 1 return count @@ -460,7 +472,7 @@ def set_failed(self, test_vector: str) -> None: self._state.set_failed(test_vector) self._save_locked() - def set_success(self, test_vector: str) -> None: + def set_succeeded(self, test_vector: str) -> None: with self._lock: self._state.remove(test_vector) self._save_locked() @@ -1379,25 +1391,29 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: pending_configs = ctx.configs # Filter out already-tuned configs (unless --retune) - skipped_success = 0 + skipped_successful = 0 if not ctx.options.retune: pending_configs = [c for c in pending_configs if not cache.contains(c)] - skipped_success = len(ctx.configs) - len(pending_configs) + skipped_successful = len(ctx.configs) - len(pending_configs) - # Filter out failed/crashed configs (unless --retry-failed or --retune) - skipped_failed = 0 - if not ctx.options.retry_failed and not ctx.options.retune: + # Filter out unsuccessful configs (unless --retry or --retune) + skipped_unsuccessful = 0 + if not ctx.options.retune: before_filter = len(pending_configs) - pending_configs = [c for c in pending_configs if not state.should_skip(c)] - skipped_failed = before_filter - len(pending_configs) + pending_configs = [ + c for c in pending_configs if not state.should_skip(c, ctx.options.retry_states) + ] + skipped_unsuccessful = before_filter - len(pending_configs) - total_skipped = skipped_success + skipped_failed + total_skipped = skipped_successful + skipped_unsuccessful - if skipped_success > 0: - logger.info(f"Skipping {skipped_success} already tuned config(s)") - if skipped_failed > 0: + if skipped_successful > 0: logger.info( - f"Skipping {skipped_failed} failed/crashed config(s) - use '--retry-failed' to retune") + f"Skipping {skipped_successful} already tuned config(s) - use '--retune' to retune") + if skipped_unsuccessful > 0: + logger.info( + f"Skipping {skipped_unsuccessful} unsuccessful config(s) - use '--retry ' to retry" + ) if status_only: logger.info(f"{len(pending_configs)}/{len(ctx.configs)} config(s) pending tuning") @@ -1416,8 +1432,8 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: eta_tracker = ETATracker(total_configs=len(pending_configs), num_workers=num_workers, success_times=initial_times, - ok_count=skipped_success, - fail_count=skipped_failed) + ok_count=skipped_successful, + fail_count=skipped_unsuccessful) has_errors = False @@ -1452,7 +1468,7 @@ def execute_tuning_task(test_vector: str) -> 
TuningResult: result.elapsed_seconds = time.time() - start_time if result.success: - state_file.set_success(result.test_vector) + state_file.set_succeeded(result.test_vector) else: state_file.set_failed(result.test_vector) @@ -1754,10 +1770,12 @@ def parse_arguments(gpu_topology: GpuTopology, default=False, help="Force retuning of all configs, ignoring existing results in the output file") - parser.add_argument("--retry-failed", - action='store_true', - default=False, - help="Retry previously failed/crashed configs instead of skipping them") + parser.add_argument("--retry", + nargs='+', + choices=["failed", "crashed"], + default=[], + metavar='STATE', + help="Retry configs in specified states") parser.add_argument("--gpus", type=int, @@ -1844,7 +1862,7 @@ def main(args=None): output=parsed_args.output, abort_on_error=parsed_args.abort_on_error, retune=parsed_args.retune, - retry_failed=parsed_args.retry_failed, + retry_states=frozenset(ConfigState(s) for s in parsed_args.retry), gpu_ids=parsed_args.gpus, num_cpus=parsed_args.num_cpus, wait_for_compiles=parsed_args.wait_for_compiles) From c39a3d200af169a6979f24b1b27346b651eb94ee Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 2 Feb 2026 20:53:05 +0000 Subject: [PATCH 18/23] Add timeout option. --- mlir/utils/performance/tuningRunner.py | 49 ++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 8ffa4662ea76..1d575f5fa44f 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -171,6 +171,7 @@ class Options: gpu_ids: List[int] num_cpus: Optional[int] wait_for_compiles: bool + timeout: Optional[int] @dataclass @@ -178,6 +179,7 @@ class TuningResult: """Result of tuning a single configuration.""" test_vector: str success: bool + timed_out: bool = False gpu_id: int = -1 elapsed_seconds: float = 0.0 winning_config: Optional[str] = None @@ -311,6 +313,7 @@ class ConfigState(Enum): PENDING (implicit) -> RUNNING: Config starts tuning RUNNING -> SUCCEEDED (implicit): Tuning completes successfully (removed from state, written to output) RUNNING -> FAILED: Tuning completes with error + RUNNING -> TIMED_OUT: Tuning exceeded timeout RUNNING -> INTERRUPTED: User interrupted (Ctrl+C) during tuning RUNNING -> CRASHED: Detected on next startup (stale RUNNING state) -> PENDING: User requests retry with --retry @@ -321,12 +324,13 @@ class ConfigState(Enum): """ RUNNING = "running" # Currently being tuned FAILED = "failed" # Tuning completed with error + TIMED_OUT = "timed_out" # Tuning exceeded timeout INTERRUPTED = "interrupted" # User interrupted during tuning (Ctrl+C) CRASHED = "crashed" # Process crashed while tuning (detected on startup) # States representing unsuccessful tuning outcomes that are skipped by default -UNSUCCESSFUL_STATES = frozenset({ConfigState.FAILED, ConfigState.CRASHED}) +UNSUCCESSFUL_STATES = frozenset({ConfigState.FAILED, ConfigState.TIMED_OUT, ConfigState.CRASHED}) @dataclass @@ -344,6 +348,10 @@ def set_failed(self, test_vector: str) -> None: self.configs[test_vector] = ConfigState.FAILED self._pre_running_states.pop(test_vector, None) + def set_timed_out(self, test_vector: str) -> None: + self.configs[test_vector] = ConfigState.TIMED_OUT + self._pre_running_states.pop(test_vector, None) + def set_interrupted(self, test_vector: str) -> None: self.configs[test_vector] = ConfigState.INTERRUPTED self._pre_running_states.pop(test_vector, None) @@ -362,6 +370,9 @@ 
def is_empty(self) -> bool: def failed_count(self) -> int: return sum(1 for s in self.configs.values() if s == ConfigState.FAILED) + def timed_out_count(self) -> int: + return sum(1 for s in self.configs.values() if s == ConfigState.TIMED_OUT) + def crashed_count(self) -> int: return sum(1 for s in self.configs.values() if s == ConfigState.CRASHED) @@ -472,6 +483,11 @@ def set_failed(self, test_vector: str) -> None: self._state.set_failed(test_vector) self._save_locked() + def set_timed_out(self, test_vector: str) -> None: + with self._lock: + self._state.set_timed_out(test_vector) + self._save_locked() + def set_succeeded(self, test_vector: str) -> None: with self._lock: self._state.remove(test_vector) @@ -1309,8 +1325,17 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio gpu_logger.debug(f"Tuning '{test_vector}'\nCommand: {tuning_pipeline}") - # Note: communicate waits for process to terminate which might cause CI timeouts if tuning takes too long - tuning_stdout, tuning_stderr = tuning_driver.communicate() + try: + tuning_stdout, tuning_stderr = tuning_driver.communicate(timeout=options.timeout) + except subprocess.TimeoutExpired: + gpu_logger.error( + format_error(f"Tuning timed out after {options.timeout}s", + command=tuning_pipeline, + gpu_id=gpu_id)) + return TuningResult(test_vector=test_vector, + success=False, + timed_out=True, + gpu_id=gpu_id) raise_if_terminated(tuning_driver.returncode) @@ -1385,6 +1410,8 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") if state.crashed_count() > 0: logger.warning(f"Found {state.crashed_count()} crashed config(s) in state file") + if state.timed_out_count() > 0: + logger.warning(f"Found {state.timed_out_count()} timed out config(s) in state file") if state.failed_count() > 0: logger.warning(f"Found {state.failed_count()} failed config(s) in state file") @@ -1469,6 +1496,8 @@ def execute_tuning_task(test_vector: str) -> TuningResult: if result.success: state_file.set_succeeded(result.test_vector) + elif result.timed_out: + state_file.set_timed_out(result.test_vector) else: state_file.set_failed(result.test_vector) @@ -1489,7 +1518,8 @@ def execute_tuning_task(test_vector: str) -> TuningResult: debug_writer.write_result(result) else: has_errors = True - logger.error(f"Tuning failed for '{result.test_vector}' on GPU {result.gpu_id}") + logger.error( + f"Tuning unsuccessful for '{result.test_vector}' on GPU {result.gpu_id}") eta_tracker.record(result) progress_bar.update(1) @@ -1772,11 +1802,17 @@ def parse_arguments(gpu_topology: GpuTopology, parser.add_argument("--retry", nargs='+', - choices=["failed", "crashed"], + choices=["failed", "timed_out", "crashed"], default=[], metavar='STATE', help="Retry configs in specified states") + parser.add_argument("--timeout", + type=int, + default=None, + metavar='SECONDS', + help="Timeout in seconds for tuning each config") + parser.add_argument("--gpus", type=int, nargs='+', @@ -1865,7 +1901,8 @@ def main(args=None): retry_states=frozenset(ConfigState(s) for s in parsed_args.retry), gpu_ids=parsed_args.gpus, num_cpus=parsed_args.num_cpus, - wait_for_compiles=parsed_args.wait_for_compiles) + wait_for_compiles=parsed_args.wait_for_compiles, + timeout=parsed_args.timeout) ctx = TuningContext(configs=configs, conf_class=get_config_class(op_type), From 2b6e394a6d09feb8509903418a556b8335bf870d Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Tue, 3 Feb 2026 14:14:15 +0000 
Subject: [PATCH 19/23] Improve output file format. --- mlir/utils/performance/tuningRunner.py | 216 +++++++++++++++---------- 1 file changed, 135 insertions(+), 81 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 1d575f5fa44f..f8a905469634 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -38,8 +38,9 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import nullcontext from dataclasses import dataclass, field +from datetime import datetime, timezone from enum import Enum -from typing import Any, Dict, List, Optional +from typing import Dict, List, Optional from collections import deque import numpy as np @@ -157,13 +158,13 @@ class Options: tuning_space_kind: str quiet: bool verbose: bool - arch: str + chip: str + arch: str # Old arch value for backwards compatibility num_cu: int num_chiplets: int rocmlir_gen_flags: str verify_mode: str verify_perfconfigs: bool - tflops: bool output: str abort_on_error: bool retune: bool @@ -181,7 +182,8 @@ class TuningResult: success: bool timed_out: bool = False gpu_id: int = -1 - elapsed_seconds: float = 0.0 + duration_seconds: float = 0.0 + timestamp: Optional[str] = None winning_config: Optional[str] = None max_tflops: Optional[float] = None entries: List[Dict] = field(default_factory=list) @@ -392,7 +394,7 @@ class TuningStateFile: File format: { "contexts": { - "<arch>/<tuning_space>": { + "<arch>/<num_cu>/<num_chiplets>/<tuning_space>": { "test_vector_1": "failed", "test_vector_2": "crashed" } @@ -402,9 +404,10 @@ class TuningStateFile: If filepath is None, all operations are no-ops. """ - def __init__(self, filepath: Optional[str], arch: str, tuning_space: str): + def __init__(self, filepath: Optional[str], arch: str, num_cu: int, num_chiplets: int, + tuning_space: str): self.filepath = filepath - self.context_key = f"{arch}/{tuning_space}" + self.context_key = f"{arch}/{num_cu}/{num_chiplets}/{tuning_space}" self._lock = threading.Lock() self._all_contexts: Dict[str, Dict[str, str]] = {} # context_key -> {tv -> state_str} self._state = TuningState() @@ -539,8 +542,10 @@ def count(self) -> int: def from_output_file(cls, options: Options) -> 'TunedConfigsCache': """Load previously tuned configurations from an output TSV file. - Format: # arch\tnumCUs\tnumChiplets\ttestVector\tperfConfig (tuning_space)\t[TFlops]\telapsedSeconds - Only loads entries matching current arch and tuning space. + Format (new): # arch\tnumCUs\tnumChiplets\ttestVector\tperfConfig\tTFlops\ttuningSpace\tcommitId\ttimestamp\tdurationSec + Format (old): # arch\tnumCUs\tnumChiplets\ttestVector\tperfConfig (tuning_space)\t[TFlops] + + Only loads entries matching current arch, num_cu, num_chiplets, and tuning space.
""" if options.output == '-' or not os.path.exists(options.output): return cls() @@ -548,10 +553,9 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': results: Dict[str, TuningResult] = {} current_commit = get_git_commit_hash() + warned_commits: set = set() - # Active section state - metadata: Dict[str, Optional[Any]] = {} - matching_section = False + header_tuning_space: Optional[str] = None column_indices: Dict[str, int] = {} with open(options.output, mode='r') as f: @@ -560,40 +564,24 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': if not line: continue - # Check for metadata line - if line.startswith('## '): - parts = line[3:].split(':') - if len(parts) == 2: - metadata[parts[0].strip()] = parts[1].strip() - continue - # Check for header line if cls._is_header_line(line): - # Determine if this section matches based on tuning space - matching_section = f'({options.tuning_space_kind})' in line - if matching_section: - column_indices = cls._parse_header_line(line) - # Warn if commit hashes differ - file_commit = metadata.get('commit', 'unknown') - if file_commit != current_commit: - logger.warning( - f"Loading tuned configs from different commit (file: {file_commit[:8]}, current: {current_commit[:8]})" - ) - - # Reset metadata for next section - metadata = {} + column_indices = cls._parse_header_line(line) + # Extract tuning space from header for old format (perfConfig (tuning_space)) + header_tuning_space = cls._extract_tuning_space_from_header(line) continue - # Skip other comment lines + # Skip comment lines if line.startswith('#'): continue - # Skip data lines from non-matching sections - if not matching_section or not column_indices: + # Skip if we haven't seen a header yet + if not column_indices: continue # Parse data line - result = cls._parse_data_line(line.split('\t'), column_indices, options.arch) + result = cls._parse_data_line(line.split('\t'), column_indices, options, + header_tuning_space, current_commit, warned_commits) if result: results[result.test_vector] = result @@ -604,6 +592,13 @@ def _is_header_line(line: str) -> bool: """Check if line is a column header.""" return line.startswith('# arch\t') + @staticmethod + def _extract_tuning_space_from_header(line: str) -> Optional[str]: + """Extract tuning space from old format header like 'perfConfig (quick)' or 'TFlops (quick)'.""" + import re + match = re.search(r'\((\w+)\)', line) + return match.group(1) if match else None + @staticmethod def _parse_header_line(line: str) -> Dict[str, int]: """Parse column header and return name -> index mapping.""" @@ -621,12 +616,15 @@ def _parse_header_line(line: str) -> Dict[str, int]: return indices @staticmethod - def _parse_data_line(fields: List[str], column_indices: Dict[str, int], - arch: str) -> Optional[TuningResult]: + def _parse_data_line(fields: List[str], column_indices: Dict[str, int], options: Options, + header_tuning_space: Optional[str], current_commit: str, + warned_commits: set) -> Optional[TuningResult]: """Parse a data line and return TuningResult if valid. 
A line is valid if: - - arch matches current system + - arch matches current system (chip or arch for backwards compatibility) + - numCUs and numChiplets match current system + - tuning space matches (from column or header) - testVector is present - perfConfig is present and not 'None' """ @@ -637,7 +635,24 @@ def get_field(name: str) -> Optional[str]: return fields[idx] return None - if get_field('arch') != arch: + # Check arch match (new format uses chip, old format used arch) + file_arch = get_field('arch') + if file_arch != options.chip and file_arch != options.arch: + return None + + # Check numCUs match + file_num_cu = get_field('numCUs') + if file_num_cu and file_num_cu != str(options.num_cu): + return None + + # Check numChiplets match + file_num_chiplets = get_field('numChiplets') + if file_num_chiplets and file_num_chiplets != str(options.num_chiplets): + return None + + # Check tuning space match (new format has column, old format used header) + file_tuning_space = get_field('tuningSpace') or header_tuning_space + if file_tuning_space != options.tuning_space_kind: return None test_vector = get_field('testVector') @@ -648,6 +663,7 @@ def get_field(name: str) -> Optional[str]: if not perf_config or perf_config == 'None': return None + # TFlops (optional) max_tflops = None tflops_str = get_field('TFlops') if tflops_str: @@ -658,18 +674,31 @@ def get_field(name: str) -> Optional[str]: except ValueError: pass - elapsed_seconds = 0.0 - elapsed_str = get_field('elapsedSeconds') - if elapsed_str: + # Duration (optional) + duration_seconds = 0.0 + duration_str = get_field('durationSec') + if duration_str: try: - elapsed_seconds = float(elapsed_str) + duration_seconds = float(duration_str) except ValueError: pass + # Timestamp (optional) + timestamp = get_field('timestamp') + + # Warn if commit differs (avoid spamming for same commit) + file_commit = get_field('commitId') + if file_commit and file_commit != current_commit and file_commit not in warned_commits: + logger.warning( + f"Loading tuned configs from different commit (file: {file_commit[:8]}, current: {current_commit[:8]})" + ) + warned_commits.add(file_commit) + return TuningResult(test_vector=test_vector, success=True, gpu_id=-1, - elapsed_seconds=elapsed_seconds, + duration_seconds=duration_seconds, + timestamp=timestamp, winning_config=perf_config, max_tflops=max_tflops) @@ -688,7 +717,7 @@ def record(self, result: TuningResult) -> None: self._processed += 1 if result.success: self.ok_count += 1 - self.success_times.append(result.elapsed_seconds) + self.success_times.append(result.duration_seconds) else: self.fail_count += 1 @@ -859,19 +888,25 @@ def _set_memory_policy(self, numa_node: int) -> None: class OutputFileWriter: """Context manager for writing tuning results to TSV file.""" + HEADER_COLUMNS = [ + 'arch', 'numCUs', 'numChiplets', 'testVector', 'perfConfig', 'TFlops', 'tuningSpace', + 'commitId', 'timestamp', 'durationSec' + ] + EXPECTED_HEADER = "# " + "\t".join(HEADER_COLUMNS) + def __init__(self, filepath: str, options: Options): self.filepath = filepath self.options = options self.file = None self._header_written = False - self._is_appending = False def __enter__(self): if self.filepath == '-': self.file = sys.stdout else: - self._is_appending = os.path.exists(self.filepath) and os.path.getsize( - self.filepath) > 0 + if os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0: + if self._find_last_header() == self.EXPECTED_HEADER: + self._header_written = True self.file = open(self.filepath, 'a') 
return self @@ -879,41 +914,50 @@ def __exit__(self, exc_type, exc_value, traceback): if self.file and self.file != sys.stdout: self.file.close() - def _write_header(self): - if self._is_appending: - print("", file=self.file) # Blank line before new section + def _find_last_header(self, chunk_size: int = 8192) -> Optional[str]: + """Find the last header line by reading from the end of file.""" + with open(self.filepath, 'rb') as f: + f.seek(0, 2) # Seek to end + file_size = f.tell() + remaining = b'' - # Metadata comments - print(f"## commit: {get_git_commit_hash()}", file=self.file) + pos = file_size + while pos > 0: + read_size = min(chunk_size, pos) + pos -= read_size + f.seek(pos) + chunk = f.read(read_size) + remaining - # TSV header with '# ' prefix - columns = [ - 'arch', 'numCUs', 'numChiplets', 'testVector', - f'perfConfig ({self.options.tuning_space_kind})' - ] - if self.options.tflops: - columns.append('TFlops') - columns.append('elapsedSeconds') + lines = chunk.split(b'\n') + remaining = lines[0] - print("# " + "\t".join(columns), file=self.file) - self.file.flush() + for line in reversed(lines[1:]): + if line.startswith(b'# arch\t'): + return line.decode('utf-8') + + if remaining.startswith(b'# arch\t'): + return remaining.decode('utf-8') + return None + + def _write_header(self): + print(self.EXPECTED_HEADER, file=self.file) + self.file.flush() self._header_written = True def write_result(self, result: TuningResult): - assert result.success and result.winning_config and result.max_tflops, "write_result called with invalid result" + assert result.success and result.winning_config and result.max_tflops and result.timestamp and result.duration_seconds > 0.0, "write_result called with invalid result" if not self._header_written: self._write_header() fields = [ - self.options.arch, + self.options.chip, str(self.options.num_cu), - str(self.options.num_chiplets), result.test_vector, result.winning_config + str(self.options.num_chiplets), result.test_vector, result.winning_config, + str(result.max_tflops), self.options.tuning_space_kind, + get_git_commit_hash(), result.timestamp, f"{result.duration_seconds:.1f}" ] - if self.options.tflops: - fields.append(str(result.max_tflops)) - fields.append(f"{result.elapsed_seconds:.1f}") print("\t".join(fields), file=self.file) self.file.flush() @@ -1238,7 +1282,7 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa config.set_perfconfig(perfconfig) entry = config.table_entry(nano_seconds) if options.debug: - entry["Measurements"] = measurements + entry["MeasurementsMs"] = measurements entries.append(entry) if options.verify_perfconfigs and not np.isnan(nano_seconds): @@ -1402,7 +1446,8 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: cache = TunedConfigsCache.from_output_file(ctx.options) # Load state file - state_file = TuningStateFile(get_state_filepath(ctx.options.output), ctx.options.arch, + state_file = TuningStateFile(get_state_filepath(ctx.options.output), ctx.options.chip, + ctx.options.num_cu, ctx.options.num_chiplets, ctx.options.tuning_space_kind) state = state_file.state @@ -1455,7 +1500,9 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: ctx.print_gpu_summary(num_workers=num_workers) # Prepare ETA tracker with historical data - initial_times = [r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0] + initial_times = [ + r.duration_seconds for r in cache.get_all_results() if r.duration_seconds > 0.0 + ] eta_tracker = 
ETATracker(total_configs=len(pending_configs), num_workers=num_workers, success_times=initial_times, @@ -1464,8 +1511,12 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: has_errors = False + debug_enabled = ctx.options.debug and ctx.options.output != '-' + if ctx.options.debug and not debug_enabled: + logger.warning("Debug output disabled when writing to stdout") + with (OutputFileWriter(ctx.options.output, ctx.options) as results_writer, - DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext() as + DebugFileWriter(f"{ctx.options.output}.debug") if debug_enabled else nullcontext() as debug_writer): executor = None @@ -1488,11 +1539,13 @@ def execute_tuning_task(test_vector: str) -> TuningResult: state_file.set_running(test_vector) + timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') start_time = time.time() compile_threads = ctx.get_compile_threads(gpu_id) result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, compile_threads) - result.elapsed_seconds = time.time() - start_time + result.duration_seconds = time.time() - start_time + result.timestamp = timestamp if result.success: state_file.set_succeeded(result.test_vector) @@ -1784,10 +1837,11 @@ def parse_arguments(gpu_topology: GpuTopology, metavar='TYPE', help="Force a set of scale types for gemm tuning. Only used when --op=gemm.") - parser.add_argument("--tflops", - action='store_true', - default=False, - help="Include achieved TFLOPS in the output alongside the winning config") + parser.add_argument( + "--tflops", + action='store_true', + default=False, + help="[Deprecated, TFlops is always included] Include achieved TFLOPS in the output") parser.add_argument("--abort-on-error", action='store_true', @@ -1884,7 +1938,8 @@ def main(args=None): num_cu = perfRunner.get_num_cu(chip) num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) - options = Options(arch=arch, + options = Options(chip=chip, + arch=arch, num_cu=num_cu, num_chiplets=num_chiplets, debug=parsed_args.debug, @@ -1894,7 +1949,6 @@ def main(args=None): rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, verify_mode=parsed_args.verify_mode, verify_perfconfigs=parsed_args.verify_perf_configs, - tflops=parsed_args.tflops, output=parsed_args.output, abort_on_error=parsed_args.abort_on_error, retune=parsed_args.retune, From f1a885ffa49a5f99317ba089f6854d144a4f3248 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Tue, 3 Feb 2026 15:50:14 +0000 Subject: [PATCH 20/23] Simplify output file writing. 
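
The output writer now always appends and emits its fixed '# arch ...' header at most once per invocation, instead of scanning the existing file backwards for a previous header. A file accumulated over several runs can therefore contain the same header line more than once; readers should skip comment lines and filter rows by the metadata columns (tuningSpace, arch, numCUs, numChiplets) rather than rely on section boundaries, which is what TunedConfigsCache.from_output_file already does. A minimal reader sketch, assuming the ten-column layout of OutputFileWriter.HEADER_COLUMNS (illustrative only, not part of this patch):

    COLUMNS = ['arch', 'numCUs', 'numChiplets', 'testVector', 'perfConfig',
               'TFlops', 'tuningSpace', 'commitId', 'timestamp', 'durationSec']

    def load_results(path, tuning_space):
        """Return result rows for one tuning space from an appended TSV file."""
        rows = []
        with open(path) as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and the repeated '# ...' header/comment lines.
                if not line or line.startswith('#'):
                    continue
                fields = dict(zip(COLUMNS, line.split('\t')))
                if fields.get('tuningSpace') == tuning_space:
                    rows.append(fields)
        return rows
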
--- mlir/utils/performance/tuningRunner.py | 45 +++----------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index f8a905469634..405da8230f50 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -425,13 +425,9 @@ def _load(self) -> None: if not self.filepath or not os.path.exists(self.filepath): return - try: - with open(self.filepath, 'r') as f: - data = json.load(f) - self._all_contexts = data.get('contexts', {}) - except (json.JSONDecodeError, TypeError, OSError) as e: - logger.warning(f"Failed to load state file, starting fresh: {e}") - return + with open(self.filepath, 'r') as f: + data = json.load(f) + self._all_contexts = data['contexts'] # Process configs for active context with state transitions if self.context_key in self._all_contexts: @@ -892,7 +888,7 @@ class OutputFileWriter: 'arch', 'numCUs', 'numChiplets', 'testVector', 'perfConfig', 'TFlops', 'tuningSpace', 'commitId', 'timestamp', 'durationSec' ] - EXPECTED_HEADER = "# " + "\t".join(HEADER_COLUMNS) + HEADER = "# " + "\t".join(HEADER_COLUMNS) def __init__(self, filepath: str, options: Options): self.filepath = filepath @@ -904,9 +900,6 @@ def __enter__(self): if self.filepath == '-': self.file = sys.stdout else: - if os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0: - if self._find_last_header() == self.EXPECTED_HEADER: - self._header_written = True self.file = open(self.filepath, 'a') return self @@ -914,34 +907,8 @@ def __exit__(self, exc_type, exc_value, traceback): if self.file and self.file != sys.stdout: self.file.close() - def _find_last_header(self, chunk_size: int = 8192) -> Optional[str]: - """Find the last header line by reading from the end of file.""" - with open(self.filepath, 'rb') as f: - f.seek(0, 2) # Seek to end - file_size = f.tell() - remaining = b'' - - pos = file_size - while pos > 0: - read_size = min(chunk_size, pos) - pos -= read_size - f.seek(pos) - chunk = f.read(read_size) + remaining - - lines = chunk.split(b'\n') - remaining = lines[0] - - for line in reversed(lines[1:]): - if line.startswith(b'# arch\t'): - return line.decode('utf-8') - - if remaining.startswith(b'# arch\t'): - return remaining.decode('utf-8') - - return None - def _write_header(self): - print(self.EXPECTED_HEADER, file=self.file) + print(self.HEADER, file=self.file) self.file.flush() self._header_written = True @@ -972,7 +939,6 @@ def __init__(self, filepath: str): self._header_written = False def __enter__(self): - self._header_written = os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0 self.file = open(self.filepath, 'a') return self @@ -988,7 +954,6 @@ def write_result(self, result: TuningResult): header=not self._header_written, index=False) self.file.flush() - self._header_written = True From b2dc435fd9d6f10c0ff1f15039b79b7b54f457fd Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Thu, 5 Feb 2026 00:41:29 +0000 Subject: [PATCH 21/23] Address code review comments. 
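
Changes made in response to review: hoist the function-local 're' import to module level, share the output column names through a module-level OUTPUT_HEADER_COLUMNS constant, use typing.Tuple in annotations, rename find_best_perfconfig's parameter to tuning_output_lines, keep the full tuning stdout for error reports, track the stdin-backed configs file in stdin_temp_file, and replace assert statements used for user-facing validation with explicit exceptions, since asserts are removed when Python runs with -O. For example, the setup_logger check becomes:

    # Before: silently skipped under `python -O`
    assert not (quiet and verbose), "quiet and verbose are mutually exclusive"

    # After: always enforced, regardless of interpreter flags
    if quiet and verbose:
        raise ValueError("quiet and verbose are mutually exclusive")
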
--- mlir/utils/performance/tuningRunner.py | 73 ++++++++++++++++---------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 405da8230f50..bad88362b477 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -28,6 +28,7 @@ import json import logging import os +import re import signal import statistics import subprocess @@ -35,13 +36,13 @@ import tempfile import threading import time +from collections import deque from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import nullcontext from dataclasses import dataclass, field from datetime import datetime, timezone from enum import Enum -from typing import Dict, List, Optional -from collections import deque +from typing import Dict, List, Optional, Tuple import numpy as np import pandas as pd @@ -67,6 +68,11 @@ WARMUP_ITERATIONS = 1 SLEEP_US = 100 # 0.1 ms +OUTPUT_HEADER_COLUMNS = [ + 'arch', 'numCUs', 'numChiplets', 'testVector', 'perfConfig', 'TFlops', 'tuningSpace', + 'commitId', 'timestamp', 'durationSec' +] + # ============================================================================= # Logging Setup # ============================================================================= @@ -123,9 +129,10 @@ def process(self, msg, kwargs): return msg, kwargs -def setup_logger(quiet: bool = False, verbose: bool = False) -> logging.Logger: +def setup_logger(quiet: bool = False, verbose: bool = False) -> None: """Configure and return a logger for tuningRunner.""" - assert not (quiet and verbose), "quiet and verbose are mutually exclusive" + if quiet and verbose: + raise ValueError("quiet and verbose are mutually exclusive") if quiet: logger.setLevel(logging.ERROR) @@ -586,12 +593,12 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': @staticmethod def _is_header_line(line: str) -> bool: """Check if line is a column header.""" - return line.startswith('# arch\t') + header_prefix = f"# {OUTPUT_HEADER_COLUMNS[0]}\t" + return line.startswith(header_prefix) @staticmethod def _extract_tuning_space_from_header(line: str) -> Optional[str]: """Extract tuning space from old format header like 'perfConfig (quick)' or 'TFlops (quick)'.""" - import re match = re.search(r'\((\w+)\)', line) return match.group(1) if match else None @@ -884,11 +891,7 @@ def _set_memory_policy(self, numa_node: int) -> None: class OutputFileWriter: """Context manager for writing tuning results to TSV file.""" - HEADER_COLUMNS = [ - 'arch', 'numCUs', 'numChiplets', 'testVector', 'perfConfig', 'TFlops', 'tuningSpace', - 'commitId', 'timestamp', 'durationSec' - ] - HEADER = "# " + "\t".join(HEADER_COLUMNS) + HEADER = "# " + "\t".join(OUTPUT_HEADER_COLUMNS) def __init__(self, filepath: str, options: Options): self.filepath = filepath @@ -913,7 +916,14 @@ def _write_header(self): self._header_written = True def write_result(self, result: TuningResult): - assert result.success and result.winning_config and result.max_tflops and result.timestamp and result.duration_seconds > 0.0, "write_result called with invalid result" + if not result.success: + raise ValueError("write_result called with unsuccessful result") + if not result.winning_config: + raise ValueError("write_result called without winning_config") + if result.max_tflops is None: + raise ValueError("write_result called without max_tflops") + if not result.timestamp: + raise ValueError("write_result called without timestamp") if not 
self._header_written: self._write_header() @@ -947,7 +957,10 @@ def __exit__(self, exc_type, exc_value, traceback): self.file.close() def write_result(self, result: TuningResult): - assert result.success and result.entries, "write_result called with invalid result" + if not result.success: + raise ValueError("write_result called with unsuccessful result") + if not result.entries: + raise ValueError("write_result called without entries") pd.DataFrame(result.entries).to_csv(self.file, sep='\t', @@ -1208,9 +1221,9 @@ def verify_perfconfig(perfconfig: str, config: PerfConfiguration, paths: Paths, return nano_seconds -def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, paths: Paths, +def find_best_perfconfig(tuning_output_lines: List[str], config: PerfConfiguration, paths: Paths, options: Options, - gpu_id: int) -> tuple[Optional[str], Optional[float], List[Dict]]: + gpu_id: int) -> Tuple[Optional[str], Optional[float], List[Dict]]: """Parse tuning driver output and find the best performing perfconfig. Returns the winning config, its TFLOPS, and all entries. @@ -1221,7 +1234,7 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa winning_config: Optional[str] = None entries = [] - for line in tuning_output: + for line in tuning_output_lines: result = line.strip() if not result: continue @@ -1348,26 +1361,25 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio raise_if_terminated(tuning_driver.returncode) - tuning_output = tuning_stdout.decode('utf-8').splitlines() + tuning_output = tuning_stdout.decode('utf-8') tuning_errors = tuning_stderr.decode('utf-8') if tuning_driver.returncode != 0: gpu_logger.error( - format_error( - "Tuning pipeline failed", - command=tuning_pipeline, - stdout=tuning_output[-10:], # Last 10 lines of stdout - stderr=tuning_errors, - exit_code=tuning_driver.returncode, - gpu_id=gpu_id)) + format_error("Tuning pipeline failed", + command=tuning_pipeline, + stdout=tuning_output, + stderr=tuning_errors, + exit_code=tuning_driver.returncode, + gpu_id=gpu_id)) return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) else: # Log any stderr output from tuning driver because it may contain warnings if tuning_errors.strip(): gpu_logger.warning(f"rocmlir-tuning-driver stderr:\n{tuning_errors}") - winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, - options, gpu_id) + winning_config, max_tflops, entries = find_best_perfconfig(tuning_output.splitlines(), + config, paths, options, gpu_id) except TuningError as e: gpu_logger.error(str(e)) return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) @@ -1636,7 +1648,8 @@ def get_config_class(op_type: Operation) -> type: Operation.CONV_GEMM: ConvGemmConfiguration, } - assert op_type in config_classes, f"No config class for operation: {str(op_type)}" + if op_type not in config_classes: + raise ValueError(f"No config class for operation: {str(op_type)}") return config_classes[op_type] @@ -1669,7 +1682,8 @@ def load_configs(op_type: Operation, parsed_args: argparse.Namespace, paths: Pat lambda: perfRunner.get_conv_gemm_configurations(paths.configuration_file_path), } - assert op_type in loaders, f"No config loader for operation: {str(op_type)}" + if op_type not in loaders: + raise ValueError(f"No config loader for operation: {str(op_type)}") return loaders[op_type]() @@ -1882,7 +1896,8 @@ def main(args=None): # Handle stdin for configs file stdin_temp_file = None if 
parsed_args.configs_file == '-': - parsed_args.configs_file = load_configs_from_stdin() + stdin_temp_file = load_configs_from_stdin() + parsed_args.configs_file = stdin_temp_file try: paths = resolve_paths(op_type, parsed_args) From d964457a8c975c62c2575ba56e7458ba54e91db4 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Thu, 5 Feb 2026 01:00:27 +0000 Subject: [PATCH 22/23] Use llvm dbgs instead of errs where appropriate. --- mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp index 78513dc5b16f..8bf7ab8b73d2 100644 --- a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp +++ b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp @@ -28,12 +28,15 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/LogicalResult.h" #include #include +#define DEBUG_TYPE "rock-tuning-parameter" + // Found experimentally, might need to change it if we add more params to the // tuning space #define NUM_RANDOM_PERFCONFIGS_PER_TILE_SIZE 50 @@ -440,8 +443,8 @@ static void createGemmGemmTuningRangeBF(TuningParamSet *newSpace, // int64_t numEUPerCU = // rock::lookupArchInfo(rock::getArchValue(gemmGemmOp)).numEUPerCU; bool isWMMA = archInfo.isWmma(gemmGemmOp); - llvm::errs() << "isWMMA: " << isWMMA << "\n"; - llvm::errs() << "features: " << features << "\n"; + LLVM_DEBUG(llvm::dbgs() << "isWMMA: " << isWMMA << "\n"); + LLVM_DEBUG(llvm::dbgs() << "features: " << features << "\n"); if (!archInfo.isAccel(gemmGemmOp)) { // We only support GPUs with matrix accelerator extensions return; @@ -620,7 +623,7 @@ static void createGemmTuningRangeBF(TuningParamSet *newSpace, int64_t outputSwizzle{2}, wavesPerEU{0}, gridGroupSize{0}; OpBuilder b(gemmOp.getContext()); if (archInfo.isAccel(gemmOp)) { - llvm::errs() << "createGemmTuningRangeBF: accel\n"; + LLVM_DEBUG(llvm::dbgs() << "createGemmTuningRangeBF: accel\n"); for (uint32_t gemmMPerBlock : accelParams[0]) { SmallVector mPerWaveRange = computeDPerWave(kind, gemmMPerBlock, waveSize); @@ -665,7 +668,7 @@ static void createGemmTuningRangeBF(TuningParamSet *newSpace, } } } else { - llvm::errs() << "createGemmTuningRangeBF: non-accel\n"; + LLVM_DEBUG(llvm::dbgs() << "createGemmTuningRangeBF: non-accel\n"); // Non-accel PopulateParams tuningInfo; for (uint32_t blockSize : validRangeGeneralGemmParams[0]) { From 408100adb16cfc191ad9293023f325a4860e2305 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Thu, 5 Feb 2026 14:40:27 +0000 Subject: [PATCH 23/23] Warn if env vars are set. 
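
set_isolated_gpu_env() is called early so that GPU discovery through perfRunner stays consistent, and it overwrites ROCR_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES in the process; any values the user exported were previously ignored without any indication. Capture the user's values before they are overwritten and, once logging is configured, warn that GPU selection should go through '--gpus' instead. Condensed excerpt from the diff below showing the relevant ordering:

    # Read the user's values before set_isolated_gpu_env() replaces them ...
    user_rocr_visible = os.environ.get("ROCR_VISIBLE_DEVICES")
    user_hip_visible = os.environ.get("HIP_VISIBLE_DEVICES")
    set_isolated_gpu_env(os.environ, available_gpus[0])

    # ... and warn only after setup_logger() has configured logging.
    if user_rocr_visible or user_hip_visible:
        logger.warning("Ignoring ... Use '--gpus' to select specific GPUs.")
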
--- mlir/utils/performance/tuningRunner.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index bad88362b477..7aa804c3a434 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -24,6 +24,7 @@ """ import argparse +import functools import glob import json import logging @@ -924,6 +925,8 @@ def write_result(self, result: TuningResult): raise ValueError("write_result called without max_tflops") if not result.timestamp: raise ValueError("write_result called without timestamp") + if result.duration_seconds <= 0.0: + raise ValueError("write_result called with invalid duration_seconds") if not self._header_written: self._write_header() @@ -1028,13 +1031,14 @@ def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, values) +@functools.lru_cache(maxsize=1) def get_git_commit_hash() -> str: """Get the current git commit hash.""" try: return subprocess.check_output(['git', 'rev-parse', 'HEAD'], stderr=subprocess.DEVNULL).decode().strip() except (subprocess.CalledProcessError, FileNotFoundError, OSError) as e: - logger.debug(f"Failed to get git commit hash: {e}") + logger.warning(f"Failed to get git commit hash: {e}") return "unknown" @@ -1883,6 +1887,10 @@ def main(args=None): gpu_topology = GpuTopology.discover() available_gpus = sorted(gpu_topology.gpus.keys()) + # Capture these before set_isolated_gpu_env overwrites them + user_rocr_visible = os.environ.get("ROCR_VISIBLE_DEVICES") + user_hip_visible = os.environ.get("HIP_VISIBLE_DEVICES") + # We call into perfRunner which also queries GPU info using HIP and rocminfo. # To ensure consistency, we isolate the process to the first available GPU. set_isolated_gpu_env(os.environ, available_gpus[0]) @@ -1891,6 +1899,16 @@ def main(args=None): setup_logger(quiet=parsed_args.quiet, verbose=parsed_args.verbose) + if user_rocr_visible or user_hip_visible: + vars_set = [] + if user_rocr_visible: + vars_set.append(f"ROCR_VISIBLE_DEVICES={user_rocr_visible}") + if user_hip_visible: + vars_set.append(f"HIP_VISIBLE_DEVICES={user_hip_visible}") + logger.warning( + f"Ignoring {' and '.join(vars_set)}. " + f"This script manages GPU visibility internally. Use '--gpus' to select specific GPUs.") + op_type = Operation.from_name(parsed_args.op) # Handle stdin for configs file