From 9c0838e6096154654f413c5226e3bd3c650aba40 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 10 Jan 2026 00:14:42 +0000 Subject: [PATCH 01/23] Improve time and metadata tracking, output file format has changed. --- mlir/utils/performance/tuningRunner.py | 382 +++++++++++++++++++------ 1 file changed, 300 insertions(+), 82 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index d4c077b18c91..6a8f4fca8115 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -13,10 +13,12 @@ import argparse import glob import os +import statistics import subprocess import sys import tempfile import threading +import time from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import nullcontext from dataclasses import dataclass, field @@ -40,9 +42,14 @@ PerfConfiguration, ) +# ============================================================================= +# Constants +# ============================================================================= + MLIR_N_REPEATS = 10 WARMUP_ITERATIONS = 1 SLEEP_US = 100 # 0.1 ms +MAX_FAILURES = 10 # ============================================================================= # Configuration & Results @@ -75,7 +82,8 @@ class TuningResult: """Result of tuning a single configuration.""" test_vector: str success: bool - gpu_id: Optional[int] = None + gpu_id: int + elapsed_seconds: float winning_config: Optional[str] = None max_tflops: Optional[float] = None entries: List[Dict] = field(default_factory=list) @@ -227,85 +235,201 @@ def get(self, test_vector: str) -> Optional[TuningResult]: """Get cached result for a test vector.""" return self._results.get(test_vector) + def get_all_results(self) -> List[TuningResult]: + """Get all cached tuning results.""" + return list(self._results.values()) + def count(self) -> int: """Return number of cached configurations.""" return len(self._results) @classmethod - def from_output_file(cls, - filepath: str, - tuning_space_kind: str, - quiet: bool = False) -> 'TunedConfigsCache': + def from_output_file(cls, options: Options) -> 'TunedConfigsCache': """Load previously tuned configurations from an output TSV file. - The output file has the following structure: - - Commit lines starting with '# commit: ' indicating the git commit hash of the tuning run - - Header lines starting with '# ' containing tuning space kind in parentheses - (e.g., '# arch\tnumCUs\ttestVector\tperfConfig (quick)\tTFlops') - - Multiple commit and header sections can exist in the same file from different tuning runs - - Data lines with tab-separated fields following each header - - Error lines starting with '### ' indicating errors during tuning - - Only data lines under headers matching options.tuning_space_kind are loaded. - For example, if options.tuning_space_kind='quick', only data under headers containing '(quick)' - will be loaded, ignoring '(full)' or other sections. + Supports both old and new file formats: + - Old format: header starts with '# '; tuning space embedded in column name (e.g., perfConfig (quick)) + - New format: proper tsv header (no #); metadata in ## comments before header + + Only data lines that match the current tuning space, arch, and numCUs are loaded. 
""" cache = cls() - if filepath == '-' or not os.path.exists(filepath): + if options.output == '-' or not os.path.exists(options.output): return cache current_commit = get_git_commit_hash() - file_commit = current_commit - matching_tuning_space = False + + # Pending metadata + file_commit: Optional[str] = None + file_tuning_space: Optional[str] = None + file_arch: Optional[str] = None + file_num_cu: Optional[int] = None + + # Active section state + matching_section = False + column_indices: Dict[str, int] = {} try: - with open(filepath, mode='r') as f: + with open(options.output, mode='r') as f: for line in f: line = line.strip() if not line: continue - # Track commit hash for warning about stale results - if line.startswith('# commit: '): - file_commit = line[len('# commit: '):].strip() + # Check for metadata line + if line.startswith('## '): + key_value = line[3:] + if ':' in key_value: + key, value = key_value.split(':', 1) + key = key.strip() + value = value.strip() + if key == 'commit': + file_commit = value + elif key == 'tuningSpace': + file_tuning_space = value + elif key == 'arch': + file_arch = value + elif key == 'numCUs': + try: + file_num_cu = int(value) + except ValueError: + pass continue - # Check if this section header matches our tuning space - if line.startswith('# '): - matching_tuning_space = f"({tuning_space_kind})" in line - if matching_tuning_space and file_commit != current_commit and not quiet: - print( - f"Warning: Loading tuned configs from different commit " - f"(file: {file_commit[:12]}, current: {current_commit[:12]})", - file=sys.stderr) + # Check for header line + if cls._is_header_line(line): + # Determine if this section matches based on metadata or old format + if file_tuning_space is not None: + # New format: use metadata + matching_section = (file_tuning_space == options.tuning_space_kind and + (file_arch is None or file_arch == options.arch) and + (file_num_cu is None or + file_num_cu == options.num_cu)) + elif f'({options.tuning_space_kind})' in line: + # Old format: tuning space embedded in header + matching_section = True + else: + matching_section = False + + if matching_section: + column_indices = cls._parse_header_line(line) + if file_commit and file_commit != current_commit and not options.quiet: + print( + f"Warning: Loading tuned configs from different commit " + f"(file: {file_commit[:8]}, current: {current_commit[:8]})", + file=sys.stderr) + + # Reset pending metadata for next section + file_commit = None + file_tuning_space = None + file_arch = None + file_num_cu = None continue - # Skip error lines and lines from non-matching sections - if line.startswith('### ') or not matching_tuning_space: + # Skip other comment lines + if line.startswith('#'): continue - # Parse data line - fields = line.split('\t') - if len(fields) < 4: + # Skip data lines from non-matching sections + if not matching_section or not column_indices: continue - test_vector = fields[2] - perf_config = fields[3] if fields[3] else None - tflops_value = float(fields[4]) if len(fields) > 4 and fields[4] else None + # Parse data line + result = cls._parse_data_line(line.split('\t'), column_indices, options.arch, + options.num_cu) + if result: + cache._results[result.test_vector] = result - if perf_config and perf_config != "None": - cache._results[test_vector] = TuningResult(test_vector=test_vector, - success=True, - winning_config=perf_config, - max_tflops=tflops_value) except Exception as e: - if not quiet: - print(f"Warning: Failed to load existing tuning results from 
{filepath}: {e}", + if not options.quiet: + print(f"Warning: Failed to load existing tuning results from {options.output}: {e}", file=sys.stderr) return cache + @staticmethod + def _is_header_line(line: str) -> bool: + """Check if line is a column header (old or new format).""" + # Old format: '# arch\t...' + if line.startswith('# '): + return line[2:].startswith('arch\t') + # New format: 'testVector\t...' + return line.startswith('testVector\t') + + @staticmethod + def _parse_header_line(line: str) -> Dict[str, int]: + """Parse column header and return name -> index mapping.""" + header_text = line[2:] if line.startswith('# ') else line + indices = {} + for i, col in enumerate(header_text.split('\t')): + if col: + indices[col.split()[0]] = i + return indices + + @staticmethod + def _parse_data_line(fields: List[str], column_indices: Dict[str, int], arch: str, + num_cu: int) -> Optional[TuningResult]: + """Parse a data line and return TuningResult if valid. + + A line is valid if: + - arch and numCUs match current system (if columns exist, for old format) + - testVector is present + - perfConfig is present and not 'None' + - TFlops is a valid finite number (if column exists) + """ + + def get_field(name: str) -> Optional[str]: + idx = column_indices.get(name) + if idx is not None and idx < len(fields) and fields[idx]: + return fields[idx] + return None + + # Old format: arch and numCUs are columns + if 'arch' in column_indices: + if get_field('arch') != arch: + return None + + if 'numCUs' in column_indices: + if get_field('numCUs') != str(num_cu): + return None + + test_vector = get_field('testVector') + if not test_vector: + return None + + perf_config = get_field('perfConfig') + if not perf_config or perf_config == 'None': + return None + + max_tflops = None + if 'TFlops' in column_indices: + tflops_str = get_field('TFlops') + if not tflops_str: + return None + try: + tflops_val = float(tflops_str) + if np.isnan(tflops_val) or np.isinf(tflops_val): + return None + max_tflops = tflops_val + except ValueError: + return None + + elapsed_seconds = 0.0 + elapsed_str = get_field('elapsedSeconds') + if elapsed_str: + try: + elapsed_seconds = float(elapsed_str) + except ValueError: + pass + + return TuningResult(test_vector=test_vector, + success=True, + gpu_id=-1, + elapsed_seconds=elapsed_seconds, + winning_config=perf_config, + max_tflops=max_tflops) + @dataclass class TuningContext: @@ -436,6 +560,66 @@ def _set_memory_policy(self, numa_node: int) -> None: pass # libnuma not available, rely on first-touch policy +@dataclass +class ETATracker: + """Track completion times for accurate ETA estimation using median of successful configs.""" + total_configs: int + num_workers: int + initial_times: List[float] = field(default_factory=list) + initial_ok_count: int = 0 + _success_times: List[float] = field(default_factory=list, init=False) + _processed: int = field(default=0, init=False) + _ok_count: int = field(default=0, init=False) + _fail_count: int = field(default=0, init=False) + + def __post_init__(self): + self._success_times = list(self.initial_times) + self._ok_count = self.initial_ok_count + + def record(self, result: TuningResult) -> None: + self._processed += 1 + if result.success: + self._ok_count += 1 + self._success_times.append(result.elapsed_seconds) + else: + self._fail_count += 1 + + def _format_rate(self, seconds: float) -> str: + if seconds < 60: + return f"{seconds:.1f}s/cfg" + elif seconds < 3600: + return f"{seconds / 60:.1f}m/cfg" + else: + return f"{seconds / 
3600:.1f}h/cfg" + + def _format_eta(self, seconds: float) -> str: + if seconds < 60: + return "<1m" + elif seconds < 3600: + return f"{int(seconds // 60)}m" + elif seconds < 86400: + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + return f"{hours}h{minutes}m" + else: + days = int(seconds // 86400) + hours = int((seconds % 86400) // 3600) + return f"{days}d{hours}h" + + def get_postfix_str(self) -> str: + remaining = self.total_configs - self._processed + + rate = "n/a" + eta = "n/a" + if len(self._success_times) >= 3: + median = statistics.median(self._success_times) + eta_seconds = (remaining / self.num_workers) * median + rate = self._format_rate(median) + eta = self._format_eta(eta_seconds) + + return f"ok={self._ok_count}, fail={self._fail_count}, rate={rate}, eta={eta}" + + # ============================================================================= # Output Writers # ============================================================================= @@ -449,11 +633,14 @@ def __init__(self, filepath: str, options: Options): self.options = options self.file = None self.header_written = False + self._is_appending = False def __enter__(self): if self.filepath == '-': self.file = sys.stdout else: + self._is_appending = os.path.exists(self.filepath) and os.path.getsize( + self.filepath) > 0 self.file = open(self.filepath, 'a') return self @@ -465,15 +652,23 @@ def _write_header(self): if self.header_written: return - commit_hash = get_git_commit_hash() - print(f"# commit: {commit_hash}", file=self.file) - columns = [ - 'arch', 'numCUs', 'numChiplets', 'testVector', - f'perfConfig ({self.options.tuning_space_kind})' - ] + # Add a blank line if appending + if self._is_appending: + print("", file=self.file) + + # Metadata comments + print(f"## commit: {get_git_commit_hash()}", file=self.file) + print(f"## tuningSpace: {self.options.tuning_space_kind}", file=self.file) + print(f'## arch: {self.options.arch}', file=self.file) + print(f'## numCUs: {self.options.num_cu}', file=self.file) + print(f'## numChiplets: {self.options.num_chiplets}', file=self.file) + + # TSV header + columns = ['testVector', 'perfConfig'] if self.options.tflops: columns.append('TFlops') - print("# " + "\t".join(columns), file=self.file) + columns.append('elapsedSeconds') + print("\t".join(columns), file=self.file) self.file.flush() self.header_written = True @@ -481,13 +676,10 @@ def _write_header(self): def write_result(self, result: TuningResult): self._write_header() - fields = [ - self.options.arch, - str(self.options.num_cu), - str(self.options.num_chiplets), result.test_vector, result.winning_config or "" - ] + fields = [result.test_vector, result.winning_config or ""] if self.options.tflops: fields.append(f"{result.max_tflops}" if result.max_tflops else "") + fields.append(f"{result.elapsed_seconds:.1f}") print("\t".join(fields), file=self.file) self.file.flush() @@ -564,6 +756,15 @@ def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, values) +def ensure_tsv_extension(filepath: str) -> str: + """Ensure filepath has .tsv extension, unless it's stdout.""" + if filepath == '-': + return filepath + if not filepath.endswith('.tsv'): + return filepath + '.tsv' + return filepath + + def get_git_commit_hash() -> str: """Get the current git commit hash.""" try: @@ -895,8 +1096,7 @@ def tune_configs(ctx: TuningContext) -> bool: # Load cached results unless retuning is forced cache = TunedConfigsCache() if not ctx.options.retune: - cache = 
TunedConfigsCache.from_output_file(ctx.options.output, - ctx.options.tuning_space_kind, ctx.options.quiet) + cache = TunedConfigsCache.from_output_file(ctx.options) if cache.count() > 0 and not ctx.options.quiet: print(f"Found {cache.count()} tuned config(s) in {ctx.options.output}", file=sys.stderr) @@ -916,30 +1116,36 @@ def tune_configs(ctx: TuningContext) -> bool: ctx.print_gpu_summary() def execute_tuning_task(test_vector: str) -> TuningResult: - try: - gpu_id = pool.acquire_gpu_for_thread() - compile_threads = ctx.get_compile_threads(gpu_id) - result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, - compile_threads) - return TuningResult(test_vector=test_vector, - success=result.get('success', False), - gpu_id=gpu_id, - winning_config=result.get('winning_config'), - max_tflops=result.get('max_tflops'), - entries=result.get('entries', []), - verify_tflops=result.get('verify_tflops'), - error=result.get('error')) - except Exception as e: - return TuningResult(test_vector=test_vector, success=False, error=str(e)) + gpu_id = pool.acquire_gpu_for_thread() + start_time = time.time() + compile_threads = ctx.get_compile_threads(gpu_id) + result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, + compile_threads) + return TuningResult(test_vector=test_vector, + success=result.get('success', False), + gpu_id=gpu_id, + elapsed_seconds=time.time() - start_time, + winning_config=result.get('winning_config'), + max_tflops=result.get('max_tflops'), + entries=result.get('entries', []), + verify_tflops=result.get('verify_tflops'), + error=result.get('error')) executor = None progress_bar = None - has_errors = False with OutputFileWriter(ctx.options.output, ctx.options) as results_writer: with DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext( ) as debug_writer: try: # No context manager for executor because we need to shutdown with wait=False + initial_times = [ + r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0 + ] + eta_tracker = ETATracker(total_configs=len(pending_configs), + num_workers=num_workers, + initial_times=initial_times, + initial_ok_count=cache.count()) + progress_bar = tqdm( total=len(ctx.configs), initial=skipped_count, @@ -947,7 +1153,10 @@ def execute_tuning_task(test_vector: str) -> TuningResult: file=sys.stderr, desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", unit="config", - leave=False) + leave=False, + bar_format= + '{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [t={elapsed}{postfix}]') + progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) executor = ThreadPoolExecutor(max_workers=num_workers) pending_futures = { @@ -955,19 +1164,22 @@ def execute_tuning_task(test_vector: str) -> TuningResult: for test_vector in pending_configs } + has_errors = False + consecutive_failures = 0 + for completed_future in as_completed(pending_futures): result = completed_future.result() if result.success: + consecutive_failures = 0 results_writer.write_result(result) if debug_writer: debug_writer.write_entries(result.entries) - progress_bar.update(1) else: has_errors = True + consecutive_failures += 1 error_text = result.error or "Unknown error" - gpu_prefix = f"[GPU {result.gpu_id}] " if result.gpu_id is not None else "" - formatted_error = f"{gpu_prefix}Error tuning {result.test_vector}\n" + '\n'.join( + formatted_error = f"[GPU {result.gpu_id}] Error tuning {result.test_vector}\n" + '\n'.join( f"\t{line}" for line in 
error_text.splitlines()) print(formatted_error, file=sys.stderr) results_writer.write_error(formatted_error) @@ -975,7 +1187,13 @@ def execute_tuning_task(test_vector: str) -> TuningResult: if ctx.options.abort_on_error: return False - progress_bar.refresh() + if consecutive_failures >= MAX_FAILURES: + print("Aborting due to too many consecutive failures", file=sys.stderr) + return False + + eta_tracker.record(result) + progress_bar.update(1) + progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) if has_errors: print("Encountered errors during tuning", file=sys.stderr) @@ -1272,7 +1490,7 @@ def main(args=None): verify_mode=parsed_args.verify_mode, verify_perfconfigs=parsed_args.verify_perf_configs, tflops=parsed_args.tflops, - output=parsed_args.output, + output=ensure_tsv_extension(parsed_args.output), abort_on_error=parsed_args.abort_on_error, retune=parsed_args.retune, gpu_ids=parsed_args.gpus, From 9f6c671780600a9ff61b15d5a39fb23a29cc8b68 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 10 Jan 2026 01:11:18 +0000 Subject: [PATCH 02/23] Add support for stdin. --- mlir/utils/performance/tuningRunner.py | 225 +++++++++++++------------ 1 file changed, 121 insertions(+), 104 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 6a8f4fca8115..9a95b6ca739d 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -431,6 +431,66 @@ def get_field(name: str) -> Optional[str]: max_tflops=max_tflops) +@dataclass +class ETATracker: + """Track completion times for accurate ETA estimation using median of successful configs.""" + total_configs: int + num_workers: int + initial_times: List[float] = field(default_factory=list) + initial_ok_count: int = 0 + _success_times: List[float] = field(default_factory=list, init=False) + _processed: int = field(default=0, init=False) + _ok_count: int = field(default=0, init=False) + _fail_count: int = field(default=0, init=False) + + def __post_init__(self): + self._success_times = list(self.initial_times) + self._ok_count = self.initial_ok_count + + def record(self, result: TuningResult) -> None: + self._processed += 1 + if result.success: + self._ok_count += 1 + self._success_times.append(result.elapsed_seconds) + else: + self._fail_count += 1 + + def _format_rate(self, seconds: float) -> str: + if seconds < 60: + return f"{seconds:.1f}s/cfg" + elif seconds < 3600: + return f"{seconds / 60:.1f}m/cfg" + else: + return f"{seconds / 3600:.1f}h/cfg" + + def _format_eta(self, seconds: float) -> str: + if seconds < 60: + return "<1m" + elif seconds < 3600: + return f"{int(seconds // 60)}m" + elif seconds < 86400: + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + return f"{hours}h{minutes}m" + else: + days = int(seconds // 86400) + hours = int((seconds % 86400) // 3600) + return f"{days}d{hours}h" + + def get_postfix_str(self) -> str: + remaining = self.total_configs - self._processed + + rate = "n/a" + eta = "n/a" + if len(self._success_times) >= 3: + median = statistics.median(self._success_times) + eta_seconds = (remaining / self.num_workers) * median + rate = self._format_rate(median) + eta = self._format_eta(eta_seconds) + + return f"ok={self._ok_count}, fail={self._fail_count}, rate={rate}, eta={eta}" + + @dataclass class TuningContext: """Encapsulates all state and configuration needed for tuning operations.""" @@ -560,66 +620,6 @@ def _set_memory_policy(self, numa_node: int) -> None: pass # libnuma not available, rely on 
first-touch policy -@dataclass -class ETATracker: - """Track completion times for accurate ETA estimation using median of successful configs.""" - total_configs: int - num_workers: int - initial_times: List[float] = field(default_factory=list) - initial_ok_count: int = 0 - _success_times: List[float] = field(default_factory=list, init=False) - _processed: int = field(default=0, init=False) - _ok_count: int = field(default=0, init=False) - _fail_count: int = field(default=0, init=False) - - def __post_init__(self): - self._success_times = list(self.initial_times) - self._ok_count = self.initial_ok_count - - def record(self, result: TuningResult) -> None: - self._processed += 1 - if result.success: - self._ok_count += 1 - self._success_times.append(result.elapsed_seconds) - else: - self._fail_count += 1 - - def _format_rate(self, seconds: float) -> str: - if seconds < 60: - return f"{seconds:.1f}s/cfg" - elif seconds < 3600: - return f"{seconds / 60:.1f}m/cfg" - else: - return f"{seconds / 3600:.1f}h/cfg" - - def _format_eta(self, seconds: float) -> str: - if seconds < 60: - return "<1m" - elif seconds < 3600: - return f"{int(seconds // 60)}m" - elif seconds < 86400: - hours = int(seconds // 3600) - minutes = int((seconds % 3600) // 60) - return f"{hours}h{minutes}m" - else: - days = int(seconds // 86400) - hours = int((seconds % 86400) // 3600) - return f"{days}d{hours}h" - - def get_postfix_str(self) -> str: - remaining = self.total_configs - self._processed - - rate = "n/a" - eta = "n/a" - if len(self._success_times) >= 3: - median = statistics.median(self._success_times) - eta_seconds = (remaining / self.num_workers) * median - rate = self._format_rate(median) - eta = self._format_eta(eta_seconds) - - return f"ok={self._ok_count}, fail={self._fail_count}, rate={rate}, eta={eta}" - - # ============================================================================= # Output Writers # ============================================================================= @@ -1218,8 +1218,10 @@ def resolve_paths(op_type: Operation, parsed_args) -> Paths: """Resolve paths based on operation type and arguments.""" if op_type == Operation.FUSION: configs_path = "./fusion_config_file" + elif parsed_args.config: + configs_path = None else: - configs_path = None if parsed_args.config else parsed_args.configs_file + configs_path = parsed_args.configs_file return perfRunner.create_paths(configs_path, parsed_args.mlir_build_dir) @@ -1277,6 +1279,15 @@ def get_config_class(op_type: Operation) -> type: return config_classes.get(op_type, PerfConfiguration) +def load_configs_from_stdin() -> str: + """Read configs from stdin and return path to a temporary file.""" + content = sys.stdin.read() + fd, path = tempfile.mkstemp(suffix='.txt', prefix='tuning_configs_') + with os.fdopen(fd, 'w') as f: + f.write(content) + return path + + def load_configs(op_type: Operation, parsed_args, paths: Paths) -> List[str]: """Load configurations based on operation type and arguments.""" if parsed_args.config: @@ -1468,58 +1479,64 @@ def main(args=None): parsed_args = parse_arguments(gpu_topology, available_gpus, args) - op_type = Operation.from_name(parsed_args.op) - paths = resolve_paths(op_type, parsed_args) - - if not paths.mlir_paths: - print("rocMLIR build dir was not provided/found", file=sys.stderr) - return 1 - - arch = perfRunner.get_arch() - chip = perfRunner.get_chip() - num_cu = perfRunner.get_num_cu(chip) - num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) - - options = Options(arch=arch, - num_cu=num_cu, - 
num_chiplets=num_chiplets, - debug=parsed_args.debug, - quiet=parsed_args.quiet, - tuning_space_kind=parsed_args.tuning_space, - rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, - verify_mode=parsed_args.verify_mode, - verify_perfconfigs=parsed_args.verify_perf_configs, - tflops=parsed_args.tflops, - output=ensure_tsv_extension(parsed_args.output), - abort_on_error=parsed_args.abort_on_error, - retune=parsed_args.retune, - gpu_ids=parsed_args.gpus, - num_cpus=parsed_args.num_cpus, - wait_for_compiles=parsed_args.wait_for_compiles) - - if op_type == Operation.FUSION: - op_type = extract_fusion_configs(parsed_args.test_dir, paths, options) - + stdin_temp_file = None try: + # Handle stdin for configs file + if parsed_args.configs_file == '-': + stdin_temp_file = load_configs_from_stdin() + parsed_args.configs_file = stdin_temp_file + + op_type = Operation.from_name(parsed_args.op) + paths = resolve_paths(op_type, parsed_args) + + if not paths.mlir_paths: + print("rocMLIR build dir was not provided/found", file=sys.stderr) + return 1 + + arch = perfRunner.get_arch() + chip = perfRunner.get_chip() + num_cu = perfRunner.get_num_cu(chip) + num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) + + options = Options(arch=arch, + num_cu=num_cu, + num_chiplets=num_chiplets, + debug=parsed_args.debug, + quiet=parsed_args.quiet, + tuning_space_kind=parsed_args.tuning_space, + rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, + verify_mode=parsed_args.verify_mode, + verify_perfconfigs=parsed_args.verify_perf_configs, + tflops=parsed_args.tflops, + output=ensure_tsv_extension(parsed_args.output), + abort_on_error=parsed_args.abort_on_error, + retune=parsed_args.retune, + gpu_ids=parsed_args.gpus, + num_cpus=parsed_args.num_cpus, + wait_for_compiles=parsed_args.wait_for_compiles) + + if op_type == Operation.FUSION: + op_type = extract_fusion_configs(parsed_args.test_dir, paths, options) + conf_class = get_config_class(op_type) configs = load_configs(op_type, parsed_args, paths) - except ValueError as e: - print(str(e), file=sys.stderr) - return 1 - ctx = TuningContext(configs=configs, - conf_class=conf_class, - paths=paths, - options=options, - gpu_topology=gpu_topology, - numa_topology=NumaTopology.discover()) + ctx = TuningContext(configs=configs, + conf_class=conf_class, + paths=paths, + options=options, + gpu_topology=gpu_topology, + numa_topology=NumaTopology.discover()) - try: tuning_succeeded = tune_configs(ctx) return 0 if tuning_succeeded else 1 + except KeyboardInterrupt: print("Tuning interrupted by user", file=sys.stderr) - return 1 + return 130 # 128 + SIGINT + finally: + if stdin_temp_file: + os.unlink(stdin_temp_file) if __name__ == '__main__': From 7eb9851b496a8f010d1202b332b8e0c922f641a4 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 10 Jan 2026 11:52:00 +0000 Subject: [PATCH 03/23] Revert to old output format. 
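
For reference, a tuned-results section written in this restored format
looks roughly like the sketch below (placeholder values, not real tuning
output). The '## commit:' metadata comment and the trailing
elapsedSeconds column from the previous patches are kept; only the
'# '-prefixed header and the column layout revert to the old style, with
the tuning space embedded in the perfConfig column name:

    ## commit: <git commit hash>
    # arch\tnumCUs\tnumChiplets\ttestVector\tperfConfig (quick)\tTFlops\telapsedSeconds
    gfx90a\t104\t1\t<testVector>\t<perfConfig>\t123.4\t57.9

(The TFlops column is only written when TFlops reporting is enabled.)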
--- mlir/utils/performance/perfRunner.py | 17 ++- mlir/utils/performance/tuningRunner.py | 142 ++++++++++--------------- 2 files changed, 63 insertions(+), 96 deletions(-) diff --git a/mlir/utils/performance/perfRunner.py b/mlir/utils/performance/perfRunner.py index 71fbd5051c56..f8ed4b7d78f4 100644 --- a/mlir/utils/performance/perfRunner.py +++ b/mlir/utils/performance/perfRunner.py @@ -293,25 +293,22 @@ def read_tuning_db(path: Optional[str]) -> MaybeTuningDb: with open(path, 'r') as db_file: for line in db_file: line = line.strip() - if line.startswith('#'): + if not line or line.startswith('#'): continue entries = line.split('\t') # note: legacy format has 3 entries if len(entries) == 3: arch, config, perfconfig = entries - ret[arch, config] = perfconfig - # note: new format has 4 entries - elif len(entries) == 4: - arch, _, config, perfconfig = entries - ret[arch, config] = perfconfig - # note: 5-entry form includes tflops at end - elif len(entries) == 5: - arch, _, config, perfconfig, _ = entries - ret[arch, config] = perfconfig + # note: new format has 4+ entries + elif len(entries) >= 4: + arch, _, config, perfconfig = entries[:4] else: print("Warning: Malformed tuning database entry:", line) continue + + ret[arch, config] = perfconfig + return ret except FileNotFoundError: if path: diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 9a95b6ca739d..c7c7b57612b5 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -247,11 +247,8 @@ def count(self) -> int: def from_output_file(cls, options: Options) -> 'TunedConfigsCache': """Load previously tuned configurations from an output TSV file. - Supports both old and new file formats: - - Old format: header starts with '# '; tuning space embedded in column name (e.g., perfConfig (quick)) - - New format: proper tsv header (no #); metadata in ## comments before header - - Only data lines that match the current tuning space, arch, and numCUs are loaded. + Format: # arch\tnumCUs\ttestVector\tperfConfig (tuning_space)\t[TFlops]\t[elapsedSeconds] + Only loads entries matching current arch, numCUs, and tuning space. 
""" cache = cls() @@ -260,13 +257,8 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': current_commit = get_git_commit_hash() - # Pending metadata - file_commit: Optional[str] = None - file_tuning_space: Optional[str] = None - file_arch: Optional[str] = None - file_num_cu: Optional[int] = None - # Active section state + metadata: Dict[str, Optional[Any]] = {} matching_section = False column_indices: Dict[str, int] = {} @@ -279,52 +271,31 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': # Check for metadata line if line.startswith('## '): - key_value = line[3:] - if ':' in key_value: - key, value = key_value.split(':', 1) - key = key.strip() - value = value.strip() - if key == 'commit': - file_commit = value - elif key == 'tuningSpace': - file_tuning_space = value - elif key == 'arch': - file_arch = value - elif key == 'numCUs': - try: - file_num_cu = int(value) - except ValueError: - pass + parts = line[3:].split(':', 1) + if len(parts) == 2: + key = parts[0].strip() + value = parts[1].strip() + metadata[key] = value continue # Check for header line if cls._is_header_line(line): - # Determine if this section matches based on metadata or old format - if file_tuning_space is not None: - # New format: use metadata - matching_section = (file_tuning_space == options.tuning_space_kind and - (file_arch is None or file_arch == options.arch) and - (file_num_cu is None or - file_num_cu == options.num_cu)) - elif f'({options.tuning_space_kind})' in line: - # Old format: tuning space embedded in header - matching_section = True - else: - matching_section = False + # Determine if this section matches based on tuning space + matching_section = f'({options.tuning_space_kind})' in line if matching_section: column_indices = cls._parse_header_line(line) - if file_commit and file_commit != current_commit and not options.quiet: + + # Warn if commit hashes differ + file_commit = metadata.get('commit', 'unknown') + if file_commit != current_commit: print( f"Warning: Loading tuned configs from different commit " f"(file: {file_commit[:8]}, current: {current_commit[:8]})", file=sys.stderr) - # Reset pending metadata for next section - file_commit = None - file_tuning_space = None - file_arch = None - file_num_cu = None + # Reset metadata for next section + metadata = {} continue # Skip other comment lines @@ -350,21 +321,20 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': @staticmethod def _is_header_line(line: str) -> bool: - """Check if line is a column header (old or new format).""" - # Old format: '# arch\t...' - if line.startswith('# '): - return line[2:].startswith('arch\t') - # New format: 'testVector\t...' 
- return line.startswith('testVector\t') + """Check if line is a column header.""" + return line.startswith('# arch\t') @staticmethod def _parse_header_line(line: str) -> Dict[str, int]: """Parse column header and return name -> index mapping.""" + # Strip leading '# ' if present header_text = line[2:] if line.startswith('# ') else line indices = {} for i, col in enumerate(header_text.split('\t')): if col: - indices[col.split()[0]] = i + # Exctract base column name (handles 'perfConfig (tuning_space)') + col_name = col.split()[0] + indices[col_name] = i return indices @staticmethod @@ -385,14 +355,10 @@ def get_field(name: str) -> Optional[str]: return fields[idx] return None - # Old format: arch and numCUs are columns - if 'arch' in column_indices: - if get_field('arch') != arch: - return None - - if 'numCUs' in column_indices: - if get_field('numCUs') != str(num_cu): - return None + if get_field('arch') != arch: + return None + if get_field('numCUs') != str(num_cu): + return None test_vector = get_field('testVector') if not test_vector: @@ -652,33 +618,37 @@ def _write_header(self): if self.header_written: return - # Add a blank line if appending if self._is_appending: - print("", file=self.file) + print("", file=self.file) # Blank line before new section # Metadata comments print(f"## commit: {get_git_commit_hash()}", file=self.file) - print(f"## tuningSpace: {self.options.tuning_space_kind}", file=self.file) - print(f'## arch: {self.options.arch}', file=self.file) - print(f'## numCUs: {self.options.num_cu}', file=self.file) - print(f'## numChiplets: {self.options.num_chiplets}', file=self.file) - # TSV header - columns = ['testVector', 'perfConfig'] + # TSV header with '# ' prefix + columns = [ + 'arch', 'numCUs', 'numChiplets', 'testVector', + f'perfConfig ({self.options.tuning_space_kind})' + ] if self.options.tflops: columns.append('TFlops') columns.append('elapsedSeconds') - print("\t".join(columns), file=self.file) + print("# " + "\t".join(columns), file=self.file) self.file.flush() self.header_written = True def write_result(self, result: TuningResult): + assert result.success and result.winning_config and result.max_tflops, "write_result called with failed result" + self._write_header() - fields = [result.test_vector, result.winning_config or ""] + fields = [ + self.options.arch, + str(self.options.num_cu), + str(self.options.num_chiplets), result.test_vector, result.winning_config + ] if self.options.tflops: - fields.append(f"{result.max_tflops}" if result.max_tflops else "") + fields.append(str(result.max_tflops)) fields.append(f"{result.elapsed_seconds:.1f}") print("\t".join(fields), file=self.file) @@ -1499,21 +1469,21 @@ def main(args=None): num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) options = Options(arch=arch, - num_cu=num_cu, - num_chiplets=num_chiplets, - debug=parsed_args.debug, - quiet=parsed_args.quiet, - tuning_space_kind=parsed_args.tuning_space, - rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, - verify_mode=parsed_args.verify_mode, - verify_perfconfigs=parsed_args.verify_perf_configs, - tflops=parsed_args.tflops, - output=ensure_tsv_extension(parsed_args.output), - abort_on_error=parsed_args.abort_on_error, - retune=parsed_args.retune, - gpu_ids=parsed_args.gpus, - num_cpus=parsed_args.num_cpus, - wait_for_compiles=parsed_args.wait_for_compiles) + num_cu=num_cu, + num_chiplets=num_chiplets, + debug=parsed_args.debug, + quiet=parsed_args.quiet, + tuning_space_kind=parsed_args.tuning_space, + rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, + 
verify_mode=parsed_args.verify_mode, + verify_perfconfigs=parsed_args.verify_perf_configs, + tflops=parsed_args.tflops, + output=ensure_tsv_extension(parsed_args.output), + abort_on_error=parsed_args.abort_on_error, + retune=parsed_args.retune, + gpu_ids=parsed_args.gpus, + num_cpus=parsed_args.num_cpus, + wait_for_compiles=parsed_args.wait_for_compiles) if op_type == Operation.FUSION: op_type = extract_fusion_configs(parsed_args.test_dir, paths, options) From 44ead00f147f3458c4d729d671ba2de9b6f8da38 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 10 Jan 2026 22:48:27 +0000 Subject: [PATCH 04/23] Add state file for crash and interrupt recovery. --- mlir/utils/performance/tuningRunner.py | 301 ++++++++++++++++++++++++- 1 file changed, 295 insertions(+), 6 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index c7c7b57612b5..d86024c6689f 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -22,6 +22,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import nullcontext from dataclasses import dataclass, field +from enum import Enum from typing import Any, Dict, List, Optional from collections import deque @@ -49,7 +50,7 @@ MLIR_N_REPEATS = 10 WARMUP_ITERATIONS = 1 SLEEP_US = 100 # 0.1 ms -MAX_FAILURES = 10 +MAX_FAILURES = 20 # ============================================================================= # Configuration & Results @@ -217,6 +218,253 @@ def _parse_cpu_list(cpu_list_str: str) -> List[int]: return cpus +# ============================================================================= +# State Management +# ============================================================================= + + +class ConfigState(Enum): + """Possible states for a tuning configuration in the state file. + + State transitions: + PENDING (implicit) -> RUNNING: Config starts tuning + RUNNING -> SUCCESS (implicit): Tuning completes successfully (removed from state, written to output) + RUNNING -> FAILED: Tuning completes with error + RUNNING -> INTERRUPTED: User interrupted (Ctrl+C) during tuning + RUNNING -> CRASHED: Detected on next startup (stale RUNNING state) + FAILED/CRASHED -> PENDING: User requests retry with --retry-failed + + Note: PENDING and SUCCESS are implicit states: + - PENDING: not in state file AND not in output file + - SUCCESS: in output file (not tracked in state file) + """ + RUNNING = "running" # Currently being tuned + FAILED = "failed" # Tuning completed with error + INTERRUPTED = "interrupted" # User interrupted during tuning (Ctrl+C) + CRASHED = "crashed" # Process crashed while tuning (detected on startup) + + +@dataclass +class TuningStateContext: + """Context that identifies a tuning run. 
State is invalidated if context changes.""" + arch: str + num_cu: int + tuning_space: str + + def matches(self, other: 'TuningStateContext') -> bool: + return (self.arch == other.arch and self.num_cu == other.num_cu and + self.tuning_space == other.tuning_space) + + +@dataclass +class TuningState: + """Persistent state for tuning runs, survives crashes and interrupts.""" + context: TuningStateContext + configs: Dict[str, ConfigState] = field(default_factory=dict) + + def set_running(self, test_vector: str) -> None: + """Mark a config as currently running.""" + self.configs[test_vector] = ConfigState.RUNNING + + def set_failed(self, test_vector: str) -> None: + """Mark a config as failed.""" + self.configs[test_vector] = ConfigState.FAILED + + def set_interrupted(self, test_vector: str) -> None: + """Mark a config as interrupted by user.""" + self.configs[test_vector] = ConfigState.INTERRUPTED + + def set_crashed(self, test_vector: str) -> None: + """Mark a config as crashed.""" + self.configs[test_vector] = ConfigState.CRASHED + + def remove(self, test_vector: str) -> None: + """Remove a config from state (e.g., on success).""" + self.configs.pop(test_vector, None) + + def should_skip(self, test_vector: str) -> bool: + """Check if a config should be skipped (failed or crashed).""" + return self.configs.get(test_vector) in (ConfigState.FAILED, ConfigState.CRASHED) + + def _count_by_state(self, *states: ConfigState) -> int: + """Count configs in any of the given states.""" + return sum(1 for s in self.configs.values() if s in states) + + def failed_count(self) -> int: + """Count of failed configs.""" + return self._count_by_state(ConfigState.FAILED) + + def crashed_count(self) -> int: + """Count of crashed configs.""" + return self._count_by_state(ConfigState.CRASHED) + + def skip_count(self) -> int: + """Count of configs that should be skipped (failed + crashed).""" + return self._count_by_state(ConfigState.FAILED, ConfigState.CRASHED) + + def promote_running_to_crashed(self) -> int: + """Move all RUNNING configs to CRASHED (crash recovery). Returns count.""" + count = 0 + for tv in self.configs: + if self.configs[tv] == ConfigState.RUNNING: + self.configs[tv] = ConfigState.CRASHED + count += 1 + return count + + def promote_running_to_interrupted(self) -> int: + """Move all RUNNING configs to INTERRUPTED (clean shutdown). Returns count.""" + count = 0 + for tv in self.configs: + if self.configs[tv] == ConfigState.RUNNING: + self.configs[tv] = ConfigState.INTERRUPTED + count += 1 + return count + + +class TuningStateFile: + """Manages reading and writing of tuning state to a JSON file. + + If filepath is None, all operations are no-ops (null object pattern). + """ + + def __init__(self, filepath: Optional[str]): + self.filepath = filepath + self._lock = threading.Lock() + self._state: Optional[TuningState] = None + + def load(self, expected_context: TuningStateContext, quiet: bool = False) -> 'TuningStateFile': + """Load state from file. Returns self for chaining. 
+ + On load: + - INTERRUPTED configs are demoted to PENDING (removed from state) + - RUNNING configs are promoted to CRASHED (indicates previous crash) + """ + if not self.filepath: + self._state = TuningState(context=expected_context) + return self + + if not os.path.exists(self.filepath): + self._state = TuningState(context=expected_context) + return self + + try: + with open(self.filepath, 'r') as f: + data = json.load(f) + + file_context = TuningStateContext(arch=data.get('arch', ''), + num_cu=data.get('numCUs', 0), + tuning_space=data.get('tuningSpace', '')) + + if not file_context.matches(expected_context): + if not quiet: + print("State file context mismatch, starting fresh", file=sys.stderr) + self._state = TuningState(context=expected_context) + return self + + configs = {} + for tv, state_str in data.get('configs', {}).items(): + try: + config_state = ConfigState(state_str) + # Demote INTERRUPTED to PENDING (don't add to configs) + if config_state == ConfigState.INTERRUPTED: + continue + # Promote RUNNING to CRASHED (stale running = crash) + if config_state == ConfigState.RUNNING: + config_state = ConfigState.CRASHED + configs[tv] = config_state + except ValueError: + pass # Skip invalid states + + self._state = TuningState(context=expected_context, configs=configs) + return self + + except (json.JSONDecodeError, KeyError, TypeError) as e: + if not quiet: + print(f"Warning: Failed to load state file: {e}", file=sys.stderr) + self._state = TuningState(context=expected_context) + return self + + @property + def state(self) -> TuningState: + """Get the current state. Must call load() first.""" + if self._state is None: + raise RuntimeError("State not loaded. Call load() first.") + return self._state + + def _save_locked(self) -> None: + """Save state to file atomically. Assumes lock is held.""" + if not self.filepath or not self._state: + return + + data = { + 'arch': self._state.context.arch, + 'numCUs': self._state.context.num_cu, + 'tuningSpace': self._state.context.tuning_space, + 'configs': { + tv: s.value for tv, s in self._state.configs.items() + } + } + + # Write to temp file then rename for atomicity + temp_path = self.filepath + '.tmp' + with open(temp_path, 'w') as f: + json.dump(data, f, indent=2) + os.replace(temp_path, self.filepath) + + def save(self) -> None: + """Save state to file atomically. No-op if filepath is None.""" + with self._lock: + self._save_locked() + + def delete(self) -> None: + """Delete the state file. No-op if filepath is None.""" + if not self.filepath: + return + + with self._lock: + if os.path.exists(self.filepath): + os.remove(self.filepath) + self._state = None + + def set_running(self, test_vector: str) -> None: + """Mark a config as running and save.""" + if self._state: + with self._lock: + self._state.set_running(test_vector) + self._save_locked() + + def set_failed(self, test_vector: str) -> None: + """Mark a config as failed and save.""" + if self._state: + with self._lock: + self._state.set_failed(test_vector) + self._save_locked() + + def set_success(self, test_vector: str) -> None: + """Remove a config from state (success) and save.""" + if self._state: + with self._lock: + self._state.remove(test_vector) + self._save_locked() + + def finalize_interrupted(self, quiet: bool = False) -> None: + """Mark any RUNNING configs as INTERRUPTED and save. 
Called on clean shutdown.""" + if self._state: + with self._lock: + interrupted_count = self._state.promote_running_to_interrupted() + if interrupted_count > 0 and not quiet: + print(f"Marked {interrupted_count} running config(s) as interrupted", + file=sys.stderr) + self._save_locked() + + +def get_state_filepath(output_filepath: str) -> Optional[str]: + """Get the state file path for a given output file.""" + if output_filepath == '-': + return None + return f"{output_filepath}.state" + + # ============================================================================= # Tuning Infrastructure # ============================================================================= @@ -669,6 +917,7 @@ def __init__(self, filepath: str): self.header_written = False def __enter__(self): + self.header_written = os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0 self.file = open(self.filepath, 'a') return self @@ -1070,12 +1319,44 @@ def tune_configs(ctx: TuningContext) -> bool: if cache.count() > 0 and not ctx.options.quiet: print(f"Found {cache.count()} tuned config(s) in {ctx.options.output}", file=sys.stderr) + # Load state file + state_context = TuningStateContext(arch=ctx.options.arch, + num_cu=ctx.options.num_cu, + tuning_space=ctx.options.tuning_space_kind) + state_file = TuningStateFile(get_state_filepath(ctx.options.output)) + + if ctx.options.retune: + state_file.delete() + + state_file.load(state_context, ctx.options.quiet) + state = state_file.state + + if not ctx.options.retune: + crashed_count = state.crashed_count() + if crashed_count > 0 and not ctx.options.quiet: + print(f"Detected {crashed_count} crashed config(s) from previous run", file=sys.stderr) + + if state.skip_count() > 0 and not ctx.options.quiet: + print(f"Found {state.skip_count()} failed/crashed config(s) in state file", + file=sys.stderr) + + state_file.save() + # Filter out already-tuned configs pending_configs = [c for c in ctx.configs if not cache.contains(c)] - skipped_count = len(ctx.configs) - len(pending_configs) - if skipped_count > 0 and not ctx.options.quiet: - print(f"Skipping {skipped_count} of {len(ctx.configs)} already tuned config(s)", - file=sys.stderr) + skipped_success = len(ctx.configs) - len(pending_configs) + + # Filter out failed/crashed configs from state file + before_filter = len(pending_configs) + pending_configs = [c for c in pending_configs if not state.should_skip(c)] + skipped_failed = before_filter - len(pending_configs) + + total_skipped = skipped_success + skipped_failed + + if skipped_success > 0 and not ctx.options.quiet: + print(f"Skipping {skipped_success} already tuned config(s)", file=sys.stderr) + if skipped_failed > 0 and not ctx.options.quiet: + print(f"Skipping {skipped_failed} failed/crashed config(s)", file=sys.stderr) if not pending_configs: print("All configurations already tuned", file=sys.stderr) @@ -1087,6 +1368,9 @@ def tune_configs(ctx: TuningContext) -> bool: def execute_tuning_task(test_vector: str) -> TuningResult: gpu_id = pool.acquire_gpu_for_thread() + + state_file.set_running(test_vector) + start_time = time.time() compile_threads = ctx.get_compile_threads(gpu_id) result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, @@ -1118,7 +1402,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: progress_bar = tqdm( total=len(ctx.configs), - initial=skipped_count, + initial=total_skipped, disable=ctx.options.quiet, file=sys.stderr, desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", @@ -1145,9 
+1429,12 @@ def execute_tuning_task(test_vector: str) -> TuningResult: results_writer.write_result(result) if debug_writer: debug_writer.write_entries(result.entries) + state_file.set_success(result.test_vector) else: has_errors = True consecutive_failures += 1 + state_file.set_failed(result.test_vector) + error_text = result.error or "Unknown error" formatted_error = f"[GPU {result.gpu_id}] Error tuning {result.test_vector}\n" + '\n'.join( f"\t{line}" for line in error_text.splitlines()) @@ -1178,6 +1465,8 @@ def execute_tuning_task(test_vector: str) -> TuningResult: if progress_bar: progress_bar.close() + state_file.finalize_interrupted(ctx.options.quiet) + # ============================================================================= # Configuration Loading From 8a9345641987a30e99971efb2bb8cc359f037967 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 10 Jan 2026 23:05:00 +0000 Subject: [PATCH 05/23] Add --retry-failed option. --- mlir/utils/performance/tuningRunner.py | 47 +++++++++++++++----------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index d86024c6689f..fb3402ec8d8b 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -73,6 +73,7 @@ class Options: output: str abort_on_error: bool retune: bool + retry_failed: bool gpu_ids: List[int] num_cpus: Optional[int] wait_for_compiles: bool @@ -652,6 +653,7 @@ class ETATracker: num_workers: int initial_times: List[float] = field(default_factory=list) initial_ok_count: int = 0 + initial_fail_count: int = 0 _success_times: List[float] = field(default_factory=list, init=False) _processed: int = field(default=0, init=False) _ok_count: int = field(default=0, init=False) @@ -660,6 +662,7 @@ class ETATracker: def __post_init__(self): self._success_times = list(self.initial_times) self._ok_count = self.initial_ok_count + self._fail_count = self.initial_fail_count def record(self, result: TuningResult) -> None: self._processed += 1 @@ -1324,32 +1327,31 @@ def tune_configs(ctx: TuningContext) -> bool: num_cu=ctx.options.num_cu, tuning_space=ctx.options.tuning_space_kind) state_file = TuningStateFile(get_state_filepath(ctx.options.output)) - - if ctx.options.retune: - state_file.delete() - state_file.load(state_context, ctx.options.quiet) state = state_file.state - if not ctx.options.retune: - crashed_count = state.crashed_count() - if crashed_count > 0 and not ctx.options.quiet: - print(f"Detected {crashed_count} crashed config(s) from previous run", file=sys.stderr) + crashed_count = state.crashed_count() + if crashed_count > 0 and not ctx.options.quiet: + print(f"Detected {crashed_count} crashed config(s) from previous run", file=sys.stderr) - if state.skip_count() > 0 and not ctx.options.quiet: - print(f"Found {state.skip_count()} failed/crashed config(s) in state file", - file=sys.stderr) + if state.skip_count() > 0 and not ctx.options.quiet: + print(f"Found {state.skip_count()} failed/crashed config(s) in state file", file=sys.stderr) state_file.save() - # Filter out already-tuned configs - pending_configs = [c for c in ctx.configs if not cache.contains(c)] - skipped_success = len(ctx.configs) - len(pending_configs) + # Filter out already-tuned configs (unless --retune) + pending_configs = ctx.configs + skipped_success = 0 + if not ctx.options.retune: + pending_configs = [c for c in pending_configs if not cache.contains(c)] + skipped_success = len(ctx.configs) - len(pending_configs) - # 
Filter out failed/crashed configs from state file - before_filter = len(pending_configs) - pending_configs = [c for c in pending_configs if not state.should_skip(c)] - skipped_failed = before_filter - len(pending_configs) + # Filter out failed/crashed configs (unless --retry-failed or --retune) + skipped_failed = 0 + if not ctx.options.retry_failed and not ctx.options.retune: + before_filter = len(pending_configs) + pending_configs = [c for c in pending_configs if not state.should_skip(c)] + skipped_failed = before_filter - len(pending_configs) total_skipped = skipped_success + skipped_failed @@ -1398,7 +1400,8 @@ def execute_tuning_task(test_vector: str) -> TuningResult: eta_tracker = ETATracker(total_configs=len(pending_configs), num_workers=num_workers, initial_times=initial_times, - initial_ok_count=cache.count()) + initial_ok_count=skipped_success, + initial_fail_count=skipped_failed) progress_bar = tqdm( total=len(ctx.configs), @@ -1703,6 +1706,11 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N default=False, help="Force retuning of all configs, ignoring existing results in the output file") + parser.add_argument("--retry-failed", + action='store_true', + default=False, + help="Retry previously failed/crashed configs instead of skipping them") + parser.add_argument("--gpus", type=int, nargs='+', @@ -1770,6 +1778,7 @@ def main(args=None): output=ensure_tsv_extension(parsed_args.output), abort_on_error=parsed_args.abort_on_error, retune=parsed_args.retune, + retry_failed=parsed_args.retry_failed, gpu_ids=parsed_args.gpus, num_cpus=parsed_args.num_cpus, wait_for_compiles=parsed_args.wait_for_compiles) From aeedb2a5bf52a9e39f5ebaa6165c8353ac82eadc Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sun, 11 Jan 2026 23:35:47 +0000 Subject: [PATCH 06/23] Use proper python logger. 
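
Plain stderr prints and the tqdm progress bar used to fight over the
terminal; this patch routes diagnostics through a logging.Logger whose
handler calls tqdm.write(), so messages appear above an active bar
instead of corrupting it. A condensed, self-contained sketch of the idea
follows (the real TqdmLoggingHandler and setup_logger() live in the diff
below; the demo loop is illustrative only):

    import logging
    import sys
    import time

    from tqdm import tqdm

    class TqdmLoggingHandler(logging.Handler):
        """Emit log records via tqdm.write() so an active progress bar
        is redrawn after the message rather than being clobbered."""

        def emit(self, record):
            try:
                tqdm.write(self.format(record), file=sys.stderr)
            except Exception:
                self.handleError(record)

    logger = logging.getLogger("demo")
    handler = TqdmLoggingHandler()
    handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    for i in tqdm(range(5), file=sys.stderr, desc="Tuning"):
        if i == 2:
            logger.warning("example warning emitted mid-run")
        time.sleep(0.1)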
--- mlir/utils/performance/tuningRunner.py | 249 +++++++++++++------------ 1 file changed, 129 insertions(+), 120 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index fb3402ec8d8b..7a587b759f13 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -12,6 +12,8 @@ import argparse import glob +import json +import logging import os import statistics import subprocess @@ -26,7 +28,6 @@ from typing import Any, Dict, List, Optional from collections import deque -import json import numpy as np import pandas as pd from tqdm import tqdm @@ -52,6 +53,46 @@ SLEEP_US = 100 # 0.1 ms MAX_FAILURES = 20 +# ============================================================================= +# Logging Setup +# ============================================================================= + + +class TqdmLoggingHandler(logging.Handler): + """Logging handler that uses tqdm.write() to avoid corrupting progress bars.""" + + def emit(self, record): + try: + msg = self.format(record) + tqdm.write(msg, file=sys.stderr) + except Exception: + self.handleError(record) + + +def setup_logger(verbose: bool = False) -> logging.Logger: + """Configure and return a logger for tuningRunner.""" + log = logging.getLogger("tuningRunner") + log.setLevel(logging.DEBUG if verbose else logging.INFO) + + # Clear any existing handlers + log.handlers.clear() + + # Use tqdm-aware handler + handler = TqdmLoggingHandler() + handler.setLevel(logging.DEBUG if verbose else logging.INFO) + + # Simple format: level and message + formatter = logging.Formatter('%(levelname)s: %(message)s') + handler.setFormatter(formatter) + + log.addHandler(handler) + + return log + + +# Module-level logger +logger: logging.Logger = setup_logger() + # ============================================================================= # Configuration & Results # ============================================================================= @@ -62,7 +103,7 @@ class Options: """Configuration options for the tuning process.""" debug: bool tuning_space_kind: str - quiet: bool + verbose: bool arch: str num_cu: int num_chiplets: int @@ -157,19 +198,19 @@ def discover() -> 'GpuTopology': gpus[gpu_id] = Gpu(gpu_id=gpu_id, sku=sku, numa_node=numa_node) if gpus: return GpuTopology(gpus=gpus) - print("Warning: rocm-smi returned no GPU cards", file=sys.stderr) + logger.warning("rocm-smi returned no GPU cards") except subprocess.CalledProcessError as e: - print(f"Warning: rocm-smi failed with return code {e.returncode}", file=sys.stderr) + logger.warning(f"rocm-smi failed with return code {e.returncode}") except subprocess.TimeoutExpired: - print("Warning: rocm-smi timed out", file=sys.stderr) + logger.warning("rocm-smi timed out") except FileNotFoundError: - print("Warning: rocm-smi not found in PATH", file=sys.stderr) + logger.warning("rocm-smi not found in PATH") except json.JSONDecodeError as e: - print(f"Warning: Failed to parse rocm-smi JSON output: {e}", file=sys.stderr) + logger.warning(f"Failed to parse rocm-smi JSON output: {e}") except (ValueError, KeyError) as e: - print(f"Warning: Failed to extract GPU info from rocm-smi output: {e}", file=sys.stderr) + logger.warning(f"Failed to extract GPU info from rocm-smi output: {e}") - print("Warning: Could not detect GPUs, defaulting to GPU 0", file=sys.stderr) + logger.warning("Could not detect GPUs, defaulting to GPU 0") return GpuTopology(gpus={0: Gpu(gpu_id=0, sku="unknown", numa_node=0)}) @@ -199,6 +240,8 @@ def discover() -> 
'NumaTopology': if os.path.exists(cpulist_path): with open(cpulist_path, 'r') as f: numa_to_cpus[node_id] = NumaTopology._parse_cpu_list(f.read()) + else: + logger.warning(f"Missing cpulist for NUMA node {node_id}") # Fallback: single node with all CPUs if not numa_to_cpus: @@ -303,15 +346,6 @@ def skip_count(self) -> int: """Count of configs that should be skipped (failed + crashed).""" return self._count_by_state(ConfigState.FAILED, ConfigState.CRASHED) - def promote_running_to_crashed(self) -> int: - """Move all RUNNING configs to CRASHED (crash recovery). Returns count.""" - count = 0 - for tv in self.configs: - if self.configs[tv] == ConfigState.RUNNING: - self.configs[tv] = ConfigState.CRASHED - count += 1 - return count - def promote_running_to_interrupted(self) -> int: """Move all RUNNING configs to INTERRUPTED (clean shutdown). Returns count.""" count = 0 @@ -333,7 +367,7 @@ def __init__(self, filepath: Optional[str]): self._lock = threading.Lock() self._state: Optional[TuningState] = None - def load(self, expected_context: TuningStateContext, quiet: bool = False) -> 'TuningStateFile': + def load(self, expected_context: TuningStateContext) -> 'TuningStateFile': """Load state from file. Returns self for chaining. On load: @@ -357,8 +391,7 @@ def load(self, expected_context: TuningStateContext, quiet: bool = False) -> 'Tu tuning_space=data.get('tuningSpace', '')) if not file_context.matches(expected_context): - if not quiet: - print("State file context mismatch, starting fresh", file=sys.stderr) + logger.info("State file context mismatch, starting fresh") self._state = TuningState(context=expected_context) return self @@ -380,8 +413,7 @@ def load(self, expected_context: TuningStateContext, quiet: bool = False) -> 'Tu return self except (json.JSONDecodeError, KeyError, TypeError) as e: - if not quiet: - print(f"Warning: Failed to load state file: {e}", file=sys.stderr) + logger.warning(f"Failed to load state file: {e}") self._state = TuningState(context=expected_context) return self @@ -448,14 +480,13 @@ def set_success(self, test_vector: str) -> None: self._state.remove(test_vector) self._save_locked() - def finalize_interrupted(self, quiet: bool = False) -> None: + def finalize_interrupted(self) -> None: """Mark any RUNNING configs as INTERRUPTED and save. 
Called on clean shutdown.""" if self._state: with self._lock: interrupted_count = self._state.promote_running_to_interrupted() - if interrupted_count > 0 and not quiet: - print(f"Marked {interrupted_count} running config(s) as interrupted", - file=sys.stderr) + if interrupted_count > 0: + logger.info(f"Marked {interrupted_count} running config(s) as interrupted") self._save_locked() @@ -538,10 +569,9 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': # Warn if commit hashes differ file_commit = metadata.get('commit', 'unknown') if file_commit != current_commit: - print( - f"Warning: Loading tuned configs from different commit " - f"(file: {file_commit[:8]}, current: {current_commit[:8]})", - file=sys.stderr) + logger.warning( + f"Loading tuned configs from different commit " + f"(file: {file_commit[:8]}, current: {current_commit[:8]})") # Reset metadata for next section metadata = {} @@ -562,9 +592,7 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': cache._results[result.test_vector] = result except Exception as e: - if not options.quiet: - print(f"Warning: Failed to load existing tuning results from {options.output}: {e}", - file=sys.stderr) + logger.warning(f"Failed to load existing tuning results from {options.output}: {e}") return cache @@ -747,11 +775,10 @@ def _compute_thread_allocation(self) -> Dict[int, int]: scale_factor = self.options.num_cpus / total_allocated for gpu_id in allocation: allocation[gpu_id] = max(1, int(allocation[gpu_id] * scale_factor)) - elif not self.options.quiet: - print( - f"Note: --num-cpus={self.options.num_cpus} exceeds optimal {total_allocated}, " - f"using optimal allocation", - file=sys.stderr) + else: + logger.info( + f"--num-cpus={self.options.num_cpus} exceeds optimal {total_allocated}, " + f"using optimal allocation") return allocation @@ -760,15 +787,13 @@ def get_compile_threads(self, gpu_id: int) -> int: return self._threads_per_gpu.get(gpu_id, 1) def print_gpu_summary(self): - """Print summary of GPU allocation to stderr.""" - if self.options.quiet: - return + """Print summary of GPU allocation.""" num_active = len(self.options.gpu_ids) - print(f"Using {num_active} GPU(s):", file=sys.stderr) + logger.info(f"Using {num_active} GPU(s):") for gpu_id in self.options.gpu_ids[:num_active]: node = self.gpu_topology.get_numa_node(gpu_id) threads = self._threads_per_gpu.get(gpu_id, 1) - print(f" GPU {gpu_id}: NUMA node {node}, {threads} compile threads", file=sys.stderr) + logger.info(f" GPU {gpu_id}: NUMA node {node}, {threads} compile threads") class GpuWorkerPool: @@ -811,8 +836,7 @@ def _apply_numa_affinity(self, gpu_id: int) -> None: try: os.sched_setaffinity(0, set(cpu_list)) except OSError: - if not self._ctx.options.quiet: - print(f"Warning: Could not set CPU affinity for GPU {gpu_id}", file=sys.stderr) + logger.warning(f"Could not set CPU affinity for GPU {gpu_id}") self._set_memory_policy(node) @@ -849,7 +873,7 @@ def __init__(self, filepath: str, options: Options): self.filepath = filepath self.options = options self.file = None - self.header_written = False + self._header_written = False self._is_appending = False def __enter__(self): @@ -866,7 +890,7 @@ def __exit__(self, exc_type, exc_value, traceback): self.file.close() def _write_header(self): - if self.header_written: + if self._header_written: return if self._is_appending: @@ -886,7 +910,7 @@ def _write_header(self): print("# " + "\t".join(columns), file=self.file) self.file.flush() - self.header_written = True + self._header_written = True def 
write_result(self, result: TuningResult): assert result.success and result.winning_config and result.max_tflops, "write_result called with failed result" @@ -905,11 +929,6 @@ def write_result(self, result: TuningResult): self.file.flush() - def write_error(self, content: str): - self._write_header() - print('\n'.join(f"### {line}" for line in content.splitlines()), file=self.file) - self.file.flush() - class DebugFileWriter: """Context manager for writing debug entries to TSV file.""" @@ -917,10 +936,10 @@ class DebugFileWriter: def __init__(self, filepath: str): self.filepath = filepath self.file = None - self.header_written = False + self._header_written = False def __enter__(self): - self.header_written = os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0 + self._header_written = os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0 self.file = open(self.filepath, 'a') return self @@ -935,11 +954,11 @@ def write_entries(self, entries: List[Dict]): pd.DataFrame(entries).to_csv(self.file, sep='\t', mode='a', - header=not self.header_written, + header=not self._header_written, index=False) self.file.flush() - self.header_written = True + self._header_written = True # ============================================================================= @@ -978,15 +997,6 @@ def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, values) -def ensure_tsv_extension(filepath: str) -> str: - """Ensure filepath has .tsv extension, unless it's stdout.""" - if filepath == '-': - return filepath - if not filepath.endswith('.tsv'): - return filepath + '.tsv' - return filepath - - def get_git_commit_hash() -> str: """Get the current git commit hash.""" try: @@ -1032,9 +1042,9 @@ def kill_process(proc) -> None: proc.kill() proc.wait(timeout=10) except subprocess.TimeoutExpired: - print(f"Warning: Process {proc.pid} did not terminate in time after kill", file=sys.stderr) + logger.warning(f"Process {proc.pid} did not terminate in time after kill") except Exception as e: - print(f"Warning: Failed to kill process {proc.pid}: {e}", file=sys.stderr) + logger.warning(f"Failed to kill process {proc.pid}: {e}") # ============================================================================= @@ -1073,9 +1083,7 @@ def verify_perfconfig(perfconfig, config, paths: Paths, options: Options, gpu_id ]) debug_info = f"[GPU {gpu_id}] Verification pipeline:\n" + verification_pipeline - - if not options.quiet and options.debug: - print(debug_info, file=sys.stderr) + logger.debug(debug_info) with tempfile.TemporaryDirectory() as tmpdir: p1 = None @@ -1260,9 +1268,7 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: tuning_pipeline = ' '.join(tuning_driver_command) debug_info = f"[GPU {gpu_id}] Tuning '{test_vector}':\n" + tuning_pipeline - - if not options.quiet and options.debug: - print(debug_info, file=sys.stderr) + logger.debug(debug_info) # Note: communicate waits for process to terminate which might cause CI timeouts if tuning takes too long tuning_stdout, tuning_stderr = tuning_driver.communicate() @@ -1319,23 +1325,24 @@ def tune_configs(ctx: TuningContext) -> bool: cache = TunedConfigsCache() if not ctx.options.retune: cache = TunedConfigsCache.from_output_file(ctx.options) - if cache.count() > 0 and not ctx.options.quiet: - print(f"Found {cache.count()} tuned config(s) in {ctx.options.output}", file=sys.stderr) + if cache.count() > 0: + logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") # 
Load state file state_context = TuningStateContext(arch=ctx.options.arch, num_cu=ctx.options.num_cu, tuning_space=ctx.options.tuning_space_kind) state_file = TuningStateFile(get_state_filepath(ctx.options.output)) - state_file.load(state_context, ctx.options.quiet) + state_file.load(state_context) state = state_file.state crashed_count = state.crashed_count() - if crashed_count > 0 and not ctx.options.quiet: - print(f"Detected {crashed_count} crashed config(s) from previous run", file=sys.stderr) + if crashed_count > 0: + logger.warning(f"Detected {crashed_count} crashed config(s) from previous run") - if state.skip_count() > 0 and not ctx.options.quiet: - print(f"Found {state.skip_count()} failed/crashed config(s) in state file", file=sys.stderr) + failed_count = state.failed_count() + if failed_count > 0: + logger.info(f"Found {failed_count} failed config(s) in state file") state_file.save() @@ -1355,13 +1362,13 @@ def tune_configs(ctx: TuningContext) -> bool: total_skipped = skipped_success + skipped_failed - if skipped_success > 0 and not ctx.options.quiet: - print(f"Skipping {skipped_success} already tuned config(s)", file=sys.stderr) - if skipped_failed > 0 and not ctx.options.quiet: - print(f"Skipping {skipped_failed} failed/crashed config(s)", file=sys.stderr) + if skipped_success > 0: + logger.info(f"Skipping {skipped_success} already tuned config(s)") + if skipped_failed > 0: + logger.info(f"Skipping {skipped_failed} failed/crashed config(s)") if not pending_configs: - print("All configurations already tuned", file=sys.stderr) + logger.info("No configurations to tune") return True pool = GpuWorkerPool(ctx) @@ -1390,6 +1397,8 @@ def execute_tuning_task(test_vector: str) -> TuningResult: executor = None progress_bar = None + has_errors = False + with OutputFileWriter(ctx.options.output, ctx.options) as results_writer: with DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext( ) as debug_writer: @@ -1406,7 +1415,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: progress_bar = tqdm( total=len(ctx.configs), initial=total_skipped, - disable=ctx.options.quiet, + disable=not sys.stderr.isatty(), file=sys.stderr, desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", unit="config", @@ -1421,7 +1430,6 @@ def execute_tuning_task(test_vector: str) -> TuningResult: for test_vector in pending_configs } - has_errors = False consecutive_failures = 0 for completed_future in as_completed(pending_futures): @@ -1441,34 +1449,36 @@ def execute_tuning_task(test_vector: str) -> TuningResult: error_text = result.error or "Unknown error" formatted_error = f"[GPU {result.gpu_id}] Error tuning {result.test_vector}\n" + '\n'.join( f"\t{line}" for line in error_text.splitlines()) - print(formatted_error, file=sys.stderr) - results_writer.write_error(formatted_error) + logger.error(formatted_error) if ctx.options.abort_on_error: return False if consecutive_failures >= MAX_FAILURES: - print("Aborting due to too many consecutive failures", file=sys.stderr) + logger.error("Aborting due to too many consecutive failures") return False eta_tracker.record(result) progress_bar.update(1) progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) - if has_errors: - print("Encountered errors during tuning", file=sys.stderr) - else: - print("Tuning completed successfully", file=sys.stderr) - - return not has_errors - + except KeyboardInterrupt: + logger.info("Tuning interrupted by user") + raise finally: if executor: executor.shutdown(wait=False, 
cancel_futures=True) if progress_bar: progress_bar.close() - state_file.finalize_interrupted(ctx.options.quiet) + state_file.finalize_interrupted() + + if has_errors: + logger.warning("Encountered errors during tuning") + else: + logger.info("Tuning completed successfully") + + return not has_errors # ============================================================================= @@ -1487,13 +1497,12 @@ def resolve_paths(op_type: Operation, parsed_args) -> Paths: return perfRunner.create_paths(configs_path, parsed_args.mlir_build_dir) -def extract_fusion_configs(test_dir, paths: Paths, options: Options) -> Operation: +def extract_fusion_configs(test_dir, paths: Paths) -> Operation: """Extract tuning configurations from fusion E2E test files.""" all_configs = [] op_type = Operation.FUSION for filename in glob.glob(test_dir + '/*mlir'): - if not options.quiet: - print("Extract from:", filename, file=sys.stderr) + logger.info(f"Extract from: {filename}") test_entry = perfRunner.get_fusion_test_info(filename, paths) if not test_entry: continue @@ -1501,23 +1510,20 @@ def extract_fusion_configs(test_dir, paths: Paths, options: Options) -> Operatio if not test_vector: continue if test_vector in all_configs: - if not options.quiet: - print("An entry already exists in the tuning DB", file=sys.stderr) + logger.info("An entry already exists in the tuning DB") continue command_line = test_vector.split(sep=' ') if command_line[0].startswith('conv'): if op_type == Operation.FUSION: op_type = Operation.CONV elif op_type != Operation.CONV: - if not options.quiet: - print("Invalid config op: ", test_vector, file=sys.stderr) + logger.warning(f"Invalid config op: {test_vector}") continue else: if op_type == Operation.FUSION: op_type = Operation.GEMM elif op_type != Operation.GEMM: - if not options.quiet: - print("Invalid config op: ", test_vector, file=sys.stderr) + logger.warning(f"Invalid config op: {test_vector}") continue all_configs.append(test_vector) @@ -1645,12 +1651,11 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N choices=["quick", "full", "greedy", "exhaustive"], help="Tuning space kind to use") - parser.add_argument( - "-q", - "--quiet", - action='store_true', - default=False, - help="Suppress progress bars and informational messages, showing only errors") + parser.add_argument("-v", + "--verbose", + action='store_true', + default=False, + help="Enable verbose output, including commands being executed") parser.add_argument("--verify-mode", default="gpu", @@ -1737,6 +1742,8 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N def main(args=None): + global logger + gpu_topology = GpuTopology.discover() available_gpus = sorted(gpu_topology.gpus.keys()) @@ -1746,6 +1753,9 @@ def main(args=None): parsed_args = parse_arguments(gpu_topology, available_gpus, args) + if parsed_args.verbose: + logger = setup_logger(verbose=parsed_args.verbose) + stdin_temp_file = None try: # Handle stdin for configs file @@ -1757,7 +1767,7 @@ def main(args=None): paths = resolve_paths(op_type, parsed_args) if not paths.mlir_paths: - print("rocMLIR build dir was not provided/found", file=sys.stderr) + logger.error("rocMLIR build dir was not provided/found") return 1 arch = perfRunner.get_arch() @@ -1769,13 +1779,13 @@ def main(args=None): num_cu=num_cu, num_chiplets=num_chiplets, debug=parsed_args.debug, - quiet=parsed_args.quiet, + verbose=parsed_args.verbose, tuning_space_kind=parsed_args.tuning_space, rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, 
verify_mode=parsed_args.verify_mode, verify_perfconfigs=parsed_args.verify_perf_configs, tflops=parsed_args.tflops, - output=ensure_tsv_extension(parsed_args.output), + output=parsed_args.output, abort_on_error=parsed_args.abort_on_error, retune=parsed_args.retune, retry_failed=parsed_args.retry_failed, @@ -1784,7 +1794,7 @@ def main(args=None): wait_for_compiles=parsed_args.wait_for_compiles) if op_type == Operation.FUSION: - op_type = extract_fusion_configs(parsed_args.test_dir, paths, options) + op_type = extract_fusion_configs(parsed_args.test_dir, paths) conf_class = get_config_class(op_type) configs = load_configs(op_type, parsed_args, paths) @@ -1800,7 +1810,6 @@ def main(args=None): return 0 if tuning_succeeded else 1 except KeyboardInterrupt: - print("Tuning interrupted by user", file=sys.stderr) return 130 # 128 + SIGINT finally: if stdin_temp_file: From d54ab9e348a4e956cf0bd9ec481524cbd80e8f20 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 12 Jan 2026 02:28:16 +0000 Subject: [PATCH 07/23] Improve readability of output. --- mlir/utils/performance/tuningRunner.py | 313 ++++++++++++++++--------- 1 file changed, 198 insertions(+), 115 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 7a587b759f13..644243bedf50 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -4,10 +4,23 @@ This script tunes MLIR kernels by running them with different performance configurations and selecting the best one based on execution time. Usage examples: - python3 tuningRunner.py --op gemm --configs-file=../mlir/utils/performance/configs/tier1-gemm-configs --output=tuning_db.tsv - python3 tuningRunner.py --op gemm --config="-g 3 -m 1024 -k 769 -n 512 -t f32 -transA 0 -transB 0" - python3 tuningRunner.py --op conv --tuning-space=quick --config="conv -F 1 -f NCHW -I NCHW -O NCHW -n 256 -c 1024 -H 14 -W 14 -k 2048 -y 1 -x 1 -p 0 -q 0 -u 2 -v 2 -l 1 -j 1 -m conv -g 1 -t 1" - python3 tuningRunner.py --op fusion --test-dir=../mlir/test/fusion/resnet50-e2e --output=tuning_db.tsv + # Tune GEMM configs from a file + python3 tuningRunner.py --op gemm -c configs/tier1-gemm-configs -o tuning_db.tsv + + # Tune a single GEMM config + python3 tuningRunner.py --op gemm --config "-g 3 -m 1024 -k 769 -n 512 -t f32 -transA 0 -transB 0" + + # Quick-tune CONV configs from a file + python3 tuningRunner.py --op conv -c configs/tier1-conv-configs --tuning-space quick + + # Use a subset of available GPUs + python3 tuningRunner.py --op gemm -c configs/tier1-gemm-configs --gpus 2 3 + + # Tune fusion ops from E2E test directory + python3 tuningRunner.py --op fusion --test-dir ../mlir/test/fusion/resnet50-e2e + + # Pipe configs from stdin + cat configs/tier1-gemm-configs | python3 tuningRunner.py --op gemm -c - -o tuning_db.tsv """ import argparse @@ -57,14 +70,44 @@ # Logging Setup # ============================================================================= +# ANSI color codes +_LOG_COLORS = { + logging.DEBUG: '\033[36m', # Cyan + logging.INFO: '\033[34m', # Blue + logging.WARNING: '\033[33m', # Yellow + logging.ERROR: '\033[91m', # Red + logging.CRITICAL: '\033[91m', # Red +} +_COLOR_RESET = '\033[0m' + class TqdmLoggingHandler(logging.Handler): """Logging handler that uses tqdm.write() to avoid corrupting progress bars.""" + def __init__(self, use_color: bool = False): + super().__init__() + self.use_color = use_color + def emit(self, record): try: - msg = self.format(record) - tqdm.write(msg, file=sys.stderr) 
+ msg = record.getMessage() + levelname = record.levelname + + if self.use_color: + color = _LOG_COLORS.get(record.levelno, '') + prefix = f"{color}{levelname}{_COLOR_RESET}: " + else: + prefix = f"{levelname}: " + + indent = ' ' * 4 + lines = msg.splitlines() + if len(lines) == 1: + formatted = prefix + lines[0] + else: + formatted = prefix + lines[0] + '\n' + '\n'.join( + indent + line for line in lines[1:]) + + tqdm.write(formatted, file=sys.stderr) except Exception: self.handleError(record) @@ -74,17 +117,12 @@ def setup_logger(verbose: bool = False) -> logging.Logger: log = logging.getLogger("tuningRunner") log.setLevel(logging.DEBUG if verbose else logging.INFO) - # Clear any existing handlers log.handlers.clear() - # Use tqdm-aware handler - handler = TqdmLoggingHandler() + use_color = sys.stderr.isatty() + handler = TqdmLoggingHandler(use_color=use_color) handler.setLevel(logging.DEBUG if verbose else logging.INFO) - # Simple format: level and message - formatter = logging.Formatter('%(levelname)s: %(message)s') - handler.setFormatter(formatter) - log.addHandler(handler) return log @@ -179,8 +217,7 @@ def validate_homogeneity(self, gpu_ids: List[int]) -> bool: def discover() -> 'GpuTopology': """Query GPU topology using rocm-smi. - rocm-smi reports physical device IDs regardless of environment variables - (e.g., ROCR_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES). + rocm-smi reports physical device IDs regardless of environment variables (e.g., ROCR_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES). """ try: output = subprocess.check_output( @@ -570,8 +607,8 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': file_commit = metadata.get('commit', 'unknown') if file_commit != current_commit: logger.warning( - f"Loading tuned configs from different commit " - f"(file: {file_commit[:8]}, current: {current_commit[:8]})") + f"Loading tuned configs from different commit (file: {file_commit[:8]}, current: {current_commit[:8]})" + ) # Reset metadata for next section metadata = {} @@ -777,8 +814,8 @@ def _compute_thread_allocation(self) -> Dict[int, int]: allocation[gpu_id] = max(1, int(allocation[gpu_id] * scale_factor)) else: logger.info( - f"--num-cpus={self.options.num_cpus} exceeds optimal {total_allocated}, " - f"using optimal allocation") + f"--num-cpus={self.options.num_cpus} exceeds optimal {total_allocated}, using optimal allocation" + ) return allocation @@ -789,11 +826,12 @@ def get_compile_threads(self, gpu_id: int) -> int: def print_gpu_summary(self): """Print summary of GPU allocation.""" num_active = len(self.options.gpu_ids) - logger.info(f"Using {num_active} GPU(s):") + lines = [f"Using {num_active} GPU(s)"] for gpu_id in self.options.gpu_ids[:num_active]: node = self.gpu_topology.get_numa_node(gpu_id) threads = self._threads_per_gpu.get(gpu_id, 1) - logger.info(f" GPU {gpu_id}: NUMA node {node}, {threads} compile threads") + lines.append(f"GPU {gpu_id}: NUMA node {node}, {threads} compile threads") + logger.info("\n".join(lines)) class GpuWorkerPool: @@ -814,7 +852,6 @@ def acquire_gpu_for_thread(self) -> int: """Assign a GPU to the calling thread if not already assigned. Also pins the thread to CPUs on the GPU's NUMA node for better memory locality. - Returns the assigned GPU ID. 
""" if hasattr(self._worker_state, 'assigned_gpu'): return self._worker_state.assigned_gpu @@ -1047,6 +1084,47 @@ def kill_process(proc) -> None: logger.warning(f"Failed to kill process {proc.pid}: {e}") +def format_error(context: str, + command: str = None, + stdout: str = None, + stderr: str = None, + exit_code: int = None, + gpu_id: int = None, + max_lines: int = 10) -> str: + """Format an error message with optional details.""" + + def truncate(text: str) -> str: + if not text or not text.strip(): + return None + lines = text.strip().splitlines() + if len(lines) <= max_lines: + return text.strip() + half = max_lines // 2 + return '\n'.join(lines[:half] + [f'... ({len(lines) - max_lines} lines omitted) ...'] + + lines[-half:]) + + parts = [context] + + if exit_code is not None: + parts.append(f"Exit code: {exit_code}") + + if command: + if gpu_id is not None: + parts.append(f"Reproduce: ROCR_VISIBLE_DEVICES={gpu_id} {command}") + else: + parts.append(f"Reproduce: {command}") + + truncated_stdout = truncate(stdout) + if truncated_stdout: + parts.append("stdout:\n" + truncated_stdout) + + truncated_stderr = truncate(stderr) + if truncated_stderr: + parts.append("stderr:\n" + truncated_stderr) + + return '\n'.join(parts) + + # ============================================================================= # Core Tuning Logic # ============================================================================= @@ -1081,9 +1159,7 @@ def verify_perfconfig(perfconfig, config, paths: Paths, options: Options, gpu_id verification_pipeline = " | ".join([ ' '.join(rocmlir_gen_command), ' '.join(rocmlir_driver_command), ' '.join(rocprof_command) ]) - - debug_info = f"[GPU {gpu_id}] Verification pipeline:\n" + verification_pipeline - logger.debug(debug_info) + logger.debug(f"[GPU {gpu_id}] Verifying perfconfig '{perfconfig}'\n{verification_pipeline}") with tempfile.TemporaryDirectory() as tmpdir: p1 = None @@ -1115,22 +1191,23 @@ def verify_perfconfig(perfconfig, config, paths: Paths, options: Options, gpu_id outs, errs = p3.communicate(timeout=600) outs = outs.decode('utf-8') if p3.returncode != 0 or not CORRECT_RESULT_RE.search(outs): - raise TuningError(f"""Verification failed -{debug_info} -stdout: -{outs} -stderr: -{errs.decode('utf-8')}""") + raise TuningError( + format_error(f"Verification failed for perfconfig '{perfconfig}'", + command=verification_pipeline, + stdout=outs, + stderr=errs.decode('utf-8'), + exit_code=p3.returncode, + gpu_id=gpu_id)) except subprocess.TimeoutExpired: kill_process(p3) outs, errs = p3.communicate() - raise TuningError(f"""Verification timed out -{debug_info} -stdout: -{outs.decode('utf-8')} -stderr: -{errs.decode('utf-8')}""") + raise TuningError( + format_error(f"Verification timed out for perfconfig '{perfconfig}'", + command=verification_pipeline, + stdout=outs.decode('utf-8'), + stderr=errs.decode('utf-8'), + gpu_id=gpu_id)) stats_file = os.path.join( tmpdir, @@ -1183,13 +1260,9 @@ def find_best_perfconfig(tuning_output, config, paths: Paths, options: Options, these_tflops = entry['TFlops'] if options.verify_perfconfigs and not np.isnan(nano_seconds): - try: - verify_ns = verify_perfconfig(perfconfig, config, paths, options, gpu_id) - except TuningError as e: - raise TuningError( - f"Error during verification of perf config {perfconfig}\n{str(e)}") + verify_ns = verify_perfconfig(perfconfig, config, paths, options, gpu_id) if np.isnan(verify_ns): - raise TuningError(f"Verification failed for perf config {perfconfig}") + raise TuningError(f"Verification returned NaN 
for perfconfig '{perfconfig}'") if not np.isnan(these_tflops) and these_tflops > max_tflops: max_tflops = these_tflops @@ -1247,11 +1320,17 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - output, _ = tuning_key.communicate() + output, err = tuning_key.communicate() if tuning_key.returncode != 0: return { - 'success': False, - 'error': f"rocmlir-gen failed with return code {tuning_key.returncode}" + 'success': + False, + 'error': + format_error("Failed to generate tuning key", + command=' '.join(rocmlir_gen_command), + stderr=err.decode('utf-8'), + exit_code=tuning_key.returncode, + gpu_id=gpu_id) } result = output.decode('utf-8').strip().split('\t') command_line = result[2].split(sep=' ') @@ -1267,18 +1346,22 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: env=env) tuning_pipeline = ' '.join(tuning_driver_command) - debug_info = f"[GPU {gpu_id}] Tuning '{test_vector}':\n" + tuning_pipeline - logger.debug(debug_info) + logger.debug(f"[GPU {gpu_id}] Tuning '{test_vector}'\n{tuning_pipeline}") # Note: communicate waits for process to terminate which might cause CI timeouts if tuning takes too long tuning_stdout, tuning_stderr = tuning_driver.communicate() if tuning_driver.returncode != 0: - error_msg = f"rocmlir-tuning-driver failed with return code {tuning_driver.returncode}" - stderr_content = tuning_stderr.decode('utf-8').strip() - if stderr_content: - error_msg += f"\nstderr:\n{stderr_content}" - return {'success': False, 'error': error_msg} + return { + 'success': + False, + 'error': + format_error("Tuning failed", + command=tuning_pipeline, + stderr=tuning_stderr.decode('utf-8'), + exit_code=tuning_driver.returncode, + gpu_id=gpu_id) + } tuning_output = tuning_stdout.decode('utf-8').splitlines() winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, @@ -1297,15 +1380,12 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: try: verify_ns = verify_perfconfig(winning_config, config, paths, options, gpu_id) except TuningError as e: - return { - 'success': False, - 'error': f"Error during verification of winning config {winning_config}\n{str(e)}" - } + return {'success': False, 'error': str(e)} if np.isnan(verify_ns): return { 'success': False, - 'error': f"Verification failed for winning config {winning_config}" + 'error': f"Verification returned NaN for winning perfconfig '{winning_config}'" } verify_tflops = config.compute_tflops(verify_ns) @@ -1375,6 +1455,14 @@ def tune_configs(ctx: TuningContext) -> bool: num_workers = min(pool.worker_count, len(ctx.configs)) ctx.print_gpu_summary() + # Prepare ETA tracker with historical data + initial_times = [r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0] + eta_tracker = ETATracker(total_configs=len(pending_configs), + num_workers=num_workers, + initial_times=initial_times, + initial_ok_count=skipped_success, + initial_fail_count=skipped_failed) + def execute_tuning_task(test_vector: str) -> TuningResult: gpu_id = pool.acquire_gpu_for_thread() @@ -1394,24 +1482,14 @@ def execute_tuning_task(test_vector: str) -> TuningResult: verify_tflops=result.get('verify_tflops'), error=result.get('error')) - executor = None - progress_bar = None - - has_errors = False - with OutputFileWriter(ctx.options.output, ctx.options) as results_writer: with DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext( ) as 
debug_writer: - try: # No context manager for executor because we need to shutdown with wait=False - initial_times = [ - r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0 - ] - eta_tracker = ETATracker(total_configs=len(pending_configs), - num_workers=num_workers, - initial_times=initial_times, - initial_ok_count=skipped_success, - initial_fail_count=skipped_failed) + executor = None + progress_bar = None + + try: # No context manager for executor because we need to shutdown with wait=False progress_bar = tqdm( total=len(ctx.configs), initial=total_skipped, @@ -1430,6 +1508,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: for test_vector in pending_configs } + has_errors = False consecutive_failures = 0 for completed_future in as_completed(pending_futures): @@ -1446,10 +1525,10 @@ def execute_tuning_task(test_vector: str) -> TuningResult: consecutive_failures += 1 state_file.set_failed(result.test_vector) - error_text = result.error or "Unknown error" - formatted_error = f"[GPU {result.gpu_id}] Error tuning {result.test_vector}\n" + '\n'.join( - f"\t{line}" for line in error_text.splitlines()) - logger.error(formatted_error) + error_msg = f"[GPU {result.gpu_id}] Tuning failed for '{result.test_vector}'" + if result.error: + error_msg += "\n" + result.error + logger.error(error_msg) if ctx.options.abort_on_error: return False @@ -1732,11 +1811,13 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N metavar='N', help="Maximum CPU threads for compilation (default: auto-detect based on NUMA topology)") - parser.add_argument("--wait-for-compiles", - action='store_true', - default=False, - help="Wait for all compilation tasks to complete before starting tuning. " - "Useful for systems with shared CPU/GPU memory (e.g., APUs).") + parser.add_argument( + "--wait-for-compiles", + action='store_true', + default=False, + help= + "Wait for all compilation tasks to complete before starting tuning. Useful for systems with shared CPU/GPU memory (e.g., APUs)." 
+ ) return parser.parse_args(args) @@ -1770,50 +1851,52 @@ def main(args=None): logger.error("rocMLIR build dir was not provided/found") return 1 - arch = perfRunner.get_arch() - chip = perfRunner.get_chip() - num_cu = perfRunner.get_num_cu(chip) - num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) - - options = Options(arch=arch, - num_cu=num_cu, - num_chiplets=num_chiplets, - debug=parsed_args.debug, - verbose=parsed_args.verbose, - tuning_space_kind=parsed_args.tuning_space, - rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, - verify_mode=parsed_args.verify_mode, - verify_perfconfigs=parsed_args.verify_perf_configs, - tflops=parsed_args.tflops, - output=parsed_args.output, - abort_on_error=parsed_args.abort_on_error, - retune=parsed_args.retune, - retry_failed=parsed_args.retry_failed, - gpu_ids=parsed_args.gpus, - num_cpus=parsed_args.num_cpus, - wait_for_compiles=parsed_args.wait_for_compiles) - if op_type == Operation.FUSION: op_type = extract_fusion_configs(parsed_args.test_dir, paths) conf_class = get_config_class(op_type) configs = load_configs(op_type, parsed_args, paths) - ctx = TuningContext(configs=configs, - conf_class=conf_class, - paths=paths, - options=options, - gpu_topology=gpu_topology, - numa_topology=NumaTopology.discover()) + finally: + if stdin_temp_file: + os.unlink(stdin_temp_file) + + arch = perfRunner.get_arch() + chip = perfRunner.get_chip() + num_cu = perfRunner.get_num_cu(chip) + num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) + + options = Options(arch=arch, + num_cu=num_cu, + num_chiplets=num_chiplets, + debug=parsed_args.debug, + verbose=parsed_args.verbose, + tuning_space_kind=parsed_args.tuning_space, + rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, + verify_mode=parsed_args.verify_mode, + verify_perfconfigs=parsed_args.verify_perf_configs, + tflops=parsed_args.tflops, + output=parsed_args.output, + abort_on_error=parsed_args.abort_on_error, + retune=parsed_args.retune, + retry_failed=parsed_args.retry_failed, + gpu_ids=parsed_args.gpus, + num_cpus=parsed_args.num_cpus, + wait_for_compiles=parsed_args.wait_for_compiles) + + ctx = TuningContext(configs=configs, + conf_class=conf_class, + paths=paths, + options=options, + gpu_topology=gpu_topology, + numa_topology=NumaTopology.discover()) + try: tuning_succeeded = tune_configs(ctx) - return 0 if tuning_succeeded else 1 - except KeyboardInterrupt: return 130 # 128 + SIGINT - finally: - if stdin_temp_file: - os.unlink(stdin_temp_file) + + return 0 if tuning_succeeded else 1 if __name__ == '__main__': From a2c338e4e3c8a4c167ca12f128ab711d8f7cde0d Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 12 Jan 2026 02:56:03 +0000 Subject: [PATCH 08/23] Log warnings from tuning driver. 
--- mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp | 1 - mlir/utils/jenkins/Jenkinsfile | 4 ++-- mlir/utils/jenkins/Jenkinsfile.downstream | 4 ++-- mlir/utils/performance/tuningRunner.py | 7 ++++++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp index 2a735302a3ea..78513dc5b16f 100644 --- a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp +++ b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp @@ -985,7 +985,6 @@ createTunableParamSpace(ModuleOp mod, TuningParamSetKind kind, // greedy is not implemented for non-accel if (!archInfo.isAccel(op) && kind == TuningParamSetKind::Greedy) { kind = TuningParamSetKind::Exhaustive; - // TODO: tuningRunner hides this warning llvm::errs() << "Greedy tuning not implemented for non-accel, using " "Exhaustive instead\n"; } diff --git a/mlir/utils/jenkins/Jenkinsfile b/mlir/utils/jenkins/Jenkinsfile index ffaf6e6baf1d..413bc0be1593 100644 --- a/mlir/utils/jenkins/Jenkinsfile +++ b/mlir/utils/jenkins/Jenkinsfile @@ -1179,10 +1179,10 @@ PY stage("Tune Fusion") { dir('build') { // Tune resnet50 - sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error --op fusion --test-dir ../mlir/test/fusion/resnet50-e2e/ -o tuning_fusion_${CHIP}.tsv""" + sh """python3 ./bin/tuningRunner.py --abort-on-error --op fusion --test-dir ../mlir/test/fusion/resnet50-e2e/ -o tuning_fusion_${CHIP}.tsv""" // Tune bert - sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error --op fusion --test-dir ../mlir/test/xmir/bert-torch-tosa-e2e/ -o tuning_fusion_${CHIP}.tsv""" + sh """python3 ./bin/tuningRunner.py --abort-on-error --op fusion --test-dir ../mlir/test/xmir/bert-torch-tosa-e2e/ -o tuning_fusion_${CHIP}.tsv""" } sh 'rm -f build/CMakeCache.txt' } diff --git a/mlir/utils/jenkins/Jenkinsfile.downstream b/mlir/utils/jenkins/Jenkinsfile.downstream index dd400380e7e9..b2f3d1e6dea6 100644 --- a/mlir/utils/jenkins/Jenkinsfile.downstream +++ b/mlir/utils/jenkins/Jenkinsfile.downstream @@ -150,12 +150,12 @@ pipeline { dir('build') { timeout(time: 60, activity: true, unit: 'MINUTES') { // Tune gemms, fail if the DB is not created - sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error \ + sh """python3 ./bin/tuningRunner.py --abort-on-error \ --operation gemm \ --configs-file=../mlir/utils/jenkins/ci-configs/selected-gemm-configs \ --output=tuning_gemm.tsv [ -f tuning_gemm.tsv ]""" - sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error \ + sh """python3 ./bin/tuningRunner.py --abort-on-error \ --operation conv \ --configs-file=../mlir/utils/jenkins/ci-configs/selected-conv-configs \ --output=tuning_conv.tsv diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 644243bedf50..5dc14cca61ab 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -1356,12 +1356,17 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: 'success': False, 'error': - format_error("Tuning failed", + format_error("Tuning pipeline failed", command=tuning_pipeline, stderr=tuning_stderr.decode('utf-8'), exit_code=tuning_driver.returncode, gpu_id=gpu_id) } + else: + # Log any stderr output from tuning driver because it may contain warnings + tuning_stderr_str = tuning_stderr.decode('utf-8').strip() + if tuning_stderr_str: + logger.debug(f"[GPU {gpu_id}] rocmlir-tuning-driver stderr:\n{tuning_stderr_str}") tuning_output = tuning_stdout.decode('utf-8').splitlines() 
winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, From 58bada89e3b9ee596a27641fa0814082fb1925e5 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sat, 17 Jan 2026 00:39:15 +0000 Subject: [PATCH 09/23] Reintroduce --quiet flag. --- mlir/utils/performance/tuningRunner.py | 27 +++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 5dc14cca61ab..eb71dec76fc2 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -112,10 +112,16 @@ def emit(self, record): self.handleError(record) -def setup_logger(verbose: bool = False) -> logging.Logger: +def setup_logger(quiet: bool = False, verbose: bool = False) -> logging.Logger: """Configure and return a logger for tuningRunner.""" log = logging.getLogger("tuningRunner") - log.setLevel(logging.DEBUG if verbose else logging.INFO) + + if quiet: + log.setLevel(logging.ERROR) + elif verbose: + log.setLevel(logging.DEBUG) + else: + log.setLevel(logging.INFO) log.handlers.clear() @@ -141,6 +147,7 @@ class Options: """Configuration options for the tuning process.""" debug: bool tuning_space_kind: str + quiet: bool verbose: bool arch: str num_cu: int @@ -428,7 +435,7 @@ def load(self, expected_context: TuningStateContext) -> 'TuningStateFile': tuning_space=data.get('tuningSpace', '')) if not file_context.matches(expected_context): - logger.info("State file context mismatch, starting fresh") + logger.warning("State file context mismatch, starting fresh") self._state = TuningState(context=expected_context) return self @@ -1498,7 +1505,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: progress_bar = tqdm( total=len(ctx.configs), initial=total_skipped, - disable=not sys.stderr.isatty(), + disable=ctx.options.quiet or not sys.stderr.isatty(), file=sys.stderr, desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", unit="config", @@ -1558,7 +1565,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: state_file.finalize_interrupted() if has_errors: - logger.warning("Encountered errors during tuning") + logger.error("Encountered errors during tuning") else: logger.info("Tuning completed successfully") @@ -1735,6 +1742,12 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N choices=["quick", "full", "greedy", "exhaustive"], help="Tuning space kind to use") + parser.add_argument("-q", + "--quiet", + action='store_true', + default=False, + help="Suppress non-error output") + parser.add_argument("-v", "--verbose", action='store_true', @@ -1839,8 +1852,7 @@ def main(args=None): parsed_args = parse_arguments(gpu_topology, available_gpus, args) - if parsed_args.verbose: - logger = setup_logger(verbose=parsed_args.verbose) + logger = setup_logger(quiet=parsed_args.quiet, verbose=parsed_args.verbose) stdin_temp_file = None try: @@ -1875,6 +1887,7 @@ def main(args=None): num_cu=num_cu, num_chiplets=num_chiplets, debug=parsed_args.debug, + quiet=parsed_args.quiet, verbose=parsed_args.verbose, tuning_space_kind=parsed_args.tuning_space, rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, From 1cf2e36c0d13100a5dffe4d5aac47933f56a02ae Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sun, 18 Jan 2026 00:58:36 +0000 Subject: [PATCH 10/23] Let important exceptions propagate and clean up code. 
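Stop hiding unexpected failures behind broad try/except blocks and defensive
fallbacks: GPU and NUMA discovery now raise instead of silently defaulting,
the output-file cache no longer catches arbitrary exceptions, the topology and
result dataclasses become frozen, and tune_config returns a TuningResult
directly. A condensed illustration of the new failure mode for GPU discovery
(parse_cards is a hypothetical stand-in for the card-parsing loop in this patch):

    output = subprocess.check_output(
        ["rocm-smi", "--showproductname", "--showtoponuma", "--json"],
        text=True, timeout=10)
    gpus = parse_cards(json.loads(output))  # hypothetical helper, see the loop below
    if not gpus:
        raise RuntimeError("rocm-smi returned no GPU cards")
    # CalledProcessError, TimeoutExpired, and JSONDecodeError now propagate to the caller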
--- mlir/utils/performance/tuningRunner.py | 677 ++++++++++++------------- 1 file changed, 326 insertions(+), 351 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index eb71dec76fc2..eeec94be3d58 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -114,28 +114,21 @@ def emit(self, record): def setup_logger(quiet: bool = False, verbose: bool = False) -> logging.Logger: """Configure and return a logger for tuningRunner.""" - log = logging.getLogger("tuningRunner") + assert not (quiet and verbose), "quiet and verbose are mutually exclusive" if quiet: - log.setLevel(logging.ERROR) + logger.setLevel(logging.ERROR) elif verbose: - log.setLevel(logging.DEBUG) + logger.setLevel(logging.DEBUG) else: - log.setLevel(logging.INFO) + logger.setLevel(logging.INFO) - log.handlers.clear() - - use_color = sys.stderr.isatty() - handler = TqdmLoggingHandler(use_color=use_color) - handler.setLevel(logging.DEBUG if verbose else logging.INFO) - - log.addHandler(handler) - - return log + logger.handlers.clear() + logger.addHandler(TqdmLoggingHandler(use_color=sys.stderr.isatty())) # Module-level logger -logger: logging.Logger = setup_logger() +logger: logging.Logger = logging.getLogger("tuningRunner") # ============================================================================= # Configuration & Results @@ -170,8 +163,8 @@ class TuningResult: """Result of tuning a single configuration.""" test_vector: str success: bool - gpu_id: int - elapsed_seconds: float + gpu_id: int = -1 + elapsed_seconds: float = 0.0 winning_config: Optional[str] = None max_tflops: Optional[float] = None entries: List[Dict] = field(default_factory=list) @@ -194,7 +187,7 @@ class TuningError(Exception): # ============================================================================= -@dataclass +@dataclass(frozen=True) class Gpu: """Information about a GPU.""" gpu_id: int @@ -202,22 +195,21 @@ class Gpu: numa_node: int -@dataclass +@dataclass(frozen=True) class GpuTopology: """System GPU topology with NUMA mappings.""" gpus: Dict[int, Gpu] # GPU ID -> Gpu def get_numa_node(self, gpu_id: int) -> int: - """Get NUMA node for a GPU, defaults to 0 if unknown.""" - if gpu_id in self.gpus: - return self.gpus[gpu_id].numa_node - return 0 + """Get NUMA node for a GPU.""" + return self.gpus[gpu_id].numa_node def validate_homogeneity(self, gpu_ids: List[int]) -> bool: """Validate that all selected GPUs are of the same model.""" if len(gpu_ids) <= 1: return True - skus = {self.gpus[gpu_id].sku for gpu_id in gpu_ids if gpu_id in self.gpus} + + skus = {self.gpus[gpu_id].sku for gpu_id in gpu_ids} return len(skus) == 1 @staticmethod @@ -226,46 +218,36 @@ def discover() -> 'GpuTopology': rocm-smi reports physical device IDs regardless of environment variables (e.g., ROCR_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES). 
""" - try: - output = subprocess.check_output( - ["rocm-smi", "--showproductname", "--showtoponuma", "--json"], - text=True, - timeout=10) - data = json.loads(output) - gpus = {} - for key, value in data.items(): - if key.startswith("card"): - gpu_id = int(key.replace("card", "")) - sku = value.get("Card SKU", "unknown") - numa_node_str = value.get("(Topology) Numa Node") - numa_node = int(numa_node_str) if numa_node_str is not None else 0 - gpus[gpu_id] = Gpu(gpu_id=gpu_id, sku=sku, numa_node=numa_node) - if gpus: - return GpuTopology(gpus=gpus) - logger.warning("rocm-smi returned no GPU cards") - except subprocess.CalledProcessError as e: - logger.warning(f"rocm-smi failed with return code {e.returncode}") - except subprocess.TimeoutExpired: - logger.warning("rocm-smi timed out") - except FileNotFoundError: - logger.warning("rocm-smi not found in PATH") - except json.JSONDecodeError as e: - logger.warning(f"Failed to parse rocm-smi JSON output: {e}") - except (ValueError, KeyError) as e: - logger.warning(f"Failed to extract GPU info from rocm-smi output: {e}") - - logger.warning("Could not detect GPUs, defaulting to GPU 0") - return GpuTopology(gpus={0: Gpu(gpu_id=0, sku="unknown", numa_node=0)}) + output = subprocess.check_output( + ["rocm-smi", "--showproductname", "--showtoponuma", "--json"], text=True, timeout=10) + data = json.loads(output) + gpus = {} + for key, value in data.items(): + if key.startswith("card"): + gpu_id = int(key.replace("card", "")) -@dataclass + sku = value["Card SKU"] + + numa_node_str = value.get("(Topology) Numa Node") + numa_node = int(numa_node_str) if numa_node_str is not None else 0 + + gpus[gpu_id] = Gpu(gpu_id=gpu_id, sku=sku, numa_node=numa_node) + + if not gpus: + raise RuntimeError("rocm-smi returned no GPU cards") + + return GpuTopology(gpus=gpus) + + +@dataclass(frozen=True) class NumaTopology: """System NUMA topology with CPU mappings.""" numa_to_cpus: Dict[int, List[int]] # NUMA node -> list of CPU IDs def get_cpus_for_numa_node(self, numa_node: int) -> List[int]: """Get CPUs belonging to a NUMA node.""" - return self.numa_to_cpus.get(numa_node, []) + return self.numa_to_cpus[numa_node] @staticmethod def discover() -> 'NumaTopology': @@ -281,11 +263,8 @@ def discover() -> 'NumaTopology': if entry.startswith("node") and entry[4:].isdigit(): node_id = int(entry[4:]) cpulist_path = os.path.join(numa_base, entry, "cpulist") - if os.path.exists(cpulist_path): - with open(cpulist_path, 'r') as f: - numa_to_cpus[node_id] = NumaTopology._parse_cpu_list(f.read()) - else: - logger.warning(f"Missing cpulist for NUMA node {node_id}") + with open(cpulist_path, 'r') as f: + numa_to_cpus[node_id] = NumaTopology._parse_cpu_list(f.read()) # Fallback: single node with all CPUs if not numa_to_cpus: @@ -332,7 +311,7 @@ class ConfigState(Enum): CRASHED = "crashed" # Process crashed while tuning (detected on startup) -@dataclass +@dataclass(frozen=True) class TuningStateContext: """Context that identifies a tuning run. 
State is invalidated if context changes.""" arch: str @@ -546,7 +525,7 @@ def get_state_filepath(output_filepath: str) -> Optional[str]: # ============================================================================= -@dataclass +@dataclass(frozen=True) class TunedConfigsCache: """Cache for previously tuned configurations loaded from output file.""" _results: Dict[str, TuningResult] = field(default_factory=dict) @@ -571,13 +550,13 @@ def count(self) -> int: def from_output_file(cls, options: Options) -> 'TunedConfigsCache': """Load previously tuned configurations from an output TSV file. - Format: # arch\tnumCUs\ttestVector\tperfConfig (tuning_space)\t[TFlops]\t[elapsedSeconds] - Only loads entries matching current arch, numCUs, and tuning space. + Format: # arch\tnumCUs\tnumChiplets\ttestVector\tperfConfig (tuning_space)\t[TFlops]\telapsedSeconds + Only loads entries matching current arch and tuning space. """ - cache = cls() - if options.output == '-' or not os.path.exists(options.output): - return cache + return cls() + + results: Dict[str, TuningResult] = {} current_commit = get_git_commit_hash() @@ -586,59 +565,50 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': matching_section = False column_indices: Dict[str, int] = {} - try: - with open(options.output, mode='r') as f: - for line in f: - line = line.strip() - if not line: - continue - - # Check for metadata line - if line.startswith('## '): - parts = line[3:].split(':', 1) - if len(parts) == 2: - key = parts[0].strip() - value = parts[1].strip() - metadata[key] = value - continue - - # Check for header line - if cls._is_header_line(line): - # Determine if this section matches based on tuning space - matching_section = f'({options.tuning_space_kind})' in line - - if matching_section: - column_indices = cls._parse_header_line(line) - - # Warn if commit hashes differ - file_commit = metadata.get('commit', 'unknown') - if file_commit != current_commit: - logger.warning( - f"Loading tuned configs from different commit (file: {file_commit[:8]}, current: {current_commit[:8]})" - ) - - # Reset metadata for next section - metadata = {} - continue - - # Skip other comment lines - if line.startswith('#'): - continue - - # Skip data lines from non-matching sections - if not matching_section or not column_indices: - continue - - # Parse data line - result = cls._parse_data_line(line.split('\t'), column_indices, options.arch, - options.num_cu) - if result: - cache._results[result.test_vector] = result - - except Exception as e: - logger.warning(f"Failed to load existing tuning results from {options.output}: {e}") - - return cache + with open(options.output, mode='r') as f: + for line in f: + line = line.strip() + if not line: + continue + + # Check for metadata line + if line.startswith('## '): + parts = line[3:].split(':') + if len(parts) == 2: + metadata[parts[0].strip()] = parts[1].strip() + continue + + # Check for header line + if cls._is_header_line(line): + # Determine if this section matches based on tuning space + matching_section = f'({options.tuning_space_kind})' in line + if matching_section: + column_indices = cls._parse_header_line(line) + # Warn if commit hashes differ + file_commit = metadata.get('commit', 'unknown') + if file_commit != current_commit: + logger.warning( + f"Loading tuned configs from different commit (file: {file_commit[:8]}, current: {current_commit[:8]})" + ) + + # Reset metadata for next section + metadata = {} + continue + + # Skip other comment lines + if line.startswith('#'): + 
continue + + # Skip data lines from non-matching sections + if not matching_section or not column_indices: + continue + + # Parse data line + result = cls._parse_data_line(line.split('\t'), column_indices, options.arch) + if result: + results[result.test_vector] = result + + return cls(_results=results) @staticmethod def _is_header_line(line: str) -> bool: @@ -650,24 +620,26 @@ def _parse_header_line(line: str) -> Dict[str, int]: """Parse column header and return name -> index mapping.""" # Strip leading '# ' if present header_text = line[2:] if line.startswith('# ') else line + indices = {} for i, col in enumerate(header_text.split('\t')): - if col: - # Exctract base column name (handles 'perfConfig (tuning_space)') - col_name = col.split()[0] - indices[col_name] = i + if not col: + continue + # Exctract base column name (handles 'perfConfig (tuning_space)') + col_name = col.split()[0] + indices[col_name] = i + return indices @staticmethod - def _parse_data_line(fields: List[str], column_indices: Dict[str, int], arch: str, - num_cu: int) -> Optional[TuningResult]: + def _parse_data_line(fields: List[str], column_indices: Dict[str, int], + arch: str) -> Optional[TuningResult]: """Parse a data line and return TuningResult if valid. A line is valid if: - - arch and numCUs match current system (if columns exist, for old format) + - arch matches current system - testVector is present - perfConfig is present and not 'None' - - TFlops is a valid finite number (if column exists) """ def get_field(name: str) -> Optional[str]: @@ -678,8 +650,6 @@ def get_field(name: str) -> Optional[str]: if get_field('arch') != arch: return None - if get_field('numCUs') != str(num_cu): - return None test_vector = get_field('testVector') if not test_vector: @@ -690,17 +660,14 @@ def get_field(name: str) -> Optional[str]: return None max_tflops = None - if 'TFlops' in column_indices: - tflops_str = get_field('TFlops') - if not tflops_str: - return None + tflops_str = get_field('TFlops') + if tflops_str: try: tflops_val = float(tflops_str) - if np.isnan(tflops_val) or np.isinf(tflops_val): - return None - max_tflops = tflops_val + if np.isfinite(tflops_val): + max_tflops = tflops_val except ValueError: - return None + pass elapsed_seconds = 0.0 elapsed_str = get_field('elapsedSeconds') @@ -723,26 +690,18 @@ class ETATracker: """Track completion times for accurate ETA estimation using median of successful configs.""" total_configs: int num_workers: int - initial_times: List[float] = field(default_factory=list) - initial_ok_count: int = 0 - initial_fail_count: int = 0 - _success_times: List[float] = field(default_factory=list, init=False) + success_times: List[float] = field(default_factory=list) + ok_count: int = 0 + fail_count: int = 0 _processed: int = field(default=0, init=False) - _ok_count: int = field(default=0, init=False) - _fail_count: int = field(default=0, init=False) - - def __post_init__(self): - self._success_times = list(self.initial_times) - self._ok_count = self.initial_ok_count - self._fail_count = self.initial_fail_count def record(self, result: TuningResult) -> None: self._processed += 1 if result.success: - self._ok_count += 1 - self._success_times.append(result.elapsed_seconds) + self.ok_count += 1 + self.success_times.append(result.elapsed_seconds) else: - self._fail_count += 1 + self.fail_count += 1 def _format_rate(self, seconds: float) -> str: if seconds < 60: @@ -771,13 +730,13 @@ def get_postfix_str(self) -> str: rate = "n/a" eta = "n/a" - if len(self._success_times) >= 3: - median = 
statistics.median(self._success_times) + if len(self.success_times) >= 3: + median = statistics.median(self.success_times) eta_seconds = (remaining / self.num_workers) * median rate = self._format_rate(median) eta = self._format_eta(eta_seconds) - return f"ok={self._ok_count}, fail={self._fail_count}, rate={rate}, eta={eta}" + return f"ok={self.ok_count}, fail={self.fail_count}, rate={rate}, eta={eta}" @dataclass @@ -828,7 +787,7 @@ def _compute_thread_allocation(self) -> Dict[int, int]: def get_compile_threads(self, gpu_id: int) -> int: """Get the number of compile threads allocated to a GPU.""" - return self._threads_per_gpu.get(gpu_id, 1) + return self._threads_per_gpu[gpu_id] def print_gpu_summary(self): """Print summary of GPU allocation.""" @@ -836,7 +795,7 @@ def print_gpu_summary(self): lines = [f"Using {num_active} GPU(s)"] for gpu_id in self.options.gpu_ids[:num_active]: node = self.gpu_topology.get_numa_node(gpu_id) - threads = self._threads_per_gpu.get(gpu_id, 1) + threads = self._threads_per_gpu[gpu_id] lines.append(f"GPU {gpu_id}: NUMA node {node}, {threads} compile threads") logger.info("\n".join(lines)) @@ -876,11 +835,7 @@ def _apply_numa_affinity(self, gpu_id: int) -> None: node = self._ctx.gpu_topology.get_numa_node(gpu_id) cpu_list = self._ctx.numa_topology.get_cpus_for_numa_node(node) - if cpu_list: - try: - os.sched_setaffinity(0, set(cpu_list)) - except OSError: - logger.warning(f"Could not set CPU affinity for GPU {gpu_id}") + os.sched_setaffinity(0, set(cpu_list)) self._set_memory_policy(node) @@ -902,7 +857,7 @@ def _set_memory_policy(self, numa_node: int) -> None: ctypes.byref(ctypes.c_ulong(nodemask)), maxnode=64) except (OSError, AttributeError): - pass # libnuma not available, rely on first-touch policy + logger.debug("libnuma not available, skipping memory policy setup") # ============================================================================= @@ -957,7 +912,7 @@ def _write_header(self): self._header_written = True def write_result(self, result: TuningResult): - assert result.success and result.winning_config and result.max_tflops, "write_result called with failed result" + assert result.success and result.winning_config and result.max_tflops, "write_result called with invalid result" self._write_header() @@ -991,15 +946,13 @@ def __exit__(self, exc_type, exc_value, traceback): if self.file: self.file.close() - def write_entries(self, entries: List[Dict]): - if not entries: - return + def write_result(self, result: TuningResult): + assert result.success and result.entries, "write_result called with invalid result" - pd.DataFrame(entries).to_csv(self.file, - sep='\t', - mode='a', - header=not self._header_written, - index=False) + pd.DataFrame(result.entries).to_csv(self.file, + sep='\t', + header=not self._header_written, + index=False) self.file.flush() self._header_written = True @@ -1013,7 +966,7 @@ def write_entries(self, entries: List[Dict]): class TuningArgumentParser(argparse.ArgumentParser): """ArgumentParser with custom validation for tuning arguments.""" - def __init__(self, *args, gpu_topology: GpuTopology = None, **kwargs): + def __init__(self, *args, gpu_topology: Optional[GpuTopology] = None, **kwargs): super().__init__(*args, **kwargs) self._gpu_topology = gpu_topology @@ -1044,10 +997,10 @@ def __call__(self, parser, namespace, values, option_string=None): def get_git_commit_hash() -> str: """Get the current git commit hash.""" try: - commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD'], - 
stderr=subprocess.DEVNULL).decode().strip() - return commit_hash - except Exception: + return subprocess.check_output(['git', 'rev-parse', 'HEAD'], + stderr=subprocess.DEVNULL).decode().strip() + except (subprocess.CalledProcessError, FileNotFoundError, OSError) as e: + logger.debug(f"Failed to get git commit hash: {e}") return "unknown" @@ -1075,10 +1028,10 @@ def verify_mode_flags(verify_mode: str) -> str: return "-pv" if verify_mode == "gpu": return "-pv_with_gpu --verifier-keep-perf-config=false" - raise ValueError("Unknown verification mode", verify_mode) + raise ValueError(f"Unknown verification mode: {verify_mode}") -def kill_process(proc) -> None: +def kill_process(proc: Optional[subprocess.Popen]) -> None: """Terminate a subprocess and wait for cleanup.""" if proc is None: return @@ -1092,11 +1045,11 @@ def kill_process(proc) -> None: def format_error(context: str, - command: str = None, - stdout: str = None, - stderr: str = None, - exit_code: int = None, - gpu_id: int = None, + command: Optional[str] = None, + stdout: Optional[str] = None, + stderr: Optional[str] = None, + exit_code: Optional[int] = None, + gpu_id: Optional[int] = None, max_lines: int = 10) -> str: """Format an error message with optional details.""" @@ -1123,11 +1076,11 @@ def truncate(text: str) -> str: truncated_stdout = truncate(stdout) if truncated_stdout: - parts.append("stdout:\n" + truncated_stdout) + parts.append("STDOUT:\n" + truncated_stdout) truncated_stderr = truncate(stderr) if truncated_stderr: - parts.append("stderr:\n" + truncated_stderr) + parts.append("STDERR:\n" + truncated_stderr) return '\n'.join(parts) @@ -1137,10 +1090,11 @@ def truncate(text: str) -> str: # ============================================================================= -def verify_perfconfig(perfconfig, config, paths: Paths, options: Options, gpu_id: int) -> float: +def verify_perfconfig(perfconfig: str, config: PerfConfiguration, paths: Paths, options: Options, + gpu_id: int) -> float: """Verify a performance config by running with profiling. - Returns the execution time in nanoseconds, or NaN if verification fails. + Returns the execution time in nanoseconds, or raises TuningError on failure. """ config.set_perfconfig(perfconfig) @@ -1230,57 +1184,62 @@ def verify_perfconfig(perfconfig, config, paths: Paths, options: Options, gpu_id return nano_seconds -def find_best_perfconfig(tuning_output, config, paths: Paths, options: Options, - gpu_id: int) -> tuple[str, float, List[Dict]]: +def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, paths: Paths, + options: Options, + gpu_id: int) -> tuple[Optional[str], Optional[float], List[Dict]]: """Parse tuning driver output and find the best performing perfconfig. Returns the winning config, its TFLOPS, and all entries. 
""" - max_tflops = -np.inf - winning_config = "None" + max_tflops: Optional[float] = None + winning_config: Optional[str] = None entries = [] for line in tuning_output: result = line.strip() if not result: continue + + parts = result.split('\t') + if len(parts) < 2: + logger.debug(f"Skipping malformed tuning output line: '{result}'") + continue + + perfconfig = parts[0] + time = parts[-1] try: - parts = result.split('\t') - if len(parts) < 2: - continue # Skip silently - can happen during normal shutdown - perfconfig = parts[0] - time = parts[-1] if time == "N/A": nano_seconds = np.nan measurements = None else: nano_seconds = float(time) measurements = json.loads(parts[1]) if len(parts) == 3 else None - except ValueError: - continue # Skip silently - can happen during normal shutdown + except (ValueError, json.JSONDecodeError): + logger.debug(f"Skipping malformed tuning output line: '{result}'") + continue config.set_perfconfig(perfconfig) entry = config.table_entry(nano_seconds) if options.debug: entry["Measurements"] = measurements entries.append(entry) - these_tflops = entry['TFlops'] if options.verify_perfconfigs and not np.isnan(nano_seconds): verify_ns = verify_perfconfig(perfconfig, config, paths, options, gpu_id) if np.isnan(verify_ns): raise TuningError(f"Verification returned NaN for perfconfig '{perfconfig}'") - if not np.isnan(these_tflops) and these_tflops > max_tflops: + these_tflops = entry['TFlops'] + if not np.isnan(these_tflops) and (max_tflops is None or these_tflops > max_tflops): max_tflops = these_tflops winning_config = perfconfig return winning_config, max_tflops, entries -def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: int, - num_compile_threads: int) -> Dict[str, Any]: - """Tune a single configuration and return the results.""" +def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Options, gpu_id: int, + num_compile_threads: int) -> TuningResult: + """Tune a single configuration and return the result.""" tuning_driver_args = [ f"--tuning-space={options.tuning_space_kind}", f"--num-iterations={MLIR_N_REPEATS}", f"--warmup-iterations={WARMUP_ITERATIONS}", "--use-median", f"--sleep-us={SLEEP_US}", @@ -1295,14 +1254,17 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: try: rocmlir_gen_command = [paths.mlir_paths.rocmlir_gen_path] tuning_driver_command = [paths.mlir_paths.rocmlir_tuning_driver_path] + tuning_driver_args + if not test_vector.endswith(".mlir"): command_line = test_vector.split(sep=' ') try: config = conf_class.from_command_line(command_line, options.arch, options.num_cu, options.num_chiplets) except ValueError as e: - return {'success': False, 'error': str(e)} - test_vector = config.to_command_line() + return TuningResult(test_vector=test_vector, + success=False, + gpu_id=gpu_id, + error=str(e)) command_line_options = config.generate_mlir_driver_commandline( options.rocmlir_gen_flags, kernel_repeats=None) # Note, we don't need the -ph, this goes to the tuning driver. 
@@ -1329,23 +1291,25 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: env=env) output, err = tuning_key.communicate() if tuning_key.returncode != 0: - return { - 'success': - False, - 'error': - format_error("Failed to generate tuning key", + error = format_error("Failed to generate tuning key", command=' '.join(rocmlir_gen_command), stderr=err.decode('utf-8'), exit_code=tuning_key.returncode, gpu_id=gpu_id) - } + return TuningResult(test_vector=test_vector, + success=False, + gpu_id=gpu_id, + error=error) result = output.decode('utf-8').strip().split('\t') command_line = result[2].split(sep=' ') try: config = conf_class.from_command_line(command_line, options.arch, options.num_cu, options.num_chiplets) except ValueError as e: - return {'success': False, 'error': str(e)} + return TuningResult(test_vector=test_vector, + success=False, + gpu_id=gpu_id, + error=str(e)) tuning_driver_command += [test_vector] tuning_driver = subprocess.Popen(tuning_driver_command, stdout=subprocess.PIPE, @@ -1359,16 +1323,12 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: tuning_stdout, tuning_stderr = tuning_driver.communicate() if tuning_driver.returncode != 0: - return { - 'success': - False, - 'error': - format_error("Tuning pipeline failed", + error = format_error("Tuning pipeline failed", command=tuning_pipeline, stderr=tuning_stderr.decode('utf-8'), exit_code=tuning_driver.returncode, gpu_id=gpu_id) - } + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=error) else: # Log any stderr output from tuning driver because it may contain warnings tuning_stderr_str = tuning_stderr.decode('utf-8').strip() @@ -1379,43 +1339,48 @@ def tune_config(test_vector, conf_class, paths: Paths, options: Options, gpu_id: winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, options, gpu_id) except TuningError as e: - return {'success': False, 'error': str(e)} + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=str(e)) finally: kill_process(rocmlir_gen) kill_process(tuning_driver) - if winning_config == "None": - return {'success': False, 'error': "No valid perf config found"} + if winning_config is None: + return TuningResult(test_vector=test_vector, + success=False, + gpu_id=gpu_id, + error="No valid perf config found") verify_tflops = None if options.verify_mode != "none": try: verify_ns = verify_perfconfig(winning_config, config, paths, options, gpu_id) except TuningError as e: - return {'success': False, 'error': str(e)} + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=str(e)) if np.isnan(verify_ns): - return { - 'success': False, - 'error': f"Verification returned NaN for winning perfconfig '{winning_config}'" - } + return TuningResult( + test_vector=test_vector, + success=False, + gpu_id=gpu_id, + error=f"Verification returned NaN for winning perfconfig '{winning_config}'") verify_tflops = config.compute_tflops(verify_ns) - return { - 'success': True, - 'winning_config': winning_config, - 'max_tflops': max_tflops, - 'entries': entries, - 'verify_tflops': verify_tflops - } + return TuningResult(test_vector=test_vector, + success=True, + gpu_id=gpu_id, + winning_config=winning_config, + max_tflops=max_tflops, + entries=entries, + verify_tflops=verify_tflops) def tune_configs(ctx: TuningContext) -> bool: """Tune multiple configurations in parallel across available GPUs.""" # Load cached results unless retuning is forced - 
cache = TunedConfigsCache() - if not ctx.options.retune: + if ctx.options.retune: + cache = TunedConfigsCache() + else: cache = TunedConfigsCache.from_output_file(ctx.options) if cache.count() > 0: logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") @@ -1430,11 +1395,11 @@ def tune_configs(ctx: TuningContext) -> bool: crashed_count = state.crashed_count() if crashed_count > 0: - logger.warning(f"Detected {crashed_count} crashed config(s) from previous run") + logger.warning(f"Found {crashed_count} crashed config(s) in state file") failed_count = state.failed_count() if failed_count > 0: - logger.info(f"Found {failed_count} failed config(s) in state file") + logger.warning(f"Found {failed_count} failed config(s) in state file") state_file.save() @@ -1457,23 +1422,24 @@ def tune_configs(ctx: TuningContext) -> bool: if skipped_success > 0: logger.info(f"Skipping {skipped_success} already tuned config(s)") if skipped_failed > 0: - logger.info(f"Skipping {skipped_failed} failed/crashed config(s)") + logger.info( + f"Skipping {skipped_failed} failed/crashed config(s) - use '--retry-failed' to retune") if not pending_configs: logger.info("No configurations to tune") return True pool = GpuWorkerPool(ctx) - num_workers = min(pool.worker_count, len(ctx.configs)) + num_workers = min(pool.worker_count, len(pending_configs)) ctx.print_gpu_summary() # Prepare ETA tracker with historical data initial_times = [r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0] eta_tracker = ETATracker(total_configs=len(pending_configs), num_workers=num_workers, - initial_times=initial_times, - initial_ok_count=skipped_success, - initial_fail_count=skipped_failed) + success_times=initial_times, + ok_count=skipped_success, + fail_count=skipped_failed) def execute_tuning_task(test_vector: str) -> TuningResult: gpu_id = pool.acquire_gpu_for_thread() @@ -1484,85 +1450,78 @@ def execute_tuning_task(test_vector: str) -> TuningResult: compile_threads = ctx.get_compile_threads(gpu_id) result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, compile_threads) - return TuningResult(test_vector=test_vector, - success=result.get('success', False), - gpu_id=gpu_id, - elapsed_seconds=time.time() - start_time, - winning_config=result.get('winning_config'), - max_tflops=result.get('max_tflops'), - entries=result.get('entries', []), - verify_tflops=result.get('verify_tflops'), - error=result.get('error')) - - with OutputFileWriter(ctx.options.output, ctx.options) as results_writer: - with DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext( - ) as debug_writer: - - executor = None - progress_bar = None - - try: # No context manager for executor because we need to shutdown with wait=False - progress_bar = tqdm( - total=len(ctx.configs), - initial=total_skipped, - disable=ctx.options.quiet or not sys.stderr.isatty(), - file=sys.stderr, - desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", - unit="config", - leave=False, - bar_format= - '{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [t={elapsed}{postfix}]') + result.elapsed_seconds = time.time() - start_time + + return result + + has_errors = False + consecutive_failures = 0 + + with (OutputFileWriter(ctx.options.output, ctx.options) as results_writer, + DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext() as + debug_writer): + executor = None + progress_bar = None + + try: # No context manager for executor because we 
need to shutdown with wait=False + progress_bar = tqdm( + total=len(ctx.configs), + initial=total_skipped, + disable=ctx.options.quiet or not sys.stderr.isatty(), + file=sys.stderr, + desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", + unit="config", + leave=True, + bar_format= + '{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [t={elapsed}{postfix}]') + progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) + + executor = ThreadPoolExecutor(max_workers=num_workers) + pending_futures = { + executor.submit(execute_tuning_task, test_vector): test_vector + for test_vector in pending_configs + } + + for completed_future in as_completed(pending_futures): + result = completed_future.result() + + if result.success: + consecutive_failures = 0 + results_writer.write_result(result) + if debug_writer: + debug_writer.write_result(result) + state_file.set_success(result.test_vector) + else: + has_errors = True + consecutive_failures += 1 + state_file.set_failed(result.test_vector) + + error_msg = f"[GPU {result.gpu_id}] Tuning failed for '{result.test_vector}'" + if result.error: + error_msg += "\n" + result.error + logger.error(error_msg) + + eta_tracker.record(result) + progress_bar.update(1) progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) - executor = ThreadPoolExecutor(max_workers=num_workers) - pending_futures = { - executor.submit(execute_tuning_task, test_vector): test_vector - for test_vector in pending_configs - } - - has_errors = False - consecutive_failures = 0 - - for completed_future in as_completed(pending_futures): - result = completed_future.result() - - if result.success: - consecutive_failures = 0 - results_writer.write_result(result) - if debug_writer: - debug_writer.write_entries(result.entries) - state_file.set_success(result.test_vector) - else: - has_errors = True - consecutive_failures += 1 - state_file.set_failed(result.test_vector) - - error_msg = f"[GPU {result.gpu_id}] Tuning failed for '{result.test_vector}'" - if result.error: - error_msg += "\n" + result.error - logger.error(error_msg) - - if ctx.options.abort_on_error: - return False - - if consecutive_failures >= MAX_FAILURES: - logger.error("Aborting due to too many consecutive failures") - return False - - eta_tracker.record(result) - progress_bar.update(1) - progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) - - except KeyboardInterrupt: - logger.info("Tuning interrupted by user") - raise - finally: - if executor: - executor.shutdown(wait=False, cancel_futures=True) - if progress_bar: - progress_bar.close() - - state_file.finalize_interrupted() + if has_errors and ctx.options.abort_on_error: + return False + + if consecutive_failures >= MAX_FAILURES: + logger.error("Aborting due to too many consecutive failures") + return False + + except KeyboardInterrupt: + logger.info("Tuning interrupted by user") + raise + finally: + if executor: + executor.shutdown(wait=False, cancel_futures=True) + if progress_bar: + progress_bar.close() + + state_file.finalize_interrupted() if has_errors: logger.error("Encountered errors during tuning") @@ -1577,7 +1536,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: # ============================================================================= -def resolve_paths(op_type: Operation, parsed_args) -> Paths: +def resolve_paths(op_type: Operation, parsed_args: argparse.Namespace) -> Paths: """Resolve paths based on operation type and arguments.""" if op_type == Operation.FUSION: configs_path = "./fusion_config_file" @@ 
-1588,34 +1547,42 @@ def resolve_paths(op_type: Operation, parsed_args) -> Paths: return perfRunner.create_paths(configs_path, parsed_args.mlir_build_dir) -def extract_fusion_configs(test_dir, paths: Paths) -> Operation: - """Extract tuning configurations from fusion E2E test files.""" +def extract_fusion_configs(test_dir: str, paths: Paths) -> Operation: + """Extract tuning configurations from fusion E2E test files. + + Writes extracted configs to paths.configuration_file_path and returns the detected operation type. + """ all_configs = [] op_type = Operation.FUSION + for filename in glob.glob(test_dir + '/*mlir'): logger.info(f"Extract from: {filename}") test_entry = perfRunner.get_fusion_test_info(filename, paths) if not test_entry: continue + test_vector = test_entry['testVector'] if not test_vector: continue + if test_vector in all_configs: - logger.info("An entry already exists in the tuning DB") + logger.debug("Duplicate entry skipped") continue + command_line = test_vector.split(sep=' ') if command_line[0].startswith('conv'): if op_type == Operation.FUSION: op_type = Operation.CONV elif op_type != Operation.CONV: - logger.warning(f"Invalid config op: {test_vector}") + logger.warning(f"Mixed operation types, skipping: {test_vector}") continue else: if op_type == Operation.FUSION: op_type = Operation.GEMM elif op_type != Operation.GEMM: - logger.warning(f"Invalid config op: {test_vector}") + logger.warning(f"Mixed operation types, skipping: {test_vector}") continue + all_configs.append(test_vector) with open(paths.configuration_file_path, 'w') as outfile: @@ -1635,7 +1602,8 @@ def get_config_class(op_type: Operation) -> type: Operation.CONV_GEMM: ConvGemmConfiguration, } - return config_classes.get(op_type, PerfConfiguration) + assert op_type in config_classes, f"No config class for operation: {op_type}" + return config_classes[op_type] def load_configs_from_stdin() -> str: @@ -1647,10 +1615,10 @@ def load_configs_from_stdin() -> str: return path -def load_configs(op_type: Operation, parsed_args, paths: Paths) -> List[str]: +def load_configs(op_type: Operation, parsed_args: argparse.Namespace, paths: Paths) -> List[str]: """Load configurations based on operation type and arguments.""" if parsed_args.config: - return parsed_args.config + return [parsed_args.config] loaders = { Operation.CONV: @@ -1667,11 +1635,8 @@ def load_configs(op_type: Operation, parsed_args, paths: Paths) -> List[str]: lambda: perfRunner.get_conv_gemm_configurations(paths.configuration_file_path), } - loader = loaders.get(op_type) - if loader: - return loader() - - raise ValueError(f"Unsupported operation type: {op_type}") + assert op_type in loaders, f"No config loader for operation: {op_type}" + return loaders[op_type]() # ============================================================================= @@ -1679,7 +1644,9 @@ def load_configs(op_type: Operation, parsed_args, paths: Paths) -> List[str]: # ============================================================================= -def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=None): +def parse_arguments(gpu_topology: GpuTopology, + available_gpus: List[int], + args=None) -> argparse.Namespace: """Parse and validate command-line arguments.""" parser = TuningArgumentParser( prog="tuningRunner.py", @@ -1694,11 +1661,12 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "--configs-file", "--configs_file", # for backward compatibility type=str, - help="Path to file containing list of configurations to 
tune") + metavar='FILE', + help="Path to file containing list of configurations to tune. Use '-' for stdin.") config_group.add_argument("--config", type=str, - nargs='*', + metavar='CONFIG', help="Specific config to tune. Format depends on --op type.") parser.add_argument("--op", @@ -1712,6 +1680,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "--output", type=str, default="tuning_results_local.tsv", + metavar='FILE', help= "Output file path for tuning results in TSV format. Results will be appended if file exists. Use '-' for stdout." ) @@ -1720,6 +1689,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "--mlir-build-dir", type=str, default=perfRunner.find_mlir_build_dir(), + metavar='DIR', help= "Path to rocMLIR build directory containing rocmlir-gen, rocmlir-driver, rocmlir-tuning-driver, and other build artifacts", ) @@ -1729,6 +1699,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "--rocmlir_gen_flags", # for backward compatibility type=str, default="", + metavar='FLAGS', help="Additional flags to pass to rocmlir-gen") parser.add_argument("-d", @@ -1742,17 +1713,19 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N choices=["quick", "full", "greedy", "exhaustive"], help="Tuning space kind to use") - parser.add_argument("-q", - "--quiet", - action='store_true', - default=False, - help="Suppress non-error output") + logging_group = parser.add_mutually_exclusive_group() - parser.add_argument("-v", - "--verbose", - action='store_true', - default=False, - help="Enable verbose output, including commands being executed") + logging_group.add_argument("-q", + "--quiet", + action='store_true', + default=False, + help="Suppress non-error output") + + logging_group.add_argument("-v", + "--verbose", + action='store_true', + default=False, + help="Enable verbose output, including commands being executed") parser.add_argument("--verify-mode", default="gpu", @@ -1772,6 +1745,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "--test_dir", # for backward compatibility default="../mlir/test/fusion/resnet50-e2e", type=str, + metavar='DIR', help= "Directory containing fusion E2E tests to extract configs from. Only used when --op=fusion." ) @@ -1783,6 +1757,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N "fp8_fp8", "f4E2M1FN" ], default=["f32", "f16", "i8"], + metavar='TYPE', help="Force a set of data types for gemm tuning. Only used when --op=gemm.") parser.add_argument( @@ -1790,6 +1765,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N nargs='+', choices=["f32", "f8E8M0FNU"], default=None, + metavar='TYPE', help="Force a set of scale types for gemm tuning. 
Only used when --op=gemm.") parser.add_argument("--tflops", @@ -1841,8 +1817,7 @@ def parse_arguments(gpu_topology: GpuTopology, available_gpus: List[int], args=N def main(args=None): - global logger - + numa_topology = NumaTopology.discover() gpu_topology = GpuTopology.discover() available_gpus = sorted(gpu_topology.gpus.keys()) @@ -1852,7 +1827,7 @@ def main(args=None): parsed_args = parse_arguments(gpu_topology, available_gpus, args) - logger = setup_logger(quiet=parsed_args.quiet, verbose=parsed_args.verbose) + setup_logger(quiet=parsed_args.quiet, verbose=parsed_args.verbose) stdin_temp_file = None try: @@ -1907,7 +1882,7 @@ def main(args=None): paths=paths, options=options, gpu_topology=gpu_topology, - numa_topology=NumaTopology.discover()) + numa_topology=numa_topology) try: tuning_succeeded = tune_configs(ctx) From 110dccbacd1c13a9837bef429aff3835708481f5 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sun, 18 Jan 2026 12:11:14 +0000 Subject: [PATCH 11/23] Simplify state file and support multiple contexts. --- mlir/utils/performance/tuningRunner.py | 242 +++++++++---------------- 1 file changed, 82 insertions(+), 160 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index eeec94be3d58..c7521b313221 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -64,7 +64,6 @@ MLIR_N_REPEATS = 10 WARMUP_ITERATIONS = 1 SLEEP_US = 100 # 0.1 ms -MAX_FAILURES = 20 # ============================================================================= # Logging Setup @@ -311,66 +310,36 @@ class ConfigState(Enum): CRASHED = "crashed" # Process crashed while tuning (detected on startup) -@dataclass(frozen=True) -class TuningStateContext: - """Context that identifies a tuning run. 
State is invalidated if context changes.""" - arch: str - num_cu: int - tuning_space: str - - def matches(self, other: 'TuningStateContext') -> bool: - return (self.arch == other.arch and self.num_cu == other.num_cu and - self.tuning_space == other.tuning_space) - - @dataclass class TuningState: - """Persistent state for tuning runs, survives crashes and interrupts.""" - context: TuningStateContext + """State tracking for configs within a single context.""" configs: Dict[str, ConfigState] = field(default_factory=dict) def set_running(self, test_vector: str) -> None: - """Mark a config as currently running.""" self.configs[test_vector] = ConfigState.RUNNING def set_failed(self, test_vector: str) -> None: - """Mark a config as failed.""" self.configs[test_vector] = ConfigState.FAILED def set_interrupted(self, test_vector: str) -> None: - """Mark a config as interrupted by user.""" self.configs[test_vector] = ConfigState.INTERRUPTED - def set_crashed(self, test_vector: str) -> None: - """Mark a config as crashed.""" - self.configs[test_vector] = ConfigState.CRASHED - def remove(self, test_vector: str) -> None: - """Remove a config from state (e.g., on success).""" self.configs.pop(test_vector, None) def should_skip(self, test_vector: str) -> bool: - """Check if a config should be skipped (failed or crashed).""" return self.configs.get(test_vector) in (ConfigState.FAILED, ConfigState.CRASHED) - def _count_by_state(self, *states: ConfigState) -> int: - """Count configs in any of the given states.""" - return sum(1 for s in self.configs.values() if s in states) + def is_empty(self) -> bool: + return not self.configs def failed_count(self) -> int: - """Count of failed configs.""" - return self._count_by_state(ConfigState.FAILED) + return sum(1 for s in self.configs.values() if s == ConfigState.FAILED) def crashed_count(self) -> int: - """Count of crashed configs.""" - return self._count_by_state(ConfigState.CRASHED) - - def skip_count(self) -> int: - """Count of configs that should be skipped (failed + crashed).""" - return self._count_by_state(ConfigState.FAILED, ConfigState.CRASHED) + return sum(1 for s in self.configs.values() if s == ConfigState.CRASHED) def promote_running_to_interrupted(self) -> int: - """Move all RUNNING configs to INTERRUPTED (clean shutdown). Returns count.""" count = 0 for tv in self.configs: if self.configs[tv] == ConfigState.RUNNING: @@ -380,137 +349,114 @@ def promote_running_to_interrupted(self) -> int: class TuningStateFile: - """Manages reading and writing of tuning state to a JSON file. + """Manages multi-context tuning state in a JSON file. + + File format: + { + "contexts": { + "/": { + "test_vector_1": "failed", + "test_vector_2": "crashed" + } + } + } - If filepath is None, all operations are no-ops (null object pattern). + If filepath is None, all operations are no-ops. """ - def __init__(self, filepath: Optional[str]): + def __init__(self, filepath: Optional[str], arch: str, tuning_space: str): self.filepath = filepath + self.context_key = f"{arch}/{tuning_space}" self._lock = threading.Lock() - self._state: Optional[TuningState] = None + self._all_contexts: Dict[str, Dict[str, str]] = {} # context_key -> {tv -> state_str} + self._state = TuningState() - def load(self, expected_context: TuningStateContext) -> 'TuningStateFile': - """Load state from file. Returns self for chaining. 
+ self._load() + self._save_locked() # Persist any state transitions from load - On load: - - INTERRUPTED configs are demoted to PENDING (removed from state) - - RUNNING configs are promoted to CRASHED (indicates previous crash) - """ - if not self.filepath: - self._state = TuningState(context=expected_context) - return self + def _load(self) -> None: + """Load state from file. - if not os.path.exists(self.filepath): - self._state = TuningState(context=expected_context) - return self + For the active context only: + - INTERRUPTED configs are removed (will be retried) + - RUNNING configs become CRASHED (stale = crash) + """ + if not self.filepath or not os.path.exists(self.filepath): + return try: with open(self.filepath, 'r') as f: data = json.load(f) + self._all_contexts = data.get('contexts', {}) + except (json.JSONDecodeError, TypeError, OSError) as e: + logger.warning(f"Failed to load state file, starting fresh: {e}") + return - file_context = TuningStateContext(arch=data.get('arch', ''), - num_cu=data.get('numCUs', 0), - tuning_space=data.get('tuningSpace', '')) - - if not file_context.matches(expected_context): - logger.warning("State file context mismatch, starting fresh") - self._state = TuningState(context=expected_context) - return self - - configs = {} - for tv, state_str in data.get('configs', {}).items(): + # Process configs for active context with state transitions + if self.context_key in self._all_contexts: + for tv, state_str in self._all_contexts[self.context_key].items(): try: - config_state = ConfigState(state_str) - # Demote INTERRUPTED to PENDING (don't add to configs) - if config_state == ConfigState.INTERRUPTED: - continue - # Promote RUNNING to CRASHED (stale running = crash) - if config_state == ConfigState.RUNNING: - config_state = ConfigState.CRASHED - configs[tv] = config_state + state = ConfigState(state_str) + if state == ConfigState.INTERRUPTED: + continue # Remove - will retry + if state == ConfigState.RUNNING: + state = ConfigState.CRASHED # Stale running = crashed + self._state.configs[tv] = state except ValueError: - pass # Skip invalid states - - self._state = TuningState(context=expected_context, configs=configs) - return self - - except (json.JSONDecodeError, KeyError, TypeError) as e: - logger.warning(f"Failed to load state file: {e}") - self._state = TuningState(context=expected_context) - return self + logger.warning(f"Unknown state '{state_str}' for config '{tv}' in state file") @property def state(self) -> TuningState: - """Get the current state. Must call load() first.""" - if self._state is None: - raise RuntimeError("State not loaded. Call load() first.") return self._state def _save_locked(self) -> None: - """Save state to file atomically. 
Assumes lock is held.""" - if not self.filepath or not self._state: + if not self.filepath: return - data = { - 'arch': self._state.context.arch, - 'numCUs': self._state.context.num_cu, - 'tuningSpace': self._state.context.tuning_space, - 'configs': { + # Update active context in all_contexts + if not self._state.is_empty(): + self._all_contexts[self.context_key] = { tv: s.value for tv, s in self._state.configs.items() } - } + else: + self._all_contexts.pop(self.context_key, None) + + # Remove empty contexts + self._all_contexts = {k: v for k, v in self._all_contexts.items() if v} + + # Delete file if nothing left, otherwise save + if not self._all_contexts: + if os.path.exists(self.filepath): + os.remove(self.filepath) + return - # Write to temp file then rename for atomicity temp_path = self.filepath + '.tmp' with open(temp_path, 'w') as f: - json.dump(data, f, indent=2) + json.dump({'contexts': self._all_contexts}, f, indent=2) os.replace(temp_path, self.filepath) - def save(self) -> None: - """Save state to file atomically. No-op if filepath is None.""" + def set_running(self, test_vector: str) -> None: with self._lock: + self._state.set_running(test_vector) self._save_locked() - def delete(self) -> None: - """Delete the state file. No-op if filepath is None.""" - if not self.filepath: - return - - with self._lock: - if os.path.exists(self.filepath): - os.remove(self.filepath) - self._state = None - - def set_running(self, test_vector: str) -> None: - """Mark a config as running and save.""" - if self._state: - with self._lock: - self._state.set_running(test_vector) - self._save_locked() - def set_failed(self, test_vector: str) -> None: - """Mark a config as failed and save.""" - if self._state: - with self._lock: - self._state.set_failed(test_vector) - self._save_locked() + with self._lock: + self._state.set_failed(test_vector) + self._save_locked() def set_success(self, test_vector: str) -> None: - """Remove a config from state (success) and save.""" - if self._state: - with self._lock: - self._state.remove(test_vector) - self._save_locked() + with self._lock: + self._state.remove(test_vector) + self._save_locked() def finalize_interrupted(self) -> None: - """Mark any RUNNING configs as INTERRUPTED and save. Called on clean shutdown.""" - if self._state: - with self._lock: - interrupted_count = self._state.promote_running_to_interrupted() - if interrupted_count > 0: - logger.info(f"Marked {interrupted_count} running config(s) as interrupted") - self._save_locked() + """Mark RUNNING configs as INTERRUPTED on clean shutdown.""" + with self._lock: + count = self._state.promote_running_to_interrupted() + if count > 0: + logger.info(f"Marked {count} running config(s) as interrupted") + self._save_locked() def get_state_filepath(output_filepath: str) -> Optional[str]: @@ -1257,14 +1203,8 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio if not test_vector.endswith(".mlir"): command_line = test_vector.split(sep=' ') - try: - config = conf_class.from_command_line(command_line, options.arch, options.num_cu, - options.num_chiplets) - except ValueError as e: - return TuningResult(test_vector=test_vector, - success=False, - gpu_id=gpu_id, - error=str(e)) + config = conf_class.from_command_line(command_line, options.arch, options.num_cu, + options.num_chiplets) command_line_options = config.generate_mlir_driver_commandline( options.rocmlir_gen_flags, kernel_repeats=None) # Note, we don't need the -ph, this goes to the tuning driver. 
@@ -1302,14 +1242,8 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio error=error) result = output.decode('utf-8').strip().split('\t') command_line = result[2].split(sep=' ') - try: - config = conf_class.from_command_line(command_line, options.arch, options.num_cu, - options.num_chiplets) - except ValueError as e: - return TuningResult(test_vector=test_vector, - success=False, - gpu_id=gpu_id, - error=str(e)) + config = conf_class.from_command_line(command_line, options.arch, options.num_cu, + options.num_chiplets) tuning_driver_command += [test_vector] tuning_driver = subprocess.Popen(tuning_driver_command, stdout=subprocess.PIPE, @@ -1386,11 +1320,8 @@ def tune_configs(ctx: TuningContext) -> bool: logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") # Load state file - state_context = TuningStateContext(arch=ctx.options.arch, - num_cu=ctx.options.num_cu, - tuning_space=ctx.options.tuning_space_kind) - state_file = TuningStateFile(get_state_filepath(ctx.options.output)) - state_file.load(state_context) + state_file = TuningStateFile(get_state_filepath(ctx.options.output), ctx.options.arch, + ctx.options.tuning_space_kind) state = state_file.state crashed_count = state.crashed_count() @@ -1401,8 +1332,6 @@ def tune_configs(ctx: TuningContext) -> bool: if failed_count > 0: logger.warning(f"Found {failed_count} failed config(s) in state file") - state_file.save() - # Filter out already-tuned configs (unless --retune) pending_configs = ctx.configs skipped_success = 0 @@ -1455,7 +1384,6 @@ def execute_tuning_task(test_vector: str) -> TuningResult: return result has_errors = False - consecutive_failures = 0 with (OutputFileWriter(ctx.options.output, ctx.options) as results_writer, DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext() as @@ -1471,7 +1399,7 @@ def execute_tuning_task(test_vector: str) -> TuningResult: file=sys.stderr, desc=f"Tuning {ctx.conf_class.__name__} ({ctx.options.tuning_space_kind})", unit="config", - leave=True, + leave=False, bar_format= '{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [t={elapsed}{postfix}]') progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) @@ -1486,14 +1414,12 @@ def execute_tuning_task(test_vector: str) -> TuningResult: result = completed_future.result() if result.success: - consecutive_failures = 0 results_writer.write_result(result) if debug_writer: debug_writer.write_result(result) state_file.set_success(result.test_vector) else: has_errors = True - consecutive_failures += 1 state_file.set_failed(result.test_vector) error_msg = f"[GPU {result.gpu_id}] Tuning failed for '{result.test_vector}'" @@ -1508,10 +1434,6 @@ def execute_tuning_task(test_vector: str) -> TuningResult: if has_errors and ctx.options.abort_on_error: return False - if consecutive_failures >= MAX_FAILURES: - logger.error("Aborting due to too many consecutive failures") - return False - except KeyboardInterrupt: logger.info("Tuning interrupted by user") raise From 90132b2de8394cdd2f9d625f44096fa92d518c04 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Sun, 18 Jan 2026 12:25:47 +0000 Subject: [PATCH 12/23] Address copilot comments. 
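
The review fixes are small: the "Exctract" typo in the header parser is corrected, truncate() is annotated as taking and returning Optional[str], and the lazy TSV header emission moves its guard to the call site, so _write_header() itself becomes unconditional. A minimal sketch of that write path, reduced to the two methods involved (the class name _SketchWriter and the fixed column list below are illustrative only, not part of the patch):

    class _SketchWriter:
        def __init__(self, file):
            self.file = file
            self._header_written = False

        def _write_header(self):
            # Unconditional now; callers decide when a header is needed.
            print("# " + "\t".join(["arch", "testVector", "perfConfig"]), file=self.file)
            self._header_written = True

        def write_result(self, fields):
            if not self._header_written:
                self._write_header()
            print("\t".join(fields), file=self.file)
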
--- mlir/utils/performance/tuningRunner.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index c7521b313221..ed7a0de0bdaf 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -571,7 +571,7 @@ def _parse_header_line(line: str) -> Dict[str, int]: for i, col in enumerate(header_text.split('\t')): if not col: continue - # Exctract base column name (handles 'perfConfig (tuning_space)') + # Extract base column name (handles 'perfConfig (tuning_space)') col_name = col.split()[0] indices[col_name] = i @@ -835,9 +835,6 @@ def __exit__(self, exc_type, exc_value, traceback): self.file.close() def _write_header(self): - if self._header_written: - return - if self._is_appending: print("", file=self.file) # Blank line before new section @@ -852,15 +849,17 @@ def _write_header(self): if self.options.tflops: columns.append('TFlops') columns.append('elapsedSeconds') - print("# " + "\t".join(columns), file=self.file) + print("# " + "\t".join(columns), file=self.file) self.file.flush() + self._header_written = True def write_result(self, result: TuningResult): assert result.success and result.winning_config and result.max_tflops, "write_result called with invalid result" - self._write_header() + if not self._header_written: + self._write_header() fields = [ self.options.arch, @@ -870,8 +869,8 @@ def write_result(self, result: TuningResult): if self.options.tflops: fields.append(str(result.max_tflops)) fields.append(f"{result.elapsed_seconds:.1f}") - print("\t".join(fields), file=self.file) + print("\t".join(fields), file=self.file) self.file.flush() @@ -899,8 +898,8 @@ def write_result(self, result: TuningResult): sep='\t', header=not self._header_written, index=False) - self.file.flush() + self._header_written = True @@ -999,7 +998,7 @@ def format_error(context: str, max_lines: int = 10) -> str: """Format an error message with optional details.""" - def truncate(text: str) -> str: + def truncate(text: Optional[str]) -> Optional[str]: if not text or not text.strip(): return None lines = text.strip().splitlines() From b781b40c3eb1958a33b4cf22a3f1c6dde1a9ac97 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 19 Jan 2026 18:11:54 +0000 Subject: [PATCH 13/23] Show tuning-driver output during failures. 
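
When the tuning pipeline exits non-zero, the error report now carries the tail of the driver's stdout next to its stderr, and non-empty stderr from successful runs is logged at warning level rather than debug. The intent is roughly the helper below; stdout_tail is an illustrative name rather than a function added here, and since format_error() expects a plain string the tail is shown joined into one:

    def stdout_tail(stdout_text: str, n: int = 10) -> str:
        # Keep only the last n lines so failure logs stay readable.
        return "\n".join(stdout_text.splitlines()[-n:])

    # For example:
    #   format_error("Tuning pipeline failed",
    #                command=tuning_pipeline,
    #                stdout=stdout_tail(tuning_stdout.decode("utf-8")),
    #                stderr=tuning_errors,
    #                exit_code=tuning_driver.returncode, gpu_id=gpu_id)
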
--- mlir/utils/performance/tuningRunner.py | 79 +++++++++++++------------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index ed7a0de0bdaf..7b81fb2f3660 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -658,7 +658,9 @@ def _format_rate(self, seconds: float) -> str: return f"{seconds / 3600:.1f}h/cfg" def _format_eta(self, seconds: float) -> str: - if seconds < 60: + if seconds == 0: + return "0s" + elif seconds < 60: return "<1m" elif seconds < 3600: return f"{int(seconds // 60)}m" @@ -678,7 +680,7 @@ def get_postfix_str(self) -> str: eta = "n/a" if len(self.success_times) >= 3: median = statistics.median(self.success_times) - eta_seconds = (remaining / self.num_workers) * median + eta_seconds = (remaining / self.num_workers) * median if self.num_workers > 0 else 0 rate = self._format_rate(median) eta = self._format_eta(eta_seconds) @@ -735,9 +737,9 @@ def get_compile_threads(self, gpu_id: int) -> int: """Get the number of compile threads allocated to a GPU.""" return self._threads_per_gpu[gpu_id] - def print_gpu_summary(self): + def print_gpu_summary(self, num_workers: Optional[int] = None) -> None: """Print summary of GPU allocation.""" - num_active = len(self.options.gpu_ids) + num_active = num_workers or len(self.options.gpu_ids) lines = [f"Using {num_active} GPU(s)"] for gpu_id in self.options.gpu_ids[:num_active]: node = self.gpu_topology.get_numa_node(gpu_id) @@ -1255,20 +1257,23 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio # Note: communicate waits for process to terminate which might cause CI timeouts if tuning takes too long tuning_stdout, tuning_stderr = tuning_driver.communicate() + tuning_output = tuning_stdout.decode('utf-8').splitlines() + tuning_errors = tuning_stderr.decode('utf-8') + if tuning_driver.returncode != 0: - error = format_error("Tuning pipeline failed", - command=tuning_pipeline, - stderr=tuning_stderr.decode('utf-8'), - exit_code=tuning_driver.returncode, - gpu_id=gpu_id) + error = format_error( + "Tuning pipeline failed", + command=tuning_pipeline, + stdout=tuning_output[-10:], # Last 10 lines of stdout + stderr=tuning_errors, + exit_code=tuning_driver.returncode, + gpu_id=gpu_id) return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=error) else: # Log any stderr output from tuning driver because it may contain warnings - tuning_stderr_str = tuning_stderr.decode('utf-8').strip() - if tuning_stderr_str: - logger.debug(f"[GPU {gpu_id}] rocmlir-tuning-driver stderr:\n{tuning_stderr_str}") + if tuning_errors.strip(): + logger.warning(f"[GPU {gpu_id}] rocmlir-tuning-driver stderr:\n{tuning_errors}") - tuning_output = tuning_stdout.decode('utf-8').splitlines() winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, options, gpu_id) except TuningError as e: @@ -1310,29 +1315,27 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio def tune_configs(ctx: TuningContext) -> bool: """Tune multiple configurations in parallel across available GPUs.""" - # Load cached results unless retuning is forced + # Load tuned configs from output file (unless --retune) if ctx.options.retune: cache = TunedConfigsCache() else: cache = TunedConfigsCache.from_output_file(ctx.options) - if cache.count() > 0: - logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") # Load state file state_file 
= TuningStateFile(get_state_filepath(ctx.options.output), ctx.options.arch, ctx.options.tuning_space_kind) state = state_file.state - crashed_count = state.crashed_count() - if crashed_count > 0: - logger.warning(f"Found {crashed_count} crashed config(s) in state file") + if cache.count() > 0: + logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") + if state.crashed_count() > 0: + logger.warning(f"Found {state.crashed_count()} crashed config(s) in state file") + if state.failed_count() > 0: + logger.warning(f"Found {state.failed_count()} failed config(s) in state file") - failed_count = state.failed_count() - if failed_count > 0: - logger.warning(f"Found {failed_count} failed config(s) in state file") + pending_configs = ctx.configs # Filter out already-tuned configs (unless --retune) - pending_configs = ctx.configs skipped_success = 0 if not ctx.options.retune: pending_configs = [c for c in pending_configs if not cache.contains(c)] @@ -1359,7 +1362,7 @@ def tune_configs(ctx: TuningContext) -> bool: pool = GpuWorkerPool(ctx) num_workers = min(pool.worker_count, len(pending_configs)) - ctx.print_gpu_summary() + ctx.print_gpu_summary(num_workers=num_workers) # Prepare ETA tracker with historical data initial_times = [r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0] @@ -1369,27 +1372,14 @@ def tune_configs(ctx: TuningContext) -> bool: ok_count=skipped_success, fail_count=skipped_failed) - def execute_tuning_task(test_vector: str) -> TuningResult: - gpu_id = pool.acquire_gpu_for_thread() - - state_file.set_running(test_vector) - - start_time = time.time() - compile_threads = ctx.get_compile_threads(gpu_id) - result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, - compile_threads) - result.elapsed_seconds = time.time() - start_time - - return result - has_errors = False with (OutputFileWriter(ctx.options.output, ctx.options) as results_writer, DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext() as debug_writer): + executor = None progress_bar = None - try: # No context manager for executor because we need to shutdown with wait=False progress_bar = tqdm( total=len(ctx.configs), @@ -1403,6 +1393,19 @@ def execute_tuning_task(test_vector: str) -> TuningResult: '{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [t={elapsed}{postfix}]') progress_bar.set_postfix_str(eta_tracker.get_postfix_str()) + def execute_tuning_task(test_vector: str) -> TuningResult: + gpu_id = pool.acquire_gpu_for_thread() + + state_file.set_running(test_vector) + + start_time = time.time() + compile_threads = ctx.get_compile_threads(gpu_id) + result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, + compile_threads) + result.elapsed_seconds = time.time() - start_time + + return result + executor = ThreadPoolExecutor(max_workers=num_workers) pending_futures = { executor.submit(execute_tuning_task, test_vector): test_vector From 056b12c3fb90d3987ea9a0b781e40b835812e151 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 19 Jan 2026 20:58:26 +0000 Subject: [PATCH 14/23] Add --status option. 
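
The new -s/--status flag performs the usual cache and state-file bookkeeping, reports how many configurations are still pending for the selected output file and tuning space, and returns without tuning anything. Because main() takes an argument list, the same check can be driven from Python as well; a rough sketch, assuming tuningRunner.py is importable and that configs.txt and a rocMLIR build directory exist (both are placeholders, not part of this patch):

    import tuningRunner

    exit_code = tuningRunner.main(args=[
        "--op", "gemm",
        "--configs-file", "configs.txt",            # placeholder config list
        "--output", "tuning_results_local.tsv",
        "--status",                                 # report pending work only
    ])
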
--- mlir/utils/performance/tuningRunner.py | 45 +++++++++++++++----------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 7b81fb2f3660..7505b18d55b1 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -1313,7 +1313,7 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio verify_tflops=verify_tflops) -def tune_configs(ctx: TuningContext) -> bool: +def tune_configs(ctx: TuningContext, status_only: bool) -> bool: """Tune multiple configurations in parallel across available GPUs.""" # Load tuned configs from output file (unless --retune) if ctx.options.retune: @@ -1356,6 +1356,10 @@ def tune_configs(ctx: TuningContext) -> bool: logger.info( f"Skipping {skipped_failed} failed/crashed config(s) - use '--retry-failed' to retune") + if status_only: + logger.info(f"{len(pending_configs)}/{len(ctx.configs)} config(s) pending tuning") + return True + if not pending_configs: logger.info("No configurations to tune") return True @@ -1480,7 +1484,7 @@ def extract_fusion_configs(test_dir: str, paths: Paths) -> Operation: op_type = Operation.FUSION for filename in glob.glob(test_dir + '/*mlir'): - logger.info(f"Extract from: {filename}") + logger.info(f"Extracting fusion configs from: {filename}") test_entry = perfRunner.get_fusion_test_info(filename, paths) if not test_entry: continue @@ -1588,10 +1592,11 @@ def parse_arguments(gpu_topology: GpuTopology, metavar='FILE', help="Path to file containing list of configurations to tune. Use '-' for stdin.") - config_group.add_argument("--config", - type=str, - metavar='CONFIG', - help="Specific config to tune. Format depends on --op type.") + config_group.add_argument( + "--config", + type=str, + metavar='CONFIG', + help="Specific config to tune. Can be a config string or path to an .mlir file.") parser.add_argument("--op", "--operation", @@ -1737,11 +1742,16 @@ def parse_arguments(gpu_topology: GpuTopology, "Wait for all compilation tasks to complete before starting tuning. Useful for systems with shared CPU/GPU memory (e.g., APUs)." 
) + parser.add_argument("-s", + "--status", + action='store_true', + default=False, + help="Only show tuning status without performing any tuning") + return parser.parse_args(args) def main(args=None): - numa_topology = NumaTopology.discover() gpu_topology = GpuTopology.discover() available_gpus = sorted(gpu_topology.gpus.keys()) @@ -1753,16 +1763,15 @@ def main(args=None): setup_logger(quiet=parsed_args.quiet, verbose=parsed_args.verbose) + op_type = Operation.from_name(parsed_args.op) + + # Handle stdin for configs file stdin_temp_file = None - try: - # Handle stdin for configs file - if parsed_args.configs_file == '-': - stdin_temp_file = load_configs_from_stdin() - parsed_args.configs_file = stdin_temp_file + if parsed_args.configs_file == '-': + parsed_args.configs_file = load_configs_from_stdin() - op_type = Operation.from_name(parsed_args.op) + try: paths = resolve_paths(op_type, parsed_args) - if not paths.mlir_paths: logger.error("rocMLIR build dir was not provided/found") return 1 @@ -1770,9 +1779,7 @@ def main(args=None): if op_type == Operation.FUSION: op_type = extract_fusion_configs(parsed_args.test_dir, paths) - conf_class = get_config_class(op_type) configs = load_configs(op_type, parsed_args, paths) - finally: if stdin_temp_file: os.unlink(stdin_temp_file) @@ -1802,14 +1809,14 @@ def main(args=None): wait_for_compiles=parsed_args.wait_for_compiles) ctx = TuningContext(configs=configs, - conf_class=conf_class, + conf_class=get_config_class(op_type), paths=paths, options=options, gpu_topology=gpu_topology, - numa_topology=numa_topology) + numa_topology=NumaTopology.discover()) try: - tuning_succeeded = tune_configs(ctx) + tuning_succeeded = tune_configs(ctx, status_only=parsed_args.status) except KeyboardInterrupt: return 130 # 128 + SIGINT From 7041c6455a87df88df2b7e8ab04d4fb080ecf329 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Wed, 21 Jan 2026 14:08:28 +0000 Subject: [PATCH 15/23] Improve order of logs for easier tracking. 
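
Failures are now logged from inside tune_config() and verify_perfconfig() as they happen, prefixed with the GPU they ran on, instead of being carried back in TuningResult.error and printed afterwards; child processes killed by SIGINT/SIGTERM/SIGHUP/SIGQUIT are re-raised as KeyboardInterrupt so interrupts are not counted as tuning failures. The GPU prefix comes from a logging.LoggerAdapter; the self-contained snippet below shows the same pattern on its own (the logger name and GPU id are arbitrary examples):

    import logging

    class GpuLoggerAdapter(logging.LoggerAdapter):
        """Prefix every message with the GPU id carried in 'extra'."""

        def process(self, msg, kwargs):
            gpu_id = self.extra.get('gpu_id')
            if gpu_id is not None:
                return f"[GPU {gpu_id}] {msg}", kwargs
            return msg, kwargs

    logging.basicConfig(level=logging.INFO)
    log = GpuLoggerAdapter(logging.getLogger("tuningRunner"), {'gpu_id': 3})
    log.warning("rocmlir-tuning-driver stderr: ...")  # message becomes "[GPU 3] rocmlir-tuning-driver stderr: ..."
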
--- mlir/utils/performance/tuningRunner.py | 118 ++++++++++++++++--------- 1 file changed, 76 insertions(+), 42 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 850b2c77caab..80e2a56879d1 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -28,6 +28,7 @@ import json import logging import os +import signal import statistics import subprocess import sys @@ -111,6 +112,16 @@ def emit(self, record): self.handleError(record) +class GpuLoggerAdapter(logging.LoggerAdapter): + """Logger adapter that prefixes messages with GPU ID.""" + + def process(self, msg, kwargs): + gpu_id = self.extra.get('gpu_id') + if gpu_id is not None: + return f"[GPU {gpu_id}] {msg}", kwargs + return msg, kwargs + + def setup_logger(quiet: bool = False, verbose: bool = False) -> logging.Logger: """Configure and return a logger for tuningRunner.""" assert not (quiet and verbose), "quiet and verbose are mutually exclusive" @@ -126,6 +137,11 @@ def setup_logger(quiet: bool = False, verbose: bool = False) -> logging.Logger: logger.addHandler(TqdmLoggingHandler(use_color=sys.stderr.isatty())) +def get_gpu_logger(gpu_id: int) -> logging.LoggerAdapter: + """Get a logger adapter for a specific GPU.""" + return GpuLoggerAdapter(logger, {'gpu_id': gpu_id}) + + # Module-level logger logger: logging.Logger = logging.getLogger("tuningRunner") @@ -168,7 +184,6 @@ class TuningResult: max_tflops: Optional[float] = None entries: List[Dict] = field(default_factory=list) verify_tflops: Optional[float] = None - error: Optional[str] = None # ============================================================================= @@ -909,6 +924,20 @@ def write_result(self, result: TuningResult): # Utilities # ============================================================================= +# Signals that indicate user/system requested termination (should not be logged as failures) +TERMINATION_SIGNALS = frozenset({ + signal.SIGINT, # Ctrl+C + signal.SIGTERM, # Graceful termination request + signal.SIGHUP, # Terminal hangup + signal.SIGQUIT, # Quit from keyboard +}) + + +def raise_if_terminated(returncode: int) -> None: + """Raise KeyboardInterrupt if returncode indicates termination.""" + if -returncode in TERMINATION_SIGNALS: + raise KeyboardInterrupt() + class TuningArgumentParser(argparse.ArgumentParser): """ArgumentParser with custom validation for tuning arguments.""" @@ -1051,6 +1080,8 @@ def verify_perfconfig(perfconfig: str, config: PerfConfiguration, paths: Paths, Returns the execution time in nanoseconds, or raises TuningError on failure. 
""" + gpu_logger = get_gpu_logger(gpu_id) + config.set_perfconfig(perfconfig) command_line_options = config.generate_mlir_driver_commandline(options.rocmlir_gen_flags, @@ -1075,7 +1106,7 @@ def verify_perfconfig(perfconfig: str, config: PerfConfiguration, paths: Paths, verification_pipeline = " | ".join([ ' '.join(rocmlir_gen_command), ' '.join(rocmlir_driver_command), ' '.join(rocprof_command) ]) - logger.debug(f"[GPU {gpu_id}] Verifying perfconfig '{perfconfig}'\n{verification_pipeline}") + gpu_logger.debug(f"Verifying perfconfig '{perfconfig}'\nCommand: {verification_pipeline}") with tempfile.TemporaryDirectory() as tmpdir: p1 = None @@ -1105,6 +1136,7 @@ def verify_perfconfig(perfconfig: str, config: PerfConfiguration, paths: Paths, try: outs, errs = p3.communicate(timeout=600) + raise_if_terminated(p3.returncode) outs = outs.decode('utf-8') if p3.returncode != 0 or not CORRECT_RESULT_RE.search(outs): raise TuningError( @@ -1146,6 +1178,8 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa Returns the winning config, its TFLOPS, and all entries. """ + gpu_logger = get_gpu_logger(gpu_id) + max_tflops: Optional[float] = None winning_config: Optional[str] = None entries = [] @@ -1157,7 +1191,7 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa parts = result.split('\t') if len(parts) < 2: - logger.debug(f"Skipping malformed tuning output line: '{result}'") + gpu_logger.debug(f"Skipping malformed tuning output line: '{result}'") continue perfconfig = parts[0] @@ -1170,7 +1204,7 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa nano_seconds = float(time) measurements = json.loads(parts[1]) if len(parts) == 3 else None except (ValueError, json.JSONDecodeError): - logger.debug(f"Skipping malformed tuning output line: '{result}'") + gpu_logger.debug(f"Skipping malformed tuning output line: '{result}'") continue config.set_perfconfig(perfconfig) @@ -1195,6 +1229,8 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Options, gpu_id: int, num_compile_threads: int) -> TuningResult: """Tune a single configuration and return the result.""" + gpu_logger = get_gpu_logger(gpu_id) + tuning_driver_args = [ f"--tuning-space={options.tuning_space_kind}", f"--num-iterations={MLIR_N_REPEATS}", f"--warmup-iterations={WARMUP_ITERATIONS}", "--use-median", f"--sleep-us={SLEEP_US}", @@ -1239,16 +1275,15 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio stderr=subprocess.PIPE, env=env) output, err = tuning_key.communicate() + raise_if_terminated(tuning_key.returncode) if tuning_key.returncode != 0: - error = format_error("Failed to generate tuning key", - command=' '.join(rocmlir_gen_command), - stderr=err.decode('utf-8'), - exit_code=tuning_key.returncode, - gpu_id=gpu_id) - return TuningResult(test_vector=test_vector, - success=False, - gpu_id=gpu_id, - error=error) + gpu_logger.error( + format_error("Failed to generate tuning key", + command=' '.join(rocmlir_gen_command), + stderr=err.decode('utf-8'), + exit_code=tuning_key.returncode, + gpu_id=gpu_id)) + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) result = output.decode('utf-8').strip().split('\t') command_line = result[2].split(sep=' ') config = conf_class.from_command_line(command_line, options.arch, options.num_cu, @@ -1260,55 +1295,55 @@ def tune_config(test_vector: str, conf_class: type, 
paths: Paths, options: Optio env=env) tuning_pipeline = ' '.join(tuning_driver_command) - logger.debug(f"[GPU {gpu_id}] Tuning '{test_vector}'\n{tuning_pipeline}") + gpu_logger.debug(f"Tuning '{test_vector}'\nCommand: {tuning_pipeline}") # Note: communicate waits for process to terminate which might cause CI timeouts if tuning takes too long tuning_stdout, tuning_stderr = tuning_driver.communicate() + raise_if_terminated(tuning_driver.returncode) + tuning_output = tuning_stdout.decode('utf-8').splitlines() tuning_errors = tuning_stderr.decode('utf-8') if tuning_driver.returncode != 0: - error = format_error( - "Tuning pipeline failed", - command=tuning_pipeline, - stdout=tuning_output[-10:], # Last 10 lines of stdout - stderr=tuning_errors, - exit_code=tuning_driver.returncode, - gpu_id=gpu_id) - return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=error) + gpu_logger.error( + format_error( + "Tuning pipeline failed", + command=tuning_pipeline, + stdout=tuning_output[-10:], # Last 10 lines of stdout + stderr=tuning_errors, + exit_code=tuning_driver.returncode, + gpu_id=gpu_id)) + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) else: # Log any stderr output from tuning driver because it may contain warnings if tuning_errors.strip(): - logger.warning(f"[GPU {gpu_id}] rocmlir-tuning-driver stderr:\n{tuning_errors}") + gpu_logger.warning(f"rocmlir-tuning-driver stderr:\n{tuning_errors}") winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, options, gpu_id) except TuningError as e: - return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=str(e)) + gpu_logger.error(str(e)) + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) finally: kill_process(rocmlir_gen) kill_process(tuning_driver) if winning_config is None: - return TuningResult(test_vector=test_vector, - success=False, - gpu_id=gpu_id, - error="No valid perf config found") + gpu_logger.error("No valid perf config found") + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) verify_tflops = None if options.verify_mode != "none": try: verify_ns = verify_perfconfig(winning_config, config, paths, options, gpu_id) except TuningError as e: - return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id, error=str(e)) + gpu_logger.error(str(e)) + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) if np.isnan(verify_ns): - return TuningResult( - test_vector=test_vector, - success=False, - gpu_id=gpu_id, - error=f"Verification returned NaN for winning perfconfig '{winning_config}'") + gpu_logger.error(f"Verification returned NaN for winning perfconfig '{winning_config}'") + return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) verify_tflops = config.compute_tflops(verify_ns) @@ -1416,6 +1451,11 @@ def execute_tuning_task(test_vector: str) -> TuningResult: compile_threads) result.elapsed_seconds = time.time() - start_time + if result.success: + state_file.set_success(result.test_vector) + else: + state_file.set_failed(result.test_vector) + return result executor = ThreadPoolExecutor(max_workers=num_workers) @@ -1431,15 +1471,9 @@ def execute_tuning_task(test_vector: str) -> TuningResult: results_writer.write_result(result) if debug_writer: debug_writer.write_result(result) - state_file.set_success(result.test_vector) else: has_errors = True - state_file.set_failed(result.test_vector) - - error_msg = f"[GPU {result.gpu_id}] Tuning 
failed for '{result.test_vector}'" - if result.error: - error_msg += "\n" + result.error - logger.error(error_msg) + logger.error(f"Tuning failed for '{result.test_vector}' on GPU {result.gpu_id}") eta_tracker.record(result) progress_bar.update(1) @@ -1825,7 +1859,7 @@ def main(args=None): try: tuning_succeeded = tune_configs(ctx, status_only=parsed_args.status) except KeyboardInterrupt: - return 130 # 128 + SIGINT + return 128 + signal.SIGINT return 0 if tuning_succeeded else 1 From 8f4259e9a25e2d7f4b3ed66dc3c4b1713c6948f3 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Wed, 21 Jan 2026 14:32:09 +0000 Subject: [PATCH 16/23] Update github ci python version. --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08a622077d13..8e0a3d5a82af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: py-checks: runs-on: ubuntu-latest container: - image: python:3.8 + image: python:3.10 options: --user root steps: - uses: actions/checkout@v4 From aeb430c1e6a241421f9806d1be41530a80799e06 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 2 Feb 2026 20:39:18 +0000 Subject: [PATCH 17/23] Fix state transitions. --- mlir/utils/performance/tuningRunner.py | 76 ++++++++++++++++---------- 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 80e2a56879d1..8ffa4662ea76 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -167,7 +167,7 @@ class Options: output: str abort_on_error: bool retune: bool - retry_failed: bool + retry_states: frozenset gpu_ids: List[int] num_cpus: Optional[int] wait_for_compiles: bool @@ -309,15 +309,15 @@ class ConfigState(Enum): State transitions: PENDING (implicit) -> RUNNING: Config starts tuning - RUNNING -> SUCCESS (implicit): Tuning completes successfully (removed from state, written to output) + RUNNING -> SUCCEEDED (implicit): Tuning completes successfully (removed from state, written to output) RUNNING -> FAILED: Tuning completes with error RUNNING -> INTERRUPTED: User interrupted (Ctrl+C) during tuning RUNNING -> CRASHED: Detected on next startup (stale RUNNING state) - FAILED/CRASHED -> PENDING: User requests retry with --retry-failed + -> PENDING: User requests retry with --retry - Note: PENDING and SUCCESS are implicit states: + Note: PENDING and SUCCEEDED are implicit states: - PENDING: not in state file AND not in output file - - SUCCESS: in output file (not tracked in state file) + - SUCCEEDED: in output file (not tracked in state file) """ RUNNING = "running" # Currently being tuned FAILED = "failed" # Tuning completed with error @@ -325,25 +325,36 @@ class ConfigState(Enum): CRASHED = "crashed" # Process crashed while tuning (detected on startup) +# States representing unsuccessful tuning outcomes that are skipped by default +UNSUCCESSFUL_STATES = frozenset({ConfigState.FAILED, ConfigState.CRASHED}) + + @dataclass class TuningState: """State tracking for configs within a single context.""" configs: Dict[str, ConfigState] = field(default_factory=dict) + _pre_running_states: Dict[str, ConfigState] = field(default_factory=dict) def set_running(self, test_vector: str) -> None: + if test_vector in self.configs: + self._pre_running_states[test_vector] = self.configs[test_vector] self.configs[test_vector] = ConfigState.RUNNING def set_failed(self, test_vector: str) -> None: 
self.configs[test_vector] = ConfigState.FAILED + self._pre_running_states.pop(test_vector, None) def set_interrupted(self, test_vector: str) -> None: self.configs[test_vector] = ConfigState.INTERRUPTED + self._pre_running_states.pop(test_vector, None) def remove(self, test_vector: str) -> None: self.configs.pop(test_vector, None) + self._pre_running_states.pop(test_vector, None) - def should_skip(self, test_vector: str) -> bool: - return self.configs.get(test_vector) in (ConfigState.FAILED, ConfigState.CRASHED) + def should_skip(self, test_vector: str, retry_states: frozenset = frozenset()) -> bool: + state = self.configs.get(test_vector) + return state in UNSUCCESSFUL_STATES and state not in retry_states def is_empty(self) -> bool: return not self.configs @@ -358,7 +369,8 @@ def promote_running_to_interrupted(self) -> int: count = 0 for tv in self.configs: if self.configs[tv] == ConfigState.RUNNING: - self.configs[tv] = ConfigState.INTERRUPTED + prev_state = self._pre_running_states.pop(tv, None) + self.configs[tv] = prev_state or ConfigState.INTERRUPTED count += 1 return count @@ -460,7 +472,7 @@ def set_failed(self, test_vector: str) -> None: self._state.set_failed(test_vector) self._save_locked() - def set_success(self, test_vector: str) -> None: + def set_succeeded(self, test_vector: str) -> None: with self._lock: self._state.remove(test_vector) self._save_locked() @@ -1379,25 +1391,29 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: pending_configs = ctx.configs # Filter out already-tuned configs (unless --retune) - skipped_success = 0 + skipped_successful = 0 if not ctx.options.retune: pending_configs = [c for c in pending_configs if not cache.contains(c)] - skipped_success = len(ctx.configs) - len(pending_configs) + skipped_successful = len(ctx.configs) - len(pending_configs) - # Filter out failed/crashed configs (unless --retry-failed or --retune) - skipped_failed = 0 - if not ctx.options.retry_failed and not ctx.options.retune: + # Filter out unsuccessful configs (unless --retry or --retune) + skipped_unsuccessful = 0 + if not ctx.options.retune: before_filter = len(pending_configs) - pending_configs = [c for c in pending_configs if not state.should_skip(c)] - skipped_failed = before_filter - len(pending_configs) + pending_configs = [ + c for c in pending_configs if not state.should_skip(c, ctx.options.retry_states) + ] + skipped_unsuccessful = before_filter - len(pending_configs) - total_skipped = skipped_success + skipped_failed + total_skipped = skipped_successful + skipped_unsuccessful - if skipped_success > 0: - logger.info(f"Skipping {skipped_success} already tuned config(s)") - if skipped_failed > 0: + if skipped_successful > 0: logger.info( - f"Skipping {skipped_failed} failed/crashed config(s) - use '--retry-failed' to retune") + f"Skipping {skipped_successful} already tuned config(s) - use '--retune' to retune") + if skipped_unsuccessful > 0: + logger.info( + f"Skipping {skipped_unsuccessful} unsuccessful config(s) - use '--retry ' to retry" + ) if status_only: logger.info(f"{len(pending_configs)}/{len(ctx.configs)} config(s) pending tuning") @@ -1416,8 +1432,8 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: eta_tracker = ETATracker(total_configs=len(pending_configs), num_workers=num_workers, success_times=initial_times, - ok_count=skipped_success, - fail_count=skipped_failed) + ok_count=skipped_successful, + fail_count=skipped_unsuccessful) has_errors = False @@ -1452,7 +1468,7 @@ def execute_tuning_task(test_vector: str) -> 
TuningResult: result.elapsed_seconds = time.time() - start_time if result.success: - state_file.set_success(result.test_vector) + state_file.set_succeeded(result.test_vector) else: state_file.set_failed(result.test_vector) @@ -1754,10 +1770,12 @@ def parse_arguments(gpu_topology: GpuTopology, default=False, help="Force retuning of all configs, ignoring existing results in the output file") - parser.add_argument("--retry-failed", - action='store_true', - default=False, - help="Retry previously failed/crashed configs instead of skipping them") + parser.add_argument("--retry", + nargs='+', + choices=["failed", "crashed"], + default=[], + metavar='STATE', + help="Retry configs in specified states") parser.add_argument("--gpus", type=int, @@ -1844,7 +1862,7 @@ def main(args=None): output=parsed_args.output, abort_on_error=parsed_args.abort_on_error, retune=parsed_args.retune, - retry_failed=parsed_args.retry_failed, + retry_states=frozenset(ConfigState(s) for s in parsed_args.retry), gpu_ids=parsed_args.gpus, num_cpus=parsed_args.num_cpus, wait_for_compiles=parsed_args.wait_for_compiles) From c39a3d200af169a6979f24b1b27346b651eb94ee Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Mon, 2 Feb 2026 20:53:05 +0000 Subject: [PATCH 18/23] Add timeout option. --- mlir/utils/performance/tuningRunner.py | 49 ++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 8ffa4662ea76..1d575f5fa44f 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -171,6 +171,7 @@ class Options: gpu_ids: List[int] num_cpus: Optional[int] wait_for_compiles: bool + timeout: Optional[int] @dataclass @@ -178,6 +179,7 @@ class TuningResult: """Result of tuning a single configuration.""" test_vector: str success: bool + timed_out: bool = False gpu_id: int = -1 elapsed_seconds: float = 0.0 winning_config: Optional[str] = None @@ -311,6 +313,7 @@ class ConfigState(Enum): PENDING (implicit) -> RUNNING: Config starts tuning RUNNING -> SUCCEEDED (implicit): Tuning completes successfully (removed from state, written to output) RUNNING -> FAILED: Tuning completes with error + RUNNING -> TIMED_OUT: Tuning exceeded timeout RUNNING -> INTERRUPTED: User interrupted (Ctrl+C) during tuning RUNNING -> CRASHED: Detected on next startup (stale RUNNING state) -> PENDING: User requests retry with --retry @@ -321,12 +324,13 @@ class ConfigState(Enum): """ RUNNING = "running" # Currently being tuned FAILED = "failed" # Tuning completed with error + TIMED_OUT = "timed_out" # Tuning exceeded timeout INTERRUPTED = "interrupted" # User interrupted during tuning (Ctrl+C) CRASHED = "crashed" # Process crashed while tuning (detected on startup) # States representing unsuccessful tuning outcomes that are skipped by default -UNSUCCESSFUL_STATES = frozenset({ConfigState.FAILED, ConfigState.CRASHED}) +UNSUCCESSFUL_STATES = frozenset({ConfigState.FAILED, ConfigState.TIMED_OUT, ConfigState.CRASHED}) @dataclass @@ -344,6 +348,10 @@ def set_failed(self, test_vector: str) -> None: self.configs[test_vector] = ConfigState.FAILED self._pre_running_states.pop(test_vector, None) + def set_timed_out(self, test_vector: str) -> None: + self.configs[test_vector] = ConfigState.TIMED_OUT + self._pre_running_states.pop(test_vector, None) + def set_interrupted(self, test_vector: str) -> None: self.configs[test_vector] = ConfigState.INTERRUPTED self._pre_running_states.pop(test_vector, None) @@ -362,6 +370,9 @@ 
def is_empty(self) -> bool: def failed_count(self) -> int: return sum(1 for s in self.configs.values() if s == ConfigState.FAILED) + def timed_out_count(self) -> int: + return sum(1 for s in self.configs.values() if s == ConfigState.TIMED_OUT) + def crashed_count(self) -> int: return sum(1 for s in self.configs.values() if s == ConfigState.CRASHED) @@ -472,6 +483,11 @@ def set_failed(self, test_vector: str) -> None: self._state.set_failed(test_vector) self._save_locked() + def set_timed_out(self, test_vector: str) -> None: + with self._lock: + self._state.set_timed_out(test_vector) + self._save_locked() + def set_succeeded(self, test_vector: str) -> None: with self._lock: self._state.remove(test_vector) @@ -1309,8 +1325,17 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio gpu_logger.debug(f"Tuning '{test_vector}'\nCommand: {tuning_pipeline}") - # Note: communicate waits for process to terminate which might cause CI timeouts if tuning takes too long - tuning_stdout, tuning_stderr = tuning_driver.communicate() + try: + tuning_stdout, tuning_stderr = tuning_driver.communicate(timeout=options.timeout) + except subprocess.TimeoutExpired: + gpu_logger.error( + format_error(f"Tuning timed out after {options.timeout}s", + command=tuning_pipeline, + gpu_id=gpu_id)) + return TuningResult(test_vector=test_vector, + success=False, + timed_out=True, + gpu_id=gpu_id) raise_if_terminated(tuning_driver.returncode) @@ -1385,6 +1410,8 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: logger.info(f"Found {cache.count()} tuned config(s) in {ctx.options.output}") if state.crashed_count() > 0: logger.warning(f"Found {state.crashed_count()} crashed config(s) in state file") + if state.timed_out_count() > 0: + logger.warning(f"Found {state.timed_out_count()} timed out config(s) in state file") if state.failed_count() > 0: logger.warning(f"Found {state.failed_count()} failed config(s) in state file") @@ -1469,6 +1496,8 @@ def execute_tuning_task(test_vector: str) -> TuningResult: if result.success: state_file.set_succeeded(result.test_vector) + elif result.timed_out: + state_file.set_timed_out(result.test_vector) else: state_file.set_failed(result.test_vector) @@ -1489,7 +1518,8 @@ def execute_tuning_task(test_vector: str) -> TuningResult: debug_writer.write_result(result) else: has_errors = True - logger.error(f"Tuning failed for '{result.test_vector}' on GPU {result.gpu_id}") + logger.error( + f"Tuning unsuccessful for '{result.test_vector}' on GPU {result.gpu_id}") eta_tracker.record(result) progress_bar.update(1) @@ -1772,11 +1802,17 @@ def parse_arguments(gpu_topology: GpuTopology, parser.add_argument("--retry", nargs='+', - choices=["failed", "crashed"], + choices=["failed", "timed_out", "crashed"], default=[], metavar='STATE', help="Retry configs in specified states") + parser.add_argument("--timeout", + type=int, + default=None, + metavar='SECONDS', + help="Timeout in seconds for tuning each config") + parser.add_argument("--gpus", type=int, nargs='+', @@ -1865,7 +1901,8 @@ def main(args=None): retry_states=frozenset(ConfigState(s) for s in parsed_args.retry), gpu_ids=parsed_args.gpus, num_cpus=parsed_args.num_cpus, - wait_for_compiles=parsed_args.wait_for_compiles) + wait_for_compiles=parsed_args.wait_for_compiles, + timeout=parsed_args.timeout) ctx = TuningContext(configs=configs, conf_class=get_config_class(op_type), From 2b6e394a6d09feb8509903418a556b8335bf870d Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Tue, 3 Feb 2026 14:14:15 +0000 
Subject: [PATCH 19/23] Improve output file format. --- mlir/utils/performance/tuningRunner.py | 216 +++++++++++++++---------- 1 file changed, 135 insertions(+), 81 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 1d575f5fa44f..f8a905469634 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -38,8 +38,9 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import nullcontext from dataclasses import dataclass, field +from datetime import datetime, timezone from enum import Enum -from typing import Any, Dict, List, Optional +from typing import Dict, List, Optional from collections import deque import numpy as np @@ -157,13 +158,13 @@ class Options: tuning_space_kind: str quiet: bool verbose: bool - arch: str + chip: str + arch: str # Old arch value for backwards compatibility num_cu: int num_chiplets: int rocmlir_gen_flags: str verify_mode: str verify_perfconfigs: bool - tflops: bool output: str abort_on_error: bool retune: bool @@ -181,7 +182,8 @@ class TuningResult: success: bool timed_out: bool = False gpu_id: int = -1 - elapsed_seconds: float = 0.0 + duration_seconds: float = 0.0 + timestamp: Optional[str] = None winning_config: Optional[str] = None max_tflops: Optional[float] = None entries: List[Dict] = field(default_factory=list) @@ -392,7 +394,7 @@ class TuningStateFile: File format: { "contexts": { - "<arch>/<tuning_space>": { + "<arch>/<num_cu>/<num_chiplets>/<tuning_space>": { "test_vector_1": "failed", "test_vector_2": "crashed" } @@ -402,9 +404,10 @@ class TuningStateFile: If filepath is None, all operations are no-ops. """ - def __init__(self, filepath: Optional[str], arch: str, tuning_space: str): + def __init__(self, filepath: Optional[str], arch: str, num_cu: int, num_chiplets: int, + tuning_space: str): self.filepath = filepath - self.context_key = f"{arch}/{tuning_space}" + self.context_key = f"{arch}/{num_cu}/{num_chiplets}/{tuning_space}" self._lock = threading.Lock() self._all_contexts: Dict[str, Dict[str, str]] = {} # context_key -> {tv -> state_str} self._state = TuningState() @@ -539,8 +542,10 @@ def count(self) -> int: def from_output_file(cls, options: Options) -> 'TunedConfigsCache': """Load previously tuned configurations from an output TSV file. - Format: # arch\tnumCUs\tnumChiplets\ttestVector\tperfConfig (tuning_space)\t[TFlops]\telapsedSeconds - Only loads entries matching current arch and tuning space. + Format (new): # arch\tnumCUs\tnumChiplets\ttestVector\tperfConfig\tTFlops\ttuningSpace\tcommitId\ttimestamp\tdurationSec + Format (old): # arch\tnumCUs\tnumChiplets\ttestVector\tperfConfig (tuning_space)\t[TFlops] + + Only loads entries matching current arch, num_cu, num_chiplets, and tuning space.
""" if options.output == '-' or not os.path.exists(options.output): return cls() @@ -548,10 +553,9 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': results: Dict[str, TuningResult] = {} current_commit = get_git_commit_hash() + warned_commits: set = set() - # Active section state - metadata: Dict[str, Optional[Any]] = {} - matching_section = False + header_tuning_space: Optional[str] = None column_indices: Dict[str, int] = {} with open(options.output, mode='r') as f: @@ -560,40 +564,24 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': if not line: continue - # Check for metadata line - if line.startswith('## '): - parts = line[3:].split(':') - if len(parts) == 2: - metadata[parts[0].strip()] = parts[1].strip() - continue - # Check for header line if cls._is_header_line(line): - # Determine if this section matches based on tuning space - matching_section = f'({options.tuning_space_kind})' in line - if matching_section: - column_indices = cls._parse_header_line(line) - # Warn if commit hashes differ - file_commit = metadata.get('commit', 'unknown') - if file_commit != current_commit: - logger.warning( - f"Loading tuned configs from different commit (file: {file_commit[:8]}, current: {current_commit[:8]})" - ) - - # Reset metadata for next section - metadata = {} + column_indices = cls._parse_header_line(line) + # Extract tuning space from header for old format (perfConfig (tuning_space)) + header_tuning_space = cls._extract_tuning_space_from_header(line) continue - # Skip other comment lines + # Skip comment lines if line.startswith('#'): continue - # Skip data lines from non-matching sections - if not matching_section or not column_indices: + # Skip if we haven't seen a header yet + if not column_indices: continue # Parse data line - result = cls._parse_data_line(line.split('\t'), column_indices, options.arch) + result = cls._parse_data_line(line.split('\t'), column_indices, options, + header_tuning_space, current_commit, warned_commits) if result: results[result.test_vector] = result @@ -604,6 +592,13 @@ def _is_header_line(line: str) -> bool: """Check if line is a column header.""" return line.startswith('# arch\t') + @staticmethod + def _extract_tuning_space_from_header(line: str) -> Optional[str]: + """Extract tuning space from old format header like 'perfConfig (quick)' or 'TFlops (quick)'.""" + import re + match = re.search(r'\((\w+)\)', line) + return match.group(1) if match else None + @staticmethod def _parse_header_line(line: str) -> Dict[str, int]: """Parse column header and return name -> index mapping.""" @@ -621,12 +616,15 @@ def _parse_header_line(line: str) -> Dict[str, int]: return indices @staticmethod - def _parse_data_line(fields: List[str], column_indices: Dict[str, int], - arch: str) -> Optional[TuningResult]: + def _parse_data_line(fields: List[str], column_indices: Dict[str, int], options: Options, + header_tuning_space: Optional[str], current_commit: str, + warned_commits: set) -> Optional[TuningResult]: """Parse a data line and return TuningResult if valid. 
A line is valid if: - - arch matches current system + - arch matches current system (chip or arch for backwards compatibility) + - numCUs and numChiplets match current system + - tuning space matches (from column or header) - testVector is present - perfConfig is present and not 'None' """ @@ -637,7 +635,24 @@ def get_field(name: str) -> Optional[str]: return fields[idx] return None - if get_field('arch') != arch: + # Check arch match (new format uses chip, old format used arch) + file_arch = get_field('arch') + if file_arch != options.chip and file_arch != options.arch: + return None + + # Check numCUs match + file_num_cu = get_field('numCUs') + if file_num_cu and file_num_cu != str(options.num_cu): + return None + + # Check numChiplets match + file_num_chiplets = get_field('numChiplets') + if file_num_chiplets and file_num_chiplets != str(options.num_chiplets): + return None + + # Check tuning space match (new format has column, old format used header) + file_tuning_space = get_field('tuningSpace') or header_tuning_space + if file_tuning_space != options.tuning_space_kind: return None test_vector = get_field('testVector') @@ -648,6 +663,7 @@ def get_field(name: str) -> Optional[str]: if not perf_config or perf_config == 'None': return None + # TFlops (optional) max_tflops = None tflops_str = get_field('TFlops') if tflops_str: @@ -658,18 +674,31 @@ def get_field(name: str) -> Optional[str]: except ValueError: pass - elapsed_seconds = 0.0 - elapsed_str = get_field('elapsedSeconds') - if elapsed_str: + # Duration (optional) + duration_seconds = 0.0 + duration_str = get_field('durationSec') + if duration_str: try: - elapsed_seconds = float(elapsed_str) + duration_seconds = float(duration_str) except ValueError: pass + # Timestamp (optional) + timestamp = get_field('timestamp') + + # Warn if commit differs (avoid spamming for same commit) + file_commit = get_field('commitId') + if file_commit and file_commit != current_commit and file_commit not in warned_commits: + logger.warning( + f"Loading tuned configs from different commit (file: {file_commit[:8]}, current: {current_commit[:8]})" + ) + warned_commits.add(file_commit) + return TuningResult(test_vector=test_vector, success=True, gpu_id=-1, - elapsed_seconds=elapsed_seconds, + duration_seconds=duration_seconds, + timestamp=timestamp, winning_config=perf_config, max_tflops=max_tflops) @@ -688,7 +717,7 @@ def record(self, result: TuningResult) -> None: self._processed += 1 if result.success: self.ok_count += 1 - self.success_times.append(result.elapsed_seconds) + self.success_times.append(result.duration_seconds) else: self.fail_count += 1 @@ -859,19 +888,25 @@ def _set_memory_policy(self, numa_node: int) -> None: class OutputFileWriter: """Context manager for writing tuning results to TSV file.""" + HEADER_COLUMNS = [ + 'arch', 'numCUs', 'numChiplets', 'testVector', 'perfConfig', 'TFlops', 'tuningSpace', + 'commitId', 'timestamp', 'durationSec' + ] + EXPECTED_HEADER = "# " + "\t".join(HEADER_COLUMNS) + def __init__(self, filepath: str, options: Options): self.filepath = filepath self.options = options self.file = None self._header_written = False - self._is_appending = False def __enter__(self): if self.filepath == '-': self.file = sys.stdout else: - self._is_appending = os.path.exists(self.filepath) and os.path.getsize( - self.filepath) > 0 + if os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0: + if self._find_last_header() == self.EXPECTED_HEADER: + self._header_written = True self.file = open(self.filepath, 'a') 
return self @@ -879,41 +914,50 @@ def __exit__(self, exc_type, exc_value, traceback): if self.file and self.file != sys.stdout: self.file.close() - def _write_header(self): - if self._is_appending: - print("", file=self.file) # Blank line before new section + def _find_last_header(self, chunk_size: int = 8192) -> Optional[str]: + """Find the last header line by reading from the end of file.""" + with open(self.filepath, 'rb') as f: + f.seek(0, 2) # Seek to end + file_size = f.tell() + remaining = b'' - # Metadata comments - print(f"## commit: {get_git_commit_hash()}", file=self.file) + pos = file_size + while pos > 0: + read_size = min(chunk_size, pos) + pos -= read_size + f.seek(pos) + chunk = f.read(read_size) + remaining - # TSV header with '# ' prefix - columns = [ - 'arch', 'numCUs', 'numChiplets', 'testVector', - f'perfConfig ({self.options.tuning_space_kind})' - ] - if self.options.tflops: - columns.append('TFlops') - columns.append('elapsedSeconds') + lines = chunk.split(b'\n') + remaining = lines[0] - print("# " + "\t".join(columns), file=self.file) - self.file.flush() + for line in reversed(lines[1:]): + if line.startswith(b'# arch\t'): + return line.decode('utf-8') + + if remaining.startswith(b'# arch\t'): + return remaining.decode('utf-8') + return None + + def _write_header(self): + print(self.EXPECTED_HEADER, file=self.file) + self.file.flush() self._header_written = True def write_result(self, result: TuningResult): - assert result.success and result.winning_config and result.max_tflops, "write_result called with invalid result" + assert result.success and result.winning_config and result.max_tflops and result.timestamp and result.duration_seconds > 0.0, "write_result called with invalid result" if not self._header_written: self._write_header() fields = [ - self.options.arch, + self.options.chip, str(self.options.num_cu), - str(self.options.num_chiplets), result.test_vector, result.winning_config + str(self.options.num_chiplets), result.test_vector, result.winning_config, + str(result.max_tflops), self.options.tuning_space_kind, + get_git_commit_hash(), result.timestamp, f"{result.duration_seconds:.1f}" ] - if self.options.tflops: - fields.append(str(result.max_tflops)) - fields.append(f"{result.elapsed_seconds:.1f}") print("\t".join(fields), file=self.file) self.file.flush() @@ -1238,7 +1282,7 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa config.set_perfconfig(perfconfig) entry = config.table_entry(nano_seconds) if options.debug: - entry["Measurements"] = measurements + entry["MeasurementsMs"] = measurements entries.append(entry) if options.verify_perfconfigs and not np.isnan(nano_seconds): @@ -1402,7 +1446,8 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: cache = TunedConfigsCache.from_output_file(ctx.options) # Load state file - state_file = TuningStateFile(get_state_filepath(ctx.options.output), ctx.options.arch, + state_file = TuningStateFile(get_state_filepath(ctx.options.output), ctx.options.chip, + ctx.options.num_cu, ctx.options.num_chiplets, ctx.options.tuning_space_kind) state = state_file.state @@ -1455,7 +1500,9 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: ctx.print_gpu_summary(num_workers=num_workers) # Prepare ETA tracker with historical data - initial_times = [r.elapsed_seconds for r in cache.get_all_results() if r.elapsed_seconds > 0.0] + initial_times = [ + r.duration_seconds for r in cache.get_all_results() if r.duration_seconds > 0.0 + ] eta_tracker = 
ETATracker(total_configs=len(pending_configs), num_workers=num_workers, success_times=initial_times, @@ -1464,8 +1511,12 @@ def tune_configs(ctx: TuningContext, status_only: bool) -> bool: has_errors = False + debug_enabled = ctx.options.debug and ctx.options.output != '-' + if ctx.options.debug and not debug_enabled: + logger.warning("Debug output disabled when writing to stdout") + with (OutputFileWriter(ctx.options.output, ctx.options) as results_writer, - DebugFileWriter(f"{ctx.options.output}.debug") if ctx.options.debug else nullcontext() as + DebugFileWriter(f"{ctx.options.output}.debug") if debug_enabled else nullcontext() as debug_writer): executor = None @@ -1488,11 +1539,13 @@ def execute_tuning_task(test_vector: str) -> TuningResult: state_file.set_running(test_vector) + timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') start_time = time.time() compile_threads = ctx.get_compile_threads(gpu_id) result = tune_config(test_vector, ctx.conf_class, ctx.paths, ctx.options, gpu_id, compile_threads) - result.elapsed_seconds = time.time() - start_time + result.duration_seconds = time.time() - start_time + result.timestamp = timestamp if result.success: state_file.set_succeeded(result.test_vector) @@ -1784,10 +1837,11 @@ def parse_arguments(gpu_topology: GpuTopology, metavar='TYPE', help="Force a set of scale types for gemm tuning. Only used when --op=gemm.") - parser.add_argument("--tflops", - action='store_true', - default=False, - help="Include achieved TFLOPS in the output alongside the winning config") + parser.add_argument( + "--tflops", + action='store_true', + default=False, + help="[Deprecated, TFlops is always included] Include achieved TFLOPS in the output") parser.add_argument("--abort-on-error", action='store_true', @@ -1884,7 +1938,8 @@ def main(args=None): num_cu = perfRunner.get_num_cu(chip) num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) - options = Options(arch=arch, + options = Options(chip=chip, + arch=arch, num_cu=num_cu, num_chiplets=num_chiplets, debug=parsed_args.debug, @@ -1894,7 +1949,6 @@ def main(args=None): rocmlir_gen_flags=parsed_args.rocmlir_gen_flags, verify_mode=parsed_args.verify_mode, verify_perfconfigs=parsed_args.verify_perf_configs, - tflops=parsed_args.tflops, output=parsed_args.output, abort_on_error=parsed_args.abort_on_error, retune=parsed_args.retune, From f1a885ffa49a5f99317ba089f6854d144a4f3248 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Tue, 3 Feb 2026 15:50:14 +0000 Subject: [PATCH 20/23] Simplify output file writing. 
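
The output writer now always appends and emits its fixed '# arch ...' header at most once per invocation, instead of scanning the existing file backwards for a previous header. A file accumulated over several runs can therefore contain the same header line more than once; readers should skip comment lines and filter rows by the metadata columns (tuningSpace, arch, numCUs, numChiplets) rather than rely on section boundaries, which is what TunedConfigsCache.from_output_file already does. A minimal reader sketch, assuming the ten-column layout of OutputFileWriter.HEADER_COLUMNS (illustrative only, not part of this patch):

    COLUMNS = ['arch', 'numCUs', 'numChiplets', 'testVector', 'perfConfig',
               'TFlops', 'tuningSpace', 'commitId', 'timestamp', 'durationSec']

    def load_results(path, tuning_space):
        """Return result rows for one tuning space from an appended TSV file."""
        rows = []
        with open(path) as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and the repeated '# ...' header/comment lines.
                if not line or line.startswith('#'):
                    continue
                fields = dict(zip(COLUMNS, line.split('\t')))
                if fields.get('tuningSpace') == tuning_space:
                    rows.append(fields)
        return rows
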
--- mlir/utils/performance/tuningRunner.py | 45 +++----------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index f8a905469634..405da8230f50 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -425,13 +425,9 @@ def _load(self) -> None: if not self.filepath or not os.path.exists(self.filepath): return - try: - with open(self.filepath, 'r') as f: - data = json.load(f) - self._all_contexts = data.get('contexts', {}) - except (json.JSONDecodeError, TypeError, OSError) as e: - logger.warning(f"Failed to load state file, starting fresh: {e}") - return + with open(self.filepath, 'r') as f: + data = json.load(f) + self._all_contexts = data['contexts'] # Process configs for active context with state transitions if self.context_key in self._all_contexts: @@ -892,7 +888,7 @@ class OutputFileWriter: 'arch', 'numCUs', 'numChiplets', 'testVector', 'perfConfig', 'TFlops', 'tuningSpace', 'commitId', 'timestamp', 'durationSec' ] - EXPECTED_HEADER = "# " + "\t".join(HEADER_COLUMNS) + HEADER = "# " + "\t".join(HEADER_COLUMNS) def __init__(self, filepath: str, options: Options): self.filepath = filepath @@ -904,9 +900,6 @@ def __enter__(self): if self.filepath == '-': self.file = sys.stdout else: - if os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0: - if self._find_last_header() == self.EXPECTED_HEADER: - self._header_written = True self.file = open(self.filepath, 'a') return self @@ -914,34 +907,8 @@ def __exit__(self, exc_type, exc_value, traceback): if self.file and self.file != sys.stdout: self.file.close() - def _find_last_header(self, chunk_size: int = 8192) -> Optional[str]: - """Find the last header line by reading from the end of file.""" - with open(self.filepath, 'rb') as f: - f.seek(0, 2) # Seek to end - file_size = f.tell() - remaining = b'' - - pos = file_size - while pos > 0: - read_size = min(chunk_size, pos) - pos -= read_size - f.seek(pos) - chunk = f.read(read_size) + remaining - - lines = chunk.split(b'\n') - remaining = lines[0] - - for line in reversed(lines[1:]): - if line.startswith(b'# arch\t'): - return line.decode('utf-8') - - if remaining.startswith(b'# arch\t'): - return remaining.decode('utf-8') - - return None - def _write_header(self): - print(self.EXPECTED_HEADER, file=self.file) + print(self.HEADER, file=self.file) self.file.flush() self._header_written = True @@ -972,7 +939,6 @@ def __init__(self, filepath: str): self._header_written = False def __enter__(self): - self._header_written = os.path.exists(self.filepath) and os.path.getsize(self.filepath) > 0 self.file = open(self.filepath, 'a') return self @@ -988,7 +954,6 @@ def write_result(self, result: TuningResult): header=not self._header_written, index=False) self.file.flush() - self._header_written = True From b2dc435fd9d6f10c0ff1f15039b79b7b54f457fd Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Thu, 5 Feb 2026 00:41:29 +0000 Subject: [PATCH 21/23] Address code review comments. 
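
Changes made in response to review: hoist the function-local 're' import to module level, share the output column names through a module-level OUTPUT_HEADER_COLUMNS constant, use typing.Tuple in annotations, rename find_best_perfconfig's parameter to tuning_output_lines, keep the full tuning stdout for error reports, track the stdin-backed configs file in stdin_temp_file, and replace assert statements used for user-facing validation with explicit exceptions, since asserts are removed when Python runs with -O. For example, the setup_logger check becomes:

    # Before: silently skipped under `python -O`
    assert not (quiet and verbose), "quiet and verbose are mutually exclusive"

    # After: always enforced, regardless of interpreter flags
    if quiet and verbose:
        raise ValueError("quiet and verbose are mutually exclusive")
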
--- mlir/utils/performance/tuningRunner.py | 73 ++++++++++++++++---------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 405da8230f50..bad88362b477 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -28,6 +28,7 @@ import json import logging import os +import re import signal import statistics import subprocess @@ -35,13 +36,13 @@ import tempfile import threading import time +from collections import deque from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import nullcontext from dataclasses import dataclass, field from datetime import datetime, timezone from enum import Enum -from typing import Dict, List, Optional -from collections import deque +from typing import Dict, List, Optional, Tuple import numpy as np import pandas as pd @@ -67,6 +68,11 @@ WARMUP_ITERATIONS = 1 SLEEP_US = 100 # 0.1 ms +OUTPUT_HEADER_COLUMNS = [ + 'arch', 'numCUs', 'numChiplets', 'testVector', 'perfConfig', 'TFlops', 'tuningSpace', + 'commitId', 'timestamp', 'durationSec' +] + # ============================================================================= # Logging Setup # ============================================================================= @@ -123,9 +129,10 @@ def process(self, msg, kwargs): return msg, kwargs -def setup_logger(quiet: bool = False, verbose: bool = False) -> logging.Logger: +def setup_logger(quiet: bool = False, verbose: bool = False) -> None: """Configure and return a logger for tuningRunner.""" - assert not (quiet and verbose), "quiet and verbose are mutually exclusive" + if quiet and verbose: + raise ValueError("quiet and verbose are mutually exclusive") if quiet: logger.setLevel(logging.ERROR) @@ -586,12 +593,12 @@ def from_output_file(cls, options: Options) -> 'TunedConfigsCache': @staticmethod def _is_header_line(line: str) -> bool: """Check if line is a column header.""" - return line.startswith('# arch\t') + header_prefix = f"# {OUTPUT_HEADER_COLUMNS[0]}\t" + return line.startswith(header_prefix) @staticmethod def _extract_tuning_space_from_header(line: str) -> Optional[str]: """Extract tuning space from old format header like 'perfConfig (quick)' or 'TFlops (quick)'.""" - import re match = re.search(r'\((\w+)\)', line) return match.group(1) if match else None @@ -884,11 +891,7 @@ def _set_memory_policy(self, numa_node: int) -> None: class OutputFileWriter: """Context manager for writing tuning results to TSV file.""" - HEADER_COLUMNS = [ - 'arch', 'numCUs', 'numChiplets', 'testVector', 'perfConfig', 'TFlops', 'tuningSpace', - 'commitId', 'timestamp', 'durationSec' - ] - HEADER = "# " + "\t".join(HEADER_COLUMNS) + HEADER = "# " + "\t".join(OUTPUT_HEADER_COLUMNS) def __init__(self, filepath: str, options: Options): self.filepath = filepath @@ -913,7 +916,14 @@ def _write_header(self): self._header_written = True def write_result(self, result: TuningResult): - assert result.success and result.winning_config and result.max_tflops and result.timestamp and result.duration_seconds > 0.0, "write_result called with invalid result" + if not result.success: + raise ValueError("write_result called with unsuccessful result") + if not result.winning_config: + raise ValueError("write_result called without winning_config") + if result.max_tflops is None: + raise ValueError("write_result called without max_tflops") + if not result.timestamp: + raise ValueError("write_result called without timestamp") if not 
self._header_written: self._write_header() @@ -947,7 +957,10 @@ def __exit__(self, exc_type, exc_value, traceback): self.file.close() def write_result(self, result: TuningResult): - assert result.success and result.entries, "write_result called with invalid result" + if not result.success: + raise ValueError("write_result called with unsuccessful result") + if not result.entries: + raise ValueError("write_result called without entries") pd.DataFrame(result.entries).to_csv(self.file, sep='\t', @@ -1208,9 +1221,9 @@ def verify_perfconfig(perfconfig: str, config: PerfConfiguration, paths: Paths, return nano_seconds -def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, paths: Paths, +def find_best_perfconfig(tuning_output_lines: List[str], config: PerfConfiguration, paths: Paths, options: Options, - gpu_id: int) -> tuple[Optional[str], Optional[float], List[Dict]]: + gpu_id: int) -> Tuple[Optional[str], Optional[float], List[Dict]]: """Parse tuning driver output and find the best performing perfconfig. Returns the winning config, its TFLOPS, and all entries. @@ -1221,7 +1234,7 @@ def find_best_perfconfig(tuning_output: List[str], config: PerfConfiguration, pa winning_config: Optional[str] = None entries = [] - for line in tuning_output: + for line in tuning_output_lines: result = line.strip() if not result: continue @@ -1348,26 +1361,25 @@ def tune_config(test_vector: str, conf_class: type, paths: Paths, options: Optio raise_if_terminated(tuning_driver.returncode) - tuning_output = tuning_stdout.decode('utf-8').splitlines() + tuning_output = tuning_stdout.decode('utf-8') tuning_errors = tuning_stderr.decode('utf-8') if tuning_driver.returncode != 0: gpu_logger.error( - format_error( - "Tuning pipeline failed", - command=tuning_pipeline, - stdout=tuning_output[-10:], # Last 10 lines of stdout - stderr=tuning_errors, - exit_code=tuning_driver.returncode, - gpu_id=gpu_id)) + format_error("Tuning pipeline failed", + command=tuning_pipeline, + stdout=tuning_output, + stderr=tuning_errors, + exit_code=tuning_driver.returncode, + gpu_id=gpu_id)) return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) else: # Log any stderr output from tuning driver because it may contain warnings if tuning_errors.strip(): gpu_logger.warning(f"rocmlir-tuning-driver stderr:\n{tuning_errors}") - winning_config, max_tflops, entries = find_best_perfconfig(tuning_output, config, paths, - options, gpu_id) + winning_config, max_tflops, entries = find_best_perfconfig(tuning_output.splitlines(), + config, paths, options, gpu_id) except TuningError as e: gpu_logger.error(str(e)) return TuningResult(test_vector=test_vector, success=False, gpu_id=gpu_id) @@ -1636,7 +1648,8 @@ def get_config_class(op_type: Operation) -> type: Operation.CONV_GEMM: ConvGemmConfiguration, } - assert op_type in config_classes, f"No config class for operation: {str(op_type)}" + if op_type not in config_classes: + raise ValueError(f"No config class for operation: {str(op_type)}") return config_classes[op_type] @@ -1669,7 +1682,8 @@ def load_configs(op_type: Operation, parsed_args: argparse.Namespace, paths: Pat lambda: perfRunner.get_conv_gemm_configurations(paths.configuration_file_path), } - assert op_type in loaders, f"No config loader for operation: {str(op_type)}" + if op_type not in loaders: + raise ValueError(f"No config loader for operation: {str(op_type)}") return loaders[op_type]() @@ -1882,7 +1896,8 @@ def main(args=None): # Handle stdin for configs file stdin_temp_file = None if 
parsed_args.configs_file == '-': - parsed_args.configs_file = load_configs_from_stdin() + stdin_temp_file = load_configs_from_stdin() + parsed_args.configs_file = stdin_temp_file try: paths = resolve_paths(op_type, parsed_args) From d964457a8c975c62c2575ba56e7458ba54e91db4 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Thu, 5 Feb 2026 01:00:27 +0000 Subject: [PATCH 22/23] Use llvm dbgs instead of errs where appropriate. --- mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp index 78513dc5b16f..8bf7ab8b73d2 100644 --- a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp +++ b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp @@ -28,12 +28,15 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/LogicalResult.h" #include #include +#define DEBUG_TYPE "rock-tuning-parameter" + // Found experimentally, might need to change it if we add more params to the // tuning space #define NUM_RANDOM_PERFCONFIGS_PER_TILE_SIZE 50 @@ -440,8 +443,8 @@ static void createGemmGemmTuningRangeBF(TuningParamSet *newSpace, // int64_t numEUPerCU = // rock::lookupArchInfo(rock::getArchValue(gemmGemmOp)).numEUPerCU; bool isWMMA = archInfo.isWmma(gemmGemmOp); - llvm::errs() << "isWMMA: " << isWMMA << "\n"; - llvm::errs() << "features: " << features << "\n"; + LLVM_DEBUG(llvm::dbgs() << "isWMMA: " << isWMMA << "\n"); + LLVM_DEBUG(llvm::dbgs() << "features: " << features << "\n"); if (!archInfo.isAccel(gemmGemmOp)) { // We only support GPUs with matrix accelerator extensions return; @@ -620,7 +623,7 @@ static void createGemmTuningRangeBF(TuningParamSet *newSpace, int64_t outputSwizzle{2}, wavesPerEU{0}, gridGroupSize{0}; OpBuilder b(gemmOp.getContext()); if (archInfo.isAccel(gemmOp)) { - llvm::errs() << "createGemmTuningRangeBF: accel\n"; + LLVM_DEBUG(llvm::dbgs() << "createGemmTuningRangeBF: accel\n"); for (uint32_t gemmMPerBlock : accelParams[0]) { SmallVector mPerWaveRange = computeDPerWave(kind, gemmMPerBlock, waveSize); @@ -665,7 +668,7 @@ static void createGemmTuningRangeBF(TuningParamSet *newSpace, } } } else { - llvm::errs() << "createGemmTuningRangeBF: non-accel\n"; + LLVM_DEBUG(llvm::dbgs() << "createGemmTuningRangeBF: non-accel\n"); // Non-accel PopulateParams tuningInfo; for (uint32_t blockSize : validRangeGeneralGemmParams[0]) { From 408100adb16cfc191ad9293023f325a4860e2305 Mon Sep 17 00:00:00 2001 From: Mirza Halilcevic Date: Thu, 5 Feb 2026 14:40:27 +0000 Subject: [PATCH 23/23] Warn if env vars are set. 
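
set_isolated_gpu_env() is called early so that GPU discovery through perfRunner stays consistent, and it overwrites ROCR_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES in the process; any values the user exported were previously ignored without any indication. Capture the user's values before they are overwritten and, once logging is configured, warn that GPU selection should go through '--gpus' instead. Condensed excerpt from the diff below showing the relevant ordering:

    # Read the user's values before set_isolated_gpu_env() replaces them ...
    user_rocr_visible = os.environ.get("ROCR_VISIBLE_DEVICES")
    user_hip_visible = os.environ.get("HIP_VISIBLE_DEVICES")
    set_isolated_gpu_env(os.environ, available_gpus[0])

    # ... and warn only after setup_logger() has configured logging.
    if user_rocr_visible or user_hip_visible:
        logger.warning("Ignoring ... Use '--gpus' to select specific GPUs.")
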
--- mlir/utils/performance/tuningRunner.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index bad88362b477..7aa804c3a434 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -24,6 +24,7 @@ """ import argparse +import functools import glob import json import logging @@ -924,6 +925,8 @@ def write_result(self, result: TuningResult): raise ValueError("write_result called without max_tflops") if not result.timestamp: raise ValueError("write_result called without timestamp") + if result.duration_seconds <= 0.0: + raise ValueError("write_result called with invalid duration_seconds") if not self._header_written: self._write_header() @@ -1028,13 +1031,14 @@ def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, values) +@functools.lru_cache(maxsize=1) def get_git_commit_hash() -> str: """Get the current git commit hash.""" try: return subprocess.check_output(['git', 'rev-parse', 'HEAD'], stderr=subprocess.DEVNULL).decode().strip() except (subprocess.CalledProcessError, FileNotFoundError, OSError) as e: - logger.debug(f"Failed to get git commit hash: {e}") + logger.warning(f"Failed to get git commit hash: {e}") return "unknown" @@ -1883,6 +1887,10 @@ def main(args=None): gpu_topology = GpuTopology.discover() available_gpus = sorted(gpu_topology.gpus.keys()) + # Capture these before set_isolated_gpu_env overwrites them + user_rocr_visible = os.environ.get("ROCR_VISIBLE_DEVICES") + user_hip_visible = os.environ.get("HIP_VISIBLE_DEVICES") + # We call into perfRunner which also queries GPU info using HIP and rocminfo. # To ensure consistency, we isolate the process to the first available GPU. set_isolated_gpu_env(os.environ, available_gpus[0]) @@ -1891,6 +1899,16 @@ def main(args=None): setup_logger(quiet=parsed_args.quiet, verbose=parsed_args.verbose) + if user_rocr_visible or user_hip_visible: + vars_set = [] + if user_rocr_visible: + vars_set.append(f"ROCR_VISIBLE_DEVICES={user_rocr_visible}") + if user_hip_visible: + vars_set.append(f"HIP_VISIBLE_DEVICES={user_hip_visible}") + logger.warning( + f"Ignoring {' and '.join(vars_set)}. " + f"This script manages GPU visibility internally. Use '--gpus' to select specific GPUs.") + op_type = Operation.from_name(parsed_args.op) # Handle stdin for configs file