From ad0e59d1e1b4aeb06ceb7510acc5ea1ae3a77d70 Mon Sep 17 00:00:00 2001 From: Djordje Ramic Date: Thu, 5 Feb 2026 07:26:30 -0600 Subject: [PATCH 1/2] Fix invalid chipset error on multi-GPU systems --- mlir/test/common_utils/common.py | 10 ++++++++ mlir/test/e2e/generateE2ETest.py | 23 ++++++++++++++++- mlir/test/e2e/lit.cfg.py | 10 +++++++- mlir/test/e2e/lit.site.cfg.py.in | 33 +++++++++++++++++++++---- mlir/test/fusion/e2e/lit.cfg.py | 10 +++++++- mlir/test/fusion/e2e/lit.site.cfg.py.in | 32 +++++++++++++++++++----- mlir/test/lit.cfg.py | 10 +++++++- mlir/test/lit.site.cfg.py.in | 31 +++++++++++++++++++---- 8 files changed, 139 insertions(+), 20 deletions(-) diff --git a/mlir/test/common_utils/common.py b/mlir/test/common_utils/common.py index 494fc1728148..60c70de500f1 100644 --- a/mlir/test/common_utils/common.py +++ b/mlir/test/common_utils/common.py @@ -70,6 +70,16 @@ def get_agents(): return agents +def get_default_agent(): + """Returns the architecture of device 0, which HIP uses by default.""" + device_count = hip_check(hip.hipGetDeviceCount()) + if device_count > 0: + props = hip.hipDeviceProp_t() + hip_check(hip.hipGetDeviceProperties(props, 0)) + return props.gcnArchName.decode('utf-8') + return None + + def is_xdlops_present() -> bool: """This function checks whether a GPU with xdlops support is present""" return any([agent.startswith("gfx9") for agent in get_agents()]) diff --git a/mlir/test/e2e/generateE2ETest.py b/mlir/test/e2e/generateE2ETest.py index 355945b94925..71a4d30433d0 100755 --- a/mlir/test/e2e/generateE2ETest.py +++ b/mlir/test/e2e/generateE2ETest.py @@ -49,6 +49,7 @@ def hip_check(call_result): def get_arch(): + """Returns all unique GPU architectures in the system.""" agents = set() device_count = hip_check(hip.hipGetDeviceCount()) for device in range(device_count): @@ -60,6 +61,16 @@ def get_arch(): return agents +def get_default_arch(): + """Returns the architecture of device 0, which HIP uses by default.""" + device_count = hip_check(hip.hipGetDeviceCount()) + if device_count > 0: + props = hip.hipDeviceProp_t() + hip_check(hip.hipGetDeviceProperties(props, 0)) + return props.gcnArchName.decode('utf-8') + return None + + def generate_option_list(prefixes: dict, table: list, key1: str, key2: str): options_list = [] for item in table[key1]: @@ -134,7 +145,17 @@ def usage(): axis_prefixes[axis["name"]] = axis["prefix"] arch_names = get_arch() - arch = ','.join(arch_names) + default_arch = get_default_arch() + # Use device 0's architecture (HIP default) for compilation + # This ensures compiled binaries run on the default GPU + if default_arch: + if len(arch_names) > 1: + print(f"Note: Multiple GPU architectures detected: {', '.join(sorted(arch_names))}. " + f"Using device 0 architecture '{default_arch}' for test generation. " + f"Use HIP_VISIBLE_DEVICES to select a different GPU.") + arch = default_arch + else: + arch = "" combinations = generate_option_list(axis_prefixes, toml_dict, "axis", "values") for suite in toml_dict["suite"]: diff --git a/mlir/test/e2e/lit.cfg.py b/mlir/test/e2e/lit.cfg.py index 6bbcb304416e..190822319498 100644 --- a/mlir/test/e2e/lit.cfg.py +++ b/mlir/test/e2e/lit.cfg.py @@ -38,7 +38,15 @@ config.substitutions.append(('%arch', config.arch)) config.substitutions.append(('%pv', config.populate_validation)) -llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP']) +llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', + 'HIP_VISIBLE_DEVICES']) + +# When multiple GPUs are present, limit HIP to device 0 to ensure +# compiled binaries match the execution device +# But respect user's HIP_VISIBLE_DEVICES if already set +if hasattr(config, 'multi_gpu_detected') and config.multi_gpu_detected: + if 'HIP_VISIBLE_DEVICES' not in os.environ: + config.environment['HIP_VISIBLE_DEVICES'] = '0' ############## # FIXME: adding a path to the environment isn't appearing to work as diff --git a/mlir/test/e2e/lit.site.cfg.py.in b/mlir/test/e2e/lit.site.cfg.py.in index 3f1bb850e869..6fb13e34b778 100644 --- a/mlir/test/e2e/lit.site.cfg.py.in +++ b/mlir/test/e2e/lit.site.cfg.py.in @@ -28,7 +28,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@" # Add common python test utils sys.path.append(config.rocmlir_common_python_tests_utils) -from common import get_agents, get_arch_features +from common import get_agents, get_arch_features, get_default_agent # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time. @@ -42,19 +42,42 @@ except KeyError: # If rocm_agent_enumerator shows no viable GPUs, skip tests that need one, # because the default target will lead to compilation failures. +import os config.no_AMD_GPU = False config.arch = "" config.features = None config.arch_support_mfma = False config.arch_support_wmma = False config.arch_support_accel_fp8 = False +config.multi_gpu_detected = False if config.rocm_path: try: + # Check if user already set HIP_VISIBLE_DEVICES - respect their choice + user_hip_visible = os.environ.get('HIP_VISIBLE_DEVICES') + agents = get_agents() - config.arch = ','.join(agents) - for x in agents: - config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(x) - config.substitutions.append(('%features', config.features)) + default_agent = get_default_agent() + if default_agent: + if len(agents) > 1: + config.multi_gpu_detected = True + if user_hip_visible is not None: + # User specified which GPU to use - respect it + # HIP will see their chosen GPU as device 0 + lit_config.note("Multiple GPU architectures detected: %s. " + "Using user-specified HIP_VISIBLE_DEVICES=%s. " + "Device 0 (after filtering) architecture: '%s'." + % (', '.join(sorted(agents)), user_hip_visible, default_agent)) + else: + # No user preference - use device 0 and set HIP_VISIBLE_DEVICES + lit_config.note("Multiple GPU architectures detected: %s. " + "Using device 0 architecture '%s' for E2E tests. " + "HIP_VISIBLE_DEVICES will be set to '0' to ensure binary compatibility." + % (', '.join(sorted(agents)), default_agent)) + config.arch = default_agent + # Get features for the device we'll actually use + config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(default_agent) + if config.features: + config.substitutions.append(('%features', config.features)) # Check other features here if not config.arch: diff --git a/mlir/test/fusion/e2e/lit.cfg.py b/mlir/test/fusion/e2e/lit.cfg.py index eab98645a68b..9c78652e2063 100644 --- a/mlir/test/fusion/e2e/lit.cfg.py +++ b/mlir/test/fusion/e2e/lit.cfg.py @@ -36,7 +36,15 @@ config.substitutions.append(('%arch', config.arch)) config.substitutions.append(('%pv', config.populate_validation)) -llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP']) +llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', + 'HIP_VISIBLE_DEVICES']) + +# When multiple GPUs are present, limit HIP to device 0 to ensure +# compiled binaries match the execution device +# But respect user's HIP_VISIBLE_DEVICES if already set +if hasattr(config, 'multi_gpu_detected') and config.multi_gpu_detected: + if 'HIP_VISIBLE_DEVICES' not in os.environ: + config.environment['HIP_VISIBLE_DEVICES'] = '0' ############## # FIXME: adding a path to the environment isn't appearing to work as diff --git a/mlir/test/fusion/e2e/lit.site.cfg.py.in b/mlir/test/fusion/e2e/lit.site.cfg.py.in index b9e0f612bbfd..7e7dc810b34f 100644 --- a/mlir/test/fusion/e2e/lit.site.cfg.py.in +++ b/mlir/test/fusion/e2e/lit.site.cfg.py.in @@ -35,7 +35,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@" # Add common python test utils sys.path.append(config.rocmlir_common_python_tests_utils) -from common import get_agents +from common import get_agents, get_default_agent # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time. @@ -49,20 +49,40 @@ except KeyError: # If rocm_agent_enumerator shows no viable GPUs, skip tests that need one, # because the default target will lead to compilation failures. +import os config.no_AMD_GPU = False config.arch = "" config.arch_support_mfma = False config.arch_support_wmma = False +config.multi_gpu_detected = False if config.rocm_path: try: + # Check if user already set HIP_VISIBLE_DEVICES - respect their choice + user_hip_visible = os.environ.get('HIP_VISIBLE_DEVICES') + agents = get_agents() - config.arch = ','.join(agents) - for x in agents: - if any([arch in x for arch in ["gfx908", "gfx90a", "gfx942", "gfx950"]]): + default_agent = get_default_agent() + if default_agent: + if len(agents) > 1: + config.multi_gpu_detected = True + if user_hip_visible is not None: + # User specified which GPU to use - respect it + lit_config.note("Multiple GPU architectures detected: %s. " + "Using user-specified HIP_VISIBLE_DEVICES=%s. " + "Device 0 (after filtering) architecture: '%s'." + % (', '.join(sorted(agents)), user_hip_visible, default_agent)) + else: + # No user preference - use device 0 and set HIP_VISIBLE_DEVICES + lit_config.note("Multiple GPU architectures detected: %s. " + "Using device 0 architecture '%s' for E2E tests. " + "HIP_VISIBLE_DEVICES will be set to '0' to ensure binary compatibility." + % (', '.join(sorted(agents)), default_agent)) + config.arch = default_agent + # Check features for the device we'll actually use + if any([arch in default_agent for arch in ["gfx908", "gfx90a", "gfx942", "gfx950"]]): config.arch_support_mfma = True - elif "gfx11" in x or "gfx12" in x: + elif "gfx11" in default_agent or "gfx12" in default_agent: config.arch_support_wmma = True - # Check other features here if not config.arch: config.no_AMD_GPU = True except subprocess.CalledProcessError: diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index caecd5729892..64f3a3cd56d7 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -39,7 +39,15 @@ config.substitutions.append(('%arch', config.arch)) config.substitutions.append(('%pv', config.populate_validation)) -llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP']) +llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', + 'HIP_VISIBLE_DEVICES']) + +# When multiple GPUs are present, limit HIP to device 0 to ensure +# compiled binaries match the execution device +# But respect user's HIP_VISIBLE_DEVICES if already set +if hasattr(config, 'multi_gpu_detected') and config.multi_gpu_detected: + if 'HIP_VISIBLE_DEVICES' not in os.environ: + config.environment['HIP_VISIBLE_DEVICES'] = '0' ############## # FIXME: adding a path to the environment isn't appearing to work as diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in index 3a0c61cf50e4..5080558d8853 100644 --- a/mlir/test/lit.site.cfg.py.in +++ b/mlir/test/lit.site.cfg.py.in @@ -59,7 +59,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@" # Add common python test utils sys.path.append(config.rocmlir_common_python_tests_utils) -from common import get_agents, get_arch_features +from common import get_agents, get_arch_features, get_default_agent # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time. @@ -73,19 +73,40 @@ except KeyError: # If rocm_agent_enumerator shows no viable GPUs, skip tests that need one, # because the default target will lead to compilation failures. +import os config.no_AMD_GPU = False config.arch = "" config.arch_support_mfma = False config.arch_support_wmma = False config.arch_support_accel_fp8 = False config.features = None +config.multi_gpu_detected = False if config.rocm_path: try: + # Check if user already set HIP_VISIBLE_DEVICES - respect their choice + user_hip_visible = os.environ.get('HIP_VISIBLE_DEVICES') + agents = get_agents() - config.arch = ','.join(agents) - for x in agents: - if not config.features: - config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(x) + default_agent = get_default_agent() + if default_agent: + if len(agents) > 1: + config.multi_gpu_detected = True + if user_hip_visible is not None: + # User specified which GPU to use - respect it + lit_config.note("Multiple GPU architectures detected: %s. " + "Using user-specified HIP_VISIBLE_DEVICES=%s. " + "Device 0 (after filtering) architecture: '%s'." + % (', '.join(sorted(agents)), user_hip_visible, default_agent)) + else: + # No user preference - use device 0 and set HIP_VISIBLE_DEVICES + lit_config.note("Multiple GPU architectures detected: %s. " + "Using device 0 architecture '%s' for E2E tests. " + "HIP_VISIBLE_DEVICES will be set to '0' to ensure binary compatibility." + % (', '.join(sorted(agents)), default_agent)) + config.arch = default_agent + # Get features for the device we'll actually use + config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(default_agent) + if config.features: config.substitutions.append(('%features', config.features)) if not config.arch: config.no_AMD_GPU = True From 7378c5712ad82b444f06ae93ba6a526c7f581bd9 Mon Sep 17 00:00:00 2001 From: Djordje Ramic Date: Thu, 12 Feb 2026 13:57:31 +0000 Subject: [PATCH 2/2] Yapf formatting --- mlir/test/e2e/lit.cfg.py | 8 ++++---- mlir/test/fusion/e2e/lit.cfg.py | 4 ++-- mlir/test/lit.cfg.py | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mlir/test/e2e/lit.cfg.py b/mlir/test/e2e/lit.cfg.py index 190822319498..43e5c9718e49 100644 --- a/mlir/test/e2e/lit.cfg.py +++ b/mlir/test/e2e/lit.cfg.py @@ -32,14 +32,14 @@ config.substitutions.append(('%shlibext', config.llvm_shlib_ext)) config.substitutions.append(("%mlir_src_root", config.mlir_src_root)) config.substitutions.append(('%random_data', config.random_data)) -config.substitutions.append(('%constrained_float_range_random_data', - config.constrained_float_range_random_data)) +config.substitutions.append( + ('%constrained_float_range_random_data', config.constrained_float_range_random_data)) config.substitutions.append(('%rocmlir_gen_flags', config.rocmlir_gen_flags)) config.substitutions.append(('%arch', config.arch)) config.substitutions.append(('%pv', config.populate_validation)) -llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', - 'HIP_VISIBLE_DEVICES']) +llvm_config.with_system_environment( + ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', 'HIP_VISIBLE_DEVICES']) # When multiple GPUs are present, limit HIP to device 0 to ensure # compiled binaries match the execution device diff --git a/mlir/test/fusion/e2e/lit.cfg.py b/mlir/test/fusion/e2e/lit.cfg.py index 9c78652e2063..cebdae4eced2 100644 --- a/mlir/test/fusion/e2e/lit.cfg.py +++ b/mlir/test/fusion/e2e/lit.cfg.py @@ -36,8 +36,8 @@ config.substitutions.append(('%arch', config.arch)) config.substitutions.append(('%pv', config.populate_validation)) -llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', - 'HIP_VISIBLE_DEVICES']) +llvm_config.with_system_environment( + ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', 'HIP_VISIBLE_DEVICES']) # When multiple GPUs are present, limit HIP to device 0 to ensure # compiled binaries match the execution device diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 64f3a3cd56d7..ecf63326d695 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -33,14 +33,14 @@ config.substitutions.append(('%shlibext', config.llvm_shlib_ext)) config.substitutions.append(("%mlir_src_root", config.mlir_src_root)) config.substitutions.append(('%random_data', config.random_data)) -config.substitutions.append(('%constrained_float_range_random_data', - config.constrained_float_range_random_data)) +config.substitutions.append( + ('%constrained_float_range_random_data', config.constrained_float_range_random_data)) config.substitutions.append(('%rocmlir_gen_flags', config.rocmlir_gen_flags)) config.substitutions.append(('%arch', config.arch)) config.substitutions.append(('%pv', config.populate_validation)) -llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', - 'HIP_VISIBLE_DEVICES']) +llvm_config.with_system_environment( + ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', 'HIP_VISIBLE_DEVICES']) # When multiple GPUs are present, limit HIP to device 0 to ensure # compiled binaries match the execution device