From ad0e59d1e1b4aeb06ceb7510acc5ea1ae3a77d70 Mon Sep 17 00:00:00 2001
From: Djordje Ramic <djoramic@amd.com>
Date: Thu, 5 Feb 2026 07:26:30 -0600
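Subject: [PATCH 1/2] Fix invalid chipset error on multi-GPU systems

The lit site configs and generateE2ETest.py used to build the target
arch string by comma-joining every GPU architecture found on the
machine, so a system with GPUs of more than one architecture ended up
with a multi-architecture value.

Use the architecture of HIP device 0 instead, derive the feature flags
from that device only, and print a note when several architectures are
detected. In that case the lit configs also set HIP_VISIBLE_DEVICES=0
for the tests, unless the user already set HIP_VISIBLE_DEVICES, which
is now forwarded to the test environment.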
---
 mlir/test/common_utils/common.py        | 10 ++++++++
 mlir/test/e2e/generateE2ETest.py        | 23 ++++++++++++++++-
 mlir/test/e2e/lit.cfg.py                | 10 +++++++-
 mlir/test/e2e/lit.site.cfg.py.in        | 33 +++++++++++++++++++++----
 mlir/test/fusion/e2e/lit.cfg.py         | 10 +++++++-
 mlir/test/fusion/e2e/lit.site.cfg.py.in | 32 +++++++++++++++++++-----
 mlir/test/lit.cfg.py                    | 10 +++++++-
 mlir/test/lit.site.cfg.py.in            | 31 +++++++++++++++++++----
 8 files changed, 139 insertions(+), 20 deletions(-)

diff --git a/mlir/test/common_utils/common.py b/mlir/test/common_utils/common.py
index 494fc1728148..60c70de500f1 100644
--- a/mlir/test/common_utils/common.py
+++ b/mlir/test/common_utils/common.py
@@ -70,6 +70,16 @@ def get_agents():
     return agents
 
 
+def get_default_agent():
+    """Return the architecture of device 0 (HIP's default), or None if no device is found."""
+    device_count = hip_check(hip.hipGetDeviceCount())
+    if device_count > 0:
+        props = hip.hipDeviceProp_t()
+        hip_check(hip.hipGetDeviceProperties(props, 0))
+        return props.gcnArchName.decode('utf-8')
+    return None
+
+
 def is_xdlops_present() -> bool:
     """This function checks whether a GPU with xdlops support is present"""
     return any([agent.startswith("gfx9") for agent in get_agents()])
diff --git a/mlir/test/e2e/generateE2ETest.py b/mlir/test/e2e/generateE2ETest.py
index 355945b94925..71a4d30433d0 100755
--- a/mlir/test/e2e/generateE2ETest.py
+++ b/mlir/test/e2e/generateE2ETest.py
@@ -49,6 +49,7 @@ def hip_check(call_result):
 
 
 def get_arch():
+    """Returns all unique GPU architectures in the system."""
     agents = set()
     device_count = hip_check(hip.hipGetDeviceCount())
     for device in range(device_count):
@@ -60,6 +61,16 @@ def get_arch():
     return agents
 
 
+def get_default_arch():
+    """Return the architecture of device 0 (HIP's default), or None if no device is found."""
+    device_count = hip_check(hip.hipGetDeviceCount())
+    if device_count > 0:
+        props = hip.hipDeviceProp_t()
+        hip_check(hip.hipGetDeviceProperties(props, 0))
+        return props.gcnArchName.decode('utf-8')
+    return None
+
+
 def generate_option_list(prefixes: dict, table: list, key1: str, key2: str):
     options_list = []
     for item in table[key1]:
@@ -134,7 +145,17 @@ def usage():
             axis_prefixes[axis["name"]] = axis["prefix"]
 
     arch_names = get_arch()
-    arch = ','.join(arch_names)
+    default_arch = get_default_arch()
+    # Compile for device 0's architecture, since that is the device HIP uses by
+    # default; this keeps the generated binaries runnable on the default GPU.
+    if default_arch:
+        if len(arch_names) > 1:
+            print(f"Note: Multiple GPU architectures detected: {', '.join(sorted(arch_names))}. "
+                  f"Using device 0 architecture '{default_arch}' for test generation. "
+                  f"Use HIP_VISIBLE_DEVICES to select a different GPU.")
+        arch = default_arch
+    else:
+        arch = ""
     combinations = generate_option_list(axis_prefixes, toml_dict, "axis", "values")
 
     for suite in toml_dict["suite"]:
diff --git a/mlir/test/e2e/lit.cfg.py b/mlir/test/e2e/lit.cfg.py
index 6bbcb304416e..190822319498 100644
--- a/mlir/test/e2e/lit.cfg.py
+++ b/mlir/test/e2e/lit.cfg.py
@@ -38,7 +38,15 @@
 config.substitutions.append(('%arch', config.arch))
 config.substitutions.append(('%pv', config.populate_validation))
 
-llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP'])
+llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP',
+                                     'HIP_VISIBLE_DEVICES'])
+
+# When multiple GPUs are present, limit HIP to device 0 to ensure
+# compiled binaries match the execution device
+# But respect user's HIP_VISIBLE_DEVICES if already set
+if hasattr(config, 'multi_gpu_detected') and config.multi_gpu_detected:
+    if 'HIP_VISIBLE_DEVICES' not in os.environ:
+        config.environment['HIP_VISIBLE_DEVICES'] = '0'
 
 ##############
 # FIXME: adding a path to the environment isn't appearing to work as
diff --git a/mlir/test/e2e/lit.site.cfg.py.in b/mlir/test/e2e/lit.site.cfg.py.in
index 3f1bb850e869..6fb13e34b778 100644
--- a/mlir/test/e2e/lit.site.cfg.py.in
+++ b/mlir/test/e2e/lit.site.cfg.py.in
@@ -28,7 +28,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@"
 
 # Add common python test utils
 sys.path.append(config.rocmlir_common_python_tests_utils)
-from common import get_agents, get_arch_features
+from common import get_agents, get_arch_features, get_default_agent
 
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
@@ -42,19 +42,42 @@ except KeyError:
 
 # If rocm_agent_enumerator shows no viable GPUs, skip tests that need one,
 # because the default target will lead to compilation failures.
+import os
 config.no_AMD_GPU = False
 config.arch = ""
 config.features = None
 config.arch_support_mfma = False
 config.arch_support_wmma = False
 config.arch_support_accel_fp8 = False
+config.multi_gpu_detected = False
 if config.rocm_path:
     try:
+        # Check if user already set HIP_VISIBLE_DEVICES - respect their choice
+        user_hip_visible = os.environ.get('HIP_VISIBLE_DEVICES')
+        
         agents = get_agents()
-        config.arch = ','.join(agents)
-        for x in agents:
-            config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(x)
-            config.substitutions.append(('%features', config.features))
+        default_agent = get_default_agent()
+        if default_agent:
+            if len(agents) > 1:
+                config.multi_gpu_detected = True
+                if user_hip_visible is not None:
+                    # User specified which GPU to use - respect it
+                    # HIP will see their chosen GPU as device 0
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using user-specified HIP_VISIBLE_DEVICES=%s. "
+                                    "Device 0 (after filtering) architecture: '%s'."
+                                    % (', '.join(sorted(agents)), user_hip_visible, default_agent))
+                else:
+                    # No user preference - use device 0; lit.cfg.py sets HIP_VISIBLE_DEVICES=0
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using device 0 architecture '%s' for E2E tests. "
+                                    "HIP_VISIBLE_DEVICES will be set to '0' to ensure binary compatibility."
+                                    % (', '.join(sorted(agents)), default_agent))
+            config.arch = default_agent
+            # Get features for the device we'll actually use
+            config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(default_agent)
+            if config.features:
+                config.substitutions.append(('%features', config.features))
 
         # Check other features here
         if not config.arch:
diff --git a/mlir/test/fusion/e2e/lit.cfg.py b/mlir/test/fusion/e2e/lit.cfg.py
index eab98645a68b..9c78652e2063 100644
--- a/mlir/test/fusion/e2e/lit.cfg.py
+++ b/mlir/test/fusion/e2e/lit.cfg.py
@@ -36,7 +36,15 @@
 config.substitutions.append(('%arch', config.arch))
 config.substitutions.append(('%pv', config.populate_validation))
 
-llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP'])
+llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP',
+                                     'HIP_VISIBLE_DEVICES'])
+
+# When multiple GPUs are present, limit HIP to device 0 to ensure
+# compiled binaries match the execution device
+# But respect user's HIP_VISIBLE_DEVICES if already set
+if hasattr(config, 'multi_gpu_detected') and config.multi_gpu_detected:
+    if 'HIP_VISIBLE_DEVICES' not in os.environ:
+        config.environment['HIP_VISIBLE_DEVICES'] = '0'
 
 ##############
 # FIXME: adding a path to the environment isn't appearing to work as
diff --git a/mlir/test/fusion/e2e/lit.site.cfg.py.in b/mlir/test/fusion/e2e/lit.site.cfg.py.in
index b9e0f612bbfd..7e7dc810b34f 100644
--- a/mlir/test/fusion/e2e/lit.site.cfg.py.in
+++ b/mlir/test/fusion/e2e/lit.site.cfg.py.in
@@ -35,7 +35,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@"
 
 # Add common python test utils
 sys.path.append(config.rocmlir_common_python_tests_utils)
-from common import get_agents
+from common import get_agents, get_default_agent
 
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
@@ -49,20 +49,40 @@ except KeyError:
 
 # If rocm_agent_enumerator shows no viable GPUs, skip tests that need one,
 # because the default target will lead to compilation failures.
+import os
 config.no_AMD_GPU = False
 config.arch = ""
 config.arch_support_mfma = False
 config.arch_support_wmma = False
+config.multi_gpu_detected = False
 if config.rocm_path:
     try:
+        # Check if user already set HIP_VISIBLE_DEVICES - respect their choice
+        user_hip_visible = os.environ.get('HIP_VISIBLE_DEVICES')
+        
         agents = get_agents()
-        config.arch = ','.join(agents)
-        for x in agents:
-            if any([arch in x for arch in ["gfx908", "gfx90a", "gfx942", "gfx950"]]):
+        default_agent = get_default_agent()
+        if default_agent:
+            if len(agents) > 1:
+                config.multi_gpu_detected = True
+                if user_hip_visible is not None:
+                    # User specified which GPU to use - respect it
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using user-specified HIP_VISIBLE_DEVICES=%s. "
+                                    "Device 0 (after filtering) architecture: '%s'."
+                                    % (', '.join(sorted(agents)), user_hip_visible, default_agent))
+                else:
+                    # No user preference - use device 0; lit.cfg.py sets HIP_VISIBLE_DEVICES=0
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using device 0 architecture '%s' for E2E tests. "
+                                    "HIP_VISIBLE_DEVICES will be set to '0' to ensure binary compatibility."
+                                    % (', '.join(sorted(agents)), default_agent))
+            config.arch = default_agent
+            # Check features for the device we'll actually use
+            if any([arch in default_agent for arch in ["gfx908", "gfx90a", "gfx942", "gfx950"]]):
                 config.arch_support_mfma = True
-            elif "gfx11" in x or "gfx12" in x:
+            elif "gfx11" in default_agent or "gfx12" in default_agent:
                 config.arch_support_wmma = True
-            # Check other features here
         if not config.arch:
             config.no_AMD_GPU = True
     except subprocess.CalledProcessError:
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index caecd5729892..64f3a3cd56d7 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -39,7 +39,15 @@
 config.substitutions.append(('%arch', config.arch))
 config.substitutions.append(('%pv', config.populate_validation))
 
-llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP'])
+llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP',
+                                     'HIP_VISIBLE_DEVICES'])
+
+# When multiple GPUs are present, limit HIP to device 0 to ensure
+# compiled binaries match the execution device
+# But respect user's HIP_VISIBLE_DEVICES if already set
+if hasattr(config, 'multi_gpu_detected') and config.multi_gpu_detected:
+    if 'HIP_VISIBLE_DEVICES' not in os.environ:
+        config.environment['HIP_VISIBLE_DEVICES'] = '0'
 
 ##############
 # FIXME: adding a path to the environment isn't appearing to work as
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index 3a0c61cf50e4..5080558d8853 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -59,7 +59,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@"
 
 # Add common python test utils
 sys.path.append(config.rocmlir_common_python_tests_utils)
-from common import get_agents, get_arch_features
+from common import get_agents, get_arch_features, get_default_agent
 
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
@@ -73,19 +73,40 @@ except KeyError:
 
 # If rocm_agent_enumerator shows no viable GPUs, skip tests that need one,
 # because the default target will lead to compilation failures.
+import os
 config.no_AMD_GPU = False
 config.arch = ""
 config.arch_support_mfma = False
 config.arch_support_wmma = False
 config.arch_support_accel_fp8 = False
 config.features = None
+config.multi_gpu_detected = False
 if config.rocm_path:
     try:
+        # Check if user already set HIP_VISIBLE_DEVICES - respect their choice
+        user_hip_visible = os.environ.get('HIP_VISIBLE_DEVICES')
+        
         agents = get_agents()
-        config.arch = ','.join(agents)
-        for x in agents:
-            if not config.features:
-                config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(x)
+        default_agent = get_default_agent()
+        if default_agent:
+            if len(agents) > 1:
+                config.multi_gpu_detected = True
+                if user_hip_visible is not None:
+                    # User specified which GPU to use - respect it
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using user-specified HIP_VISIBLE_DEVICES=%s. "
+                                    "Device 0 (after filtering) architecture: '%s'."
+                                    % (', '.join(sorted(agents)), user_hip_visible, default_agent))
+                else:
+                    # No user preference - use device 0; lit.cfg.py sets HIP_VISIBLE_DEVICES=0
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using device 0 architecture '%s' for E2E tests. "
+                                    "HIP_VISIBLE_DEVICES will be set to '0' to ensure binary compatibility."
+                                    % (', '.join(sorted(agents)), default_agent))
+            config.arch = default_agent
+            # Get features for the device we'll actually use
+            config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(default_agent)
+            if config.features:
                 config.substitutions.append(('%features', config.features))
         if not config.arch:
             config.no_AMD_GPU = True

From 7378c5712ad82b444f06ae93ba6a526c7f581bd9 Mon Sep 17 00:00:00 2001
From: Djordje Ramic <djoramic@amd.com>
Date: Thu, 12 Feb 2026 13:57:31 +0000
Subject: [PATCH 2/2] Yapf formatting

---
 mlir/test/e2e/lit.cfg.py        | 8 ++++----
 mlir/test/fusion/e2e/lit.cfg.py | 4 ++--
 mlir/test/lit.cfg.py            | 8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/mlir/test/e2e/lit.cfg.py b/mlir/test/e2e/lit.cfg.py
index 190822319498..43e5c9718e49 100644
--- a/mlir/test/e2e/lit.cfg.py
+++ b/mlir/test/e2e/lit.cfg.py
@@ -32,14 +32,14 @@
 config.substitutions.append(('%shlibext', config.llvm_shlib_ext))
 config.substitutions.append(("%mlir_src_root", config.mlir_src_root))
 config.substitutions.append(('%random_data', config.random_data))
-config.substitutions.append(('%constrained_float_range_random_data',
-                             config.constrained_float_range_random_data))
+config.substitutions.append(
+    ('%constrained_float_range_random_data', config.constrained_float_range_random_data))
 config.substitutions.append(('%rocmlir_gen_flags', config.rocmlir_gen_flags))
 config.substitutions.append(('%arch', config.arch))
 config.substitutions.append(('%pv', config.populate_validation))
 
-llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP',
-                                     'HIP_VISIBLE_DEVICES'])
+llvm_config.with_system_environment(
+    ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', 'HIP_VISIBLE_DEVICES'])
 
 # When multiple GPUs are present, limit HIP to device 0 to ensure
 # compiled binaries match the execution device
diff --git a/mlir/test/fusion/e2e/lit.cfg.py b/mlir/test/fusion/e2e/lit.cfg.py
index 9c78652e2063..cebdae4eced2 100644
--- a/mlir/test/fusion/e2e/lit.cfg.py
+++ b/mlir/test/fusion/e2e/lit.cfg.py
@@ -36,8 +36,8 @@
 config.substitutions.append(('%arch', config.arch))
 config.substitutions.append(('%pv', config.populate_validation))
 
-llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP',
-                                     'HIP_VISIBLE_DEVICES'])
+llvm_config.with_system_environment(
+    ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', 'HIP_VISIBLE_DEVICES'])
 
 # When multiple GPUs are present, limit HIP to device 0 to ensure
 # compiled binaries match the execution device
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 64f3a3cd56d7..ecf63326d695 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -33,14 +33,14 @@
 config.substitutions.append(('%shlibext', config.llvm_shlib_ext))
 config.substitutions.append(("%mlir_src_root", config.mlir_src_root))
 config.substitutions.append(('%random_data', config.random_data))
-config.substitutions.append(('%constrained_float_range_random_data',
-                             config.constrained_float_range_random_data))
+config.substitutions.append(
+    ('%constrained_float_range_random_data', config.constrained_float_range_random_data))
 config.substitutions.append(('%rocmlir_gen_flags', config.rocmlir_gen_flags))
 config.substitutions.append(('%arch', config.arch))
 config.substitutions.append(('%pv', config.populate_validation))
 
-llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP',
-                                     'HIP_VISIBLE_DEVICES'])
+llvm_config.with_system_environment(
+    ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', 'HIP_VISIBLE_DEVICES'])
 
 # When multiple GPUs are present, limit HIP to device 0 to ensure
 # compiled binaries match the execution device