diff --git a/mlir/test/common_utils/common.py b/mlir/test/common_utils/common.py
index 494fc1728148..60c70de500f1 100644
--- a/mlir/test/common_utils/common.py
+++ b/mlir/test/common_utils/common.py
@@ -70,6 +70,16 @@ def get_agents():
     return agents
 
 
+def get_default_agent():
+    """Return the gcnArchName of HIP device 0, or None when HIP reports no devices."""
+    device_count = hip_check(hip.hipGetDeviceCount())
+    if device_count > 0:
+        props = hip.hipDeviceProp_t()
+        hip_check(hip.hipGetDeviceProperties(props, 0))
+        return props.gcnArchName.decode('utf-8')
+    return None
+
+
 def is_xdlops_present() -> bool:
     """This function checks whether a GPU with xdlops support is present"""
     return any([agent.startswith("gfx9") for agent in get_agents()])
diff --git a/mlir/test/e2e/generateE2ETest.py b/mlir/test/e2e/generateE2ETest.py
index 355945b94925..71a4d30433d0 100755
--- a/mlir/test/e2e/generateE2ETest.py
+++ b/mlir/test/e2e/generateE2ETest.py
@@ -49,6 +49,7 @@ def hip_check(call_result):
 
 
 def get_arch():
+    """Return the set of unique GPU architectures found across all HIP devices."""
     agents = set()
     device_count = hip_check(hip.hipGetDeviceCount())
     for device in range(device_count):
@@ -60,6 +61,16 @@ def get_arch():
     return agents
 
 
+def get_default_arch():
+    """Return the gcnArchName of HIP device 0, or None when HIP reports no devices."""
+    device_count = hip_check(hip.hipGetDeviceCount())
+    if device_count > 0:
+        props = hip.hipDeviceProp_t()
+        hip_check(hip.hipGetDeviceProperties(props, 0))
+        return props.gcnArchName.decode('utf-8')
+    return None
+
+
 def generate_option_list(prefixes: dict, table: list, key1: str, key2: str):
     options_list = []
     for item in table[key1]:
@@ -134,7 +145,17 @@ def usage():
         axis_prefixes[axis["name"]] = axis["prefix"]
 
     arch_names = get_arch()
-    arch = ','.join(arch_names)
+    default_arch = get_default_arch()
+    # Compile for device 0's architecture only: HIP runs on device 0 by default,
+    # so a binary built for any other detected architecture would not match it.
+    if default_arch:
+        if len(arch_names) > 1:
+            print(f"Note: Multiple GPU architectures detected: {', '.join(sorted(arch_names))}. "
+                  f"Using device 0 architecture '{default_arch}' for test generation. "
+                  f"Use HIP_VISIBLE_DEVICES to select a different GPU.")
+        arch = default_arch
+    else:
+        arch = ""
     combinations = generate_option_list(axis_prefixes, toml_dict, "axis", "values")
 
     for suite in toml_dict["suite"]:
diff --git a/mlir/test/e2e/lit.cfg.py b/mlir/test/e2e/lit.cfg.py
index 6bbcb304416e..43e5c9718e49 100644
--- a/mlir/test/e2e/lit.cfg.py
+++ b/mlir/test/e2e/lit.cfg.py
@@ -32,13 +32,21 @@
 config.substitutions.append(('%shlibext', config.llvm_shlib_ext))
 config.substitutions.append(("%mlir_src_root", config.mlir_src_root))
 config.substitutions.append(('%random_data', config.random_data))
-config.substitutions.append(('%constrained_float_range_random_data',
-                             config.constrained_float_range_random_data))
+config.substitutions.append(
+    ('%constrained_float_range_random_data', config.constrained_float_range_random_data))
 config.substitutions.append(('%rocmlir_gen_flags', config.rocmlir_gen_flags))
 config.substitutions.append(('%arch', config.arch))
 config.substitutions.append(('%pv', config.populate_validation))
 
-llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP'])
+llvm_config.with_system_environment(
+    ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', 'HIP_VISIBLE_DEVICES'])
+
+# When multiple GPU architectures were detected, limit HIP to device 0 so
+# that compiled binaries match the execution device.
+# A HIP_VISIBLE_DEVICES value already set by the user is left untouched.
+if getattr(config, 'multi_gpu_detected', False):
+    if 'HIP_VISIBLE_DEVICES' not in os.environ:
+        config.environment['HIP_VISIBLE_DEVICES'] = '0'
 
 ##############
 # FIXME: adding a path to the environment isn't appearing to work as
diff --git a/mlir/test/e2e/lit.site.cfg.py.in b/mlir/test/e2e/lit.site.cfg.py.in
index 3f1bb850e869..6fb13e34b778 100644
--- a/mlir/test/e2e/lit.site.cfg.py.in
+++ b/mlir/test/e2e/lit.site.cfg.py.in
@@ -28,7 +28,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@"
 
 # Add common python test utils
 sys.path.append(config.rocmlir_common_python_tests_utils)
-from common import get_agents, get_arch_features
+from common import get_agents, get_arch_features, get_default_agent
 
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
@@ -42,19 +42,42 @@ except KeyError:
 
 # If rocm_agent_enumerator shows no viable GPUs, skip tests that need one,
 # because the default target will lead to compilation failures.
+import os
 config.no_AMD_GPU = False
 config.arch = ""
 config.features = None
 config.arch_support_mfma = False
 config.arch_support_wmma = False
 config.arch_support_accel_fp8 = False
+config.multi_gpu_detected = False
 if config.rocm_path:
     try:
+        # Check if user already set HIP_VISIBLE_DEVICES - respect their choice
+        user_hip_visible = os.environ.get('HIP_VISIBLE_DEVICES')
+
         agents = get_agents()
-        config.arch = ','.join(agents)
-        for x in agents:
-            config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(x)
-            config.substitutions.append(('%features', config.features))
+        default_agent = get_default_agent()
+        if default_agent:
+            if len(agents) > 1:
+                config.multi_gpu_detected = True
+                if user_hip_visible is not None:
+                    # User specified which GPU to use - respect it
+                    # HIP will see their chosen GPU as device 0
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using user-specified HIP_VISIBLE_DEVICES=%s. "
+                                    "Device 0 (after filtering) architecture: '%s'."
+                                    % (', '.join(sorted(agents)), user_hip_visible, default_agent))
+                else:
+                    # No user preference - use device 0 and set HIP_VISIBLE_DEVICES
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using device 0 architecture '%s' for E2E tests. "
+                                    "HIP_VISIBLE_DEVICES will be set to '0' to ensure binary compatibility."
+                                    % (', '.join(sorted(agents)), default_agent))
+            config.arch = default_agent
+            # Get features for the device we'll actually use
+            config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(default_agent)
+            if config.features:
+                config.substitutions.append(('%features', config.features))
 
         # Check other features here
         if not config.arch:
diff --git a/mlir/test/fusion/e2e/lit.cfg.py b/mlir/test/fusion/e2e/lit.cfg.py
index eab98645a68b..cebdae4eced2 100644
--- a/mlir/test/fusion/e2e/lit.cfg.py
+++ b/mlir/test/fusion/e2e/lit.cfg.py
@@ -36,7 +36,15 @@
 config.substitutions.append(('%arch', config.arch))
 config.substitutions.append(('%pv', config.populate_validation))
 
-llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP'])
+llvm_config.with_system_environment(
+    ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', 'HIP_VISIBLE_DEVICES'])
+
+# When multiple GPU architectures were detected, limit HIP to device 0 so
+# that compiled binaries match the execution device.
+# A HIP_VISIBLE_DEVICES value already set by the user is left untouched.
+if getattr(config, 'multi_gpu_detected', False):
+    if 'HIP_VISIBLE_DEVICES' not in os.environ:
+        config.environment['HIP_VISIBLE_DEVICES'] = '0'
 
 ##############
 # FIXME: adding a path to the environment isn't appearing to work as
diff --git a/mlir/test/fusion/e2e/lit.site.cfg.py.in b/mlir/test/fusion/e2e/lit.site.cfg.py.in
index b9e0f612bbfd..7e7dc810b34f 100644
--- a/mlir/test/fusion/e2e/lit.site.cfg.py.in
+++ b/mlir/test/fusion/e2e/lit.site.cfg.py.in
@@ -35,7 +35,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@"
 
 # Add common python test utils
 sys.path.append(config.rocmlir_common_python_tests_utils)
-from common import get_agents
+from common import get_agents, get_default_agent
 
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
@@ -49,20 +49,40 @@ except KeyError:
 
 # If rocm_agent_enumerator shows no viable GPUs, skip tests that need one,
 # because the default target will lead to compilation failures.
+import os
 config.no_AMD_GPU = False
 config.arch = ""
 config.arch_support_mfma = False
 config.arch_support_wmma = False
+config.multi_gpu_detected = False
 if config.rocm_path:
     try:
+        # Check if user already set HIP_VISIBLE_DEVICES - respect their choice
+        user_hip_visible = os.environ.get('HIP_VISIBLE_DEVICES')
+
         agents = get_agents()
-        config.arch = ','.join(agents)
-        for x in agents:
-            if any([arch in x for arch in ["gfx908", "gfx90a", "gfx942", "gfx950"]]):
+        default_agent = get_default_agent()
+        if default_agent:
+            if len(agents) > 1:
+                config.multi_gpu_detected = True
+                if user_hip_visible is not None:
+                    # User specified which GPU to use - respect it
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using user-specified HIP_VISIBLE_DEVICES=%s. "
+                                    "Device 0 (after filtering) architecture: '%s'."
+                                    % (', '.join(sorted(agents)), user_hip_visible, default_agent))
+                else:
+                    # No user preference - use device 0 and set HIP_VISIBLE_DEVICES
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using device 0 architecture '%s' for E2E tests. "
+                                    "HIP_VISIBLE_DEVICES will be set to '0' to ensure binary compatibility."
+                                    % (', '.join(sorted(agents)), default_agent))
+            config.arch = default_agent
+            # Check features for the device we'll actually use
+            if any(arch in default_agent for arch in ["gfx908", "gfx90a", "gfx942", "gfx950"]):
                 config.arch_support_mfma = True
-            elif "gfx11" in x or "gfx12" in x:
+            elif "gfx11" in default_agent or "gfx12" in default_agent:
                 config.arch_support_wmma = True
-        # Check other features here
         if not config.arch:
             config.no_AMD_GPU = True
     except subprocess.CalledProcessError:
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index caecd5729892..ecf63326d695 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -33,13 +33,21 @@
 config.substitutions.append(('%shlibext', config.llvm_shlib_ext))
 config.substitutions.append(("%mlir_src_root", config.mlir_src_root))
 config.substitutions.append(('%random_data', config.random_data))
-config.substitutions.append(('%constrained_float_range_random_data',
-                             config.constrained_float_range_random_data))
+config.substitutions.append(
+    ('%constrained_float_range_random_data', config.constrained_float_range_random_data))
 config.substitutions.append(('%rocmlir_gen_flags', config.rocmlir_gen_flags))
 config.substitutions.append(('%arch', config.arch))
 config.substitutions.append(('%pv', config.populate_validation))
 
-llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP'])
+llvm_config.with_system_environment(
+    ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', 'HIP_VISIBLE_DEVICES'])
+
+# When multiple GPU architectures were detected, limit HIP to device 0 so
+# that compiled binaries match the execution device.
+# A HIP_VISIBLE_DEVICES value already set by the user is left untouched.
+if getattr(config, 'multi_gpu_detected', False):
+    if 'HIP_VISIBLE_DEVICES' not in os.environ:
+        config.environment['HIP_VISIBLE_DEVICES'] = '0'
 
 ##############
 # FIXME: adding a path to the environment isn't appearing to work as
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index 3a0c61cf50e4..5080558d8853 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -59,7 +59,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@"
 
 # Add common python test utils
 sys.path.append(config.rocmlir_common_python_tests_utils)
-from common import get_agents, get_arch_features
+from common import get_agents, get_arch_features, get_default_agent
 
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
@@ -73,19 +73,40 @@ except KeyError:
 
 # If rocm_agent_enumerator shows no viable GPUs, skip tests that need one,
 # because the default target will lead to compilation failures.
+import os
 config.no_AMD_GPU = False
 config.arch = ""
 config.arch_support_mfma = False
 config.arch_support_wmma = False
 config.arch_support_accel_fp8 = False
 config.features = None
+config.multi_gpu_detected = False
 if config.rocm_path:
     try:
+        # Check if user already set HIP_VISIBLE_DEVICES - respect their choice
+        user_hip_visible = os.environ.get('HIP_VISIBLE_DEVICES')
+
         agents = get_agents()
-        config.arch = ','.join(agents)
-        for x in agents:
-            if not config.features:
-                config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(x)
+        default_agent = get_default_agent()
+        if default_agent:
+            if len(agents) > 1:
+                config.multi_gpu_detected = True
+                if user_hip_visible is not None:
+                    # User specified which GPU to use - respect it
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using user-specified HIP_VISIBLE_DEVICES=%s. "
+                                    "Device 0 (after filtering) architecture: '%s'."
+                                    % (', '.join(sorted(agents)), user_hip_visible, default_agent))
+                else:
+                    # No user preference - use device 0 and set HIP_VISIBLE_DEVICES
+                    lit_config.note("Multiple GPU architectures detected: %s. "
+                                    "Using device 0 architecture '%s' for E2E tests. "
+                                    "HIP_VISIBLE_DEVICES will be set to '0' to ensure binary compatibility."
+                                    % (', '.join(sorted(agents)), default_agent))
+            config.arch = default_agent
+            # Get features for the device we'll actually use
+            config.features, config.arch_support_mfma, config.arch_support_wmma, config.arch_support_accel_fp8 = get_arch_features(default_agent)
+            if config.features:
                 config.substitutions.append(('%features', config.features))
         if not config.arch:
             config.no_AMD_GPU = True