From fb2936700fe4ebdcbf00de8166ce86e4e03e2d16 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Fri, 12 Dec 2025 08:24:12 +0000 Subject: [PATCH 1/2] NVIDIA: SAUCE: acpi: generic initiator in sorted order During creation of the VM's SRAT table, the generic initiator entries are added. Currently, the code queries the objects, which may not be in sorted order. This results in a mismatch between the VM's view of the PXM and the NUMA node IDs. As a fix, the patch builds a list of generic initiator objects, sorts them, and then puts them in the VM's SRAT table. Original (unsorted) PXM in the VM SRAT table [152h 0338 004h] Proximity Domain : 00000000 [17Ah 0378 004h] Proximity Domain : 00000001 [1A4h 0420 004h] Proximity Domain : 00000007 [1C4h 0452 004h] Proximity Domain : 00000006 [1E4h 0484 004h] Proximity Domain : 00000005 [204h 0516 004h] Proximity Domain : 00000004 [224h 0548 004h] Proximity Domain : 00000003 [244h 0580 004h] Proximity Domain : 00000009 [264h 0612 004h] Proximity Domain : 00000002 [284h 0644 004h] Proximity Domain : 00000008 [2A2h 0674 004h] Proximity Domain : 00000009 After the patch (sorted) [152h 0338 004h] Proximity Domain : 00000000 [17Ah 0378 004h] Proximity Domain : 00000001 [1A4h 0420 004h] Proximity Domain : 00000002 [1C4h 0452 004h] Proximity Domain : 00000003 [1E4h 0484 004h] Proximity Domain : 00000004 [204h 0516 004h] Proximity Domain : 00000005 [224h 0548 004h] Proximity Domain : 00000006 [244h 0580 004h] Proximity Domain : 00000007 [264h 0612 004h] Proximity Domain : 00000008 [284h 0644 004h] Proximity Domain : 00000009 Fixes: 0a5b5acdf2 ("hw/acpi: Implement the SRAT GI affinity structure") Signed-off-by: Ankit Agrawal --- hw/acpi/pci.c | 70 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/hw/acpi/pci.c b/hw/acpi/pci.c index 2228f1245e2..a6e474af0f7 100644 --- a/hw/acpi/pci.c +++ b/hw/acpi/pci.c @@ -145,20 +145,56 @@ static void acpi_generic_initiator_class_init(ObjectClass 
*oc, const void *data) "NUMA node associated with the PCI device"); } -static int build_acpi_generic_initiator(Object *obj, void *opaque) +static gint memory_device_addr_sort(gconstpointer a, gconstpointer b) +{ + const AcpiGenericInitiator *gi_a = a; + const AcpiGenericInitiator *gi_b = b; + + if (gi_a->node > gi_b->node) { + return 1; + } else if (gi_a->node < gi_b->node) { + return -1; + } + return 0; +} + +static int acpi_generic_initiator_list(Object *obj, void *opaque) +{ + GSList **list = opaque; + + if (object_dynamic_cast(obj, TYPE_ACPI_GENERIC_INITIATOR)) { + *list = g_slist_insert_sorted(*list, ACPI_GENERIC_INITIATOR(obj), + memory_device_addr_sort); + } + + object_child_foreach(obj, acpi_generic_initiator_list, opaque); + return 0; +} + +/* + * Identify Generic Initiator objects and link them into the list which is + * returned to the caller. + * + * Note: it is the caller's responsibility to free the list to avoid + * memory leak. + */ +static GSList *acpi_generic_initiator_get_list(void) +{ + GSList *list = NULL; + + object_child_foreach(object_get_root(), + acpi_generic_initiator_list, &list); + return list; +} + +static int build_acpi_generic_initiator(AcpiGenericInitiator *gi, + GArray *table_data) { MachineState *ms = MACHINE(qdev_get_machine()); - AcpiGenericInitiator *gi; - GArray *table_data = opaque; int32_t devfn; uint8_t bus; Object *o; - if (!object_dynamic_cast(obj, TYPE_ACPI_GENERIC_INITIATOR)) { - return 0; - } - - gi = ACPI_GENERIC_INITIATOR(obj); if (gi->node >= ms->numa_state->num_nodes) { error_printf("%s: Specified node %d is invalid.\n", TYPE_ACPI_GENERIC_INITIATOR, gi->node); @@ -182,6 +218,19 @@ static int build_acpi_generic_initiator(Object *obj, void *opaque) return 0; } +static void build_all_acpi_generic_initiators(GArray *table_data) +{ + GSList *gi_list, *list = acpi_generic_initiator_get_list(); + AcpiGenericInitiator *gi; + + for (gi_list = list; gi_list; gi_list = gi_list->next) { + gi = gi_list->data; + 
build_acpi_generic_initiator(gi, table_data); + } + + g_slist_free(list); +} + typedef struct AcpiGenericPort { /* private */ Object parent; @@ -295,9 +344,8 @@ static int build_acpi_generic_port(Object *obj, void *opaque) void build_srat_generic_affinity_structures(GArray *table_data) { - object_child_foreach_recursive(object_get_root(), - build_acpi_generic_initiator, - table_data); + build_all_acpi_generic_initiators(table_data); + object_child_foreach_recursive(object_get_root(), build_acpi_generic_port, table_data); } From c12786a4d73d969bba42873f497e96e43106e362 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Fri, 12 Dec 2025 08:40:41 +0000 Subject: [PATCH 2/2] NVIDIA: SAUCE: hw/vfio: adjust alignment for hugepfnmap QEMU's determination of the VMA address for a region needs an update to handle regions that may be a BAR, but whose actual mapping size is not power-of-2 aligned. This happens on Grace-based systems, where the device memory is exposed as a BAR. The mapping, however, is only the size of the actual physical memory, which may not be power-of-2 aligned. This affects hugepfnmap mappings on such regions. The current algorithm determines the VMA address alignment from the alignment of the mapping size (its lowest set bit). This needs to change so that it is based on the mapping size rounded up to the next power of 2. This patch updates the algorithm to achieve that alignment. 
Original VMA mapping to the device memory of size 0x2F00F00000 on a GB200 ff88ff000000-ffb7fff00000 rw-s 400000000000 00:06 727 /dev/vfio/devices/vfio1 After the patch application (aligned at order 13 PMD) ff8ac0000000-ffb9c0f00000 rw-s 400000000000 00:06 727 /dev/vfio/devices/vfio1 Signed-off-by: Ankit Agrawal --- hw/vfio/region.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/vfio/region.c b/hw/vfio/region.c index d04c57db630..1649e409eff 100644 --- a/hw/vfio/region.c +++ b/hw/vfio/region.c @@ -252,7 +252,7 @@ int vfio_region_mmap(VFIORegion *region) prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0; for (i = 0; i < region->nr_mmaps; i++) { - size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB); + size_t align = MIN(pow2ceil(region->mmaps[i].size), 1 * GiB); void *map_base, *map_align; /*