From cf2997020b0b2afc94c89e5a988192a9fa1a0bb2 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 31 Dec 2025 15:14:09 -0500 Subject: [PATCH 1/4] Enable narrow PCH for polymorphic_value.h Precompile polymorphic_value.h to eliminate ~4000s of redundant header parsing. Enabled by default for Release builds. Disable with -DNVFUSER_USE_POLYMORPHIC_PCH=OFF. --- CMakeLists.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 86497ad8306..f479e2a8e8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -564,6 +564,23 @@ endif() target_link_libraries(codegen_internal PUBLIC LLVM_JIT) +# Narrow PCH for polymorphic_value.h +# Post-M8, template instantiation is reduced by 81%, making header parsing +# a significant fraction of build cost. This PCH targets the heaviest headers. +# Enabled by default for Release builds (provides ~50% build time improvement). +if(CMAKE_BUILD_TYPE STREQUAL "Release") + option(NVFUSER_USE_POLYMORPHIC_PCH "Use PCH for polymorphic_value.h to reduce parse time" ON) +else() + option(NVFUSER_USE_POLYMORPHIC_PCH "Use PCH for polymorphic_value.h to reduce parse time" OFF) +endif() + +if(NVFUSER_USE_POLYMORPHIC_PCH) + message(STATUS "Enabling narrow PCH for polymorphic_value.h") + target_precompile_headers(codegen_internal PRIVATE + "${NVFUSER_SRCS_DIR}/polymorphic_value.h" + ) +endif() + add_library(nvfuser_codegen SHARED $<TARGET_OBJECTS:codegen_internal>) if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) From 1d9912de5a8a43460a39d1022a9d6d531c1a51b0 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 31 Dec 2025 16:07:40 -0500 Subject: [PATCH 2/4] Extend PCH to test targets --- CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f479e2a8e8e..b06c128535d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1129,6 +1129,15 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK) add_executable(${TEST_NAME} 
${TEST_SRC}) set_property(TARGET ${TEST_NAME} PROPERTY CXX_STANDARD ${NVFUSER_CPP_STANDARD}) target_compile_definitions(${TEST_NAME} PRIVATE USE_GTEST) + + # Create separate PCH for test targets (can't reuse from codegen_internal + # due to -fpie flag difference between library and executable) + if(NVFUSER_USE_POLYMORPHIC_PCH) + target_precompile_headers(${TEST_NAME} PRIVATE + "${NVFUSER_SRCS_DIR}/polymorphic_value.h" + ) + endif() + target_include_directories(${TEST_NAME} PRIVATE "${NVFUSER_ROOT}") target_include_directories(${TEST_NAME} SYSTEM PRIVATE ${NVFUSER_ROOT}/third_party/googletest/googletest/include From 31c3c4650ae2b66bfe9708fd7e90532f3ad716f6 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Thu, 1 Jan 2026 13:45:09 -0500 Subject: [PATCH 3/4] Expand PCH to include top nvFuser headers --- CMakeLists.txt | 53 +++++++++++++++++++++++------ csrc/multidevice/symmetric_tensor.h | 8 ++--- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b06c128535d..80fa5869376 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -564,20 +564,31 @@ endif() target_link_libraries(codegen_internal PUBLIC LLVM_JIT) -# Narrow PCH for polymorphic_value.h +# Precompiled Headers for Top nvFuser Headers # Post-M8, template instantiation is reduced by 81%, making header parsing -# a significant fraction of build cost. This PCH targets the heaviest headers. +# a significant fraction of build cost. This PCH targets the top 10 heaviest +# nvFuser-controllable headers by exclusive parse time (from M9 Task 4 analysis). # Enabled by default for Release builds (provides ~50% build time improvement). 
if(CMAKE_BUILD_TYPE STREQUAL "Release") - option(NVFUSER_USE_POLYMORPHIC_PCH "Use PCH for polymorphic_value.h to reduce parse time" ON) + option(NVFUSER_USE_POLYMORPHIC_PCH "Use PCH for top nvFuser headers to reduce parse time" ON) else() - option(NVFUSER_USE_POLYMORPHIC_PCH "Use PCH for polymorphic_value.h to reduce parse time" OFF) + option(NVFUSER_USE_POLYMORPHIC_PCH "Use PCH for top nvFuser headers to reduce parse time" OFF) endif() if(NVFUSER_USE_POLYMORPHIC_PCH) - message(STATUS "Enabling narrow PCH for polymorphic_value.h") + message(STATUS "Enabling PCH for top 10 nvFuser headers") target_precompile_headers(codegen_internal PRIVATE - "${NVFUSER_SRCS_DIR}/polymorphic_value.h" + # Top 10 nvFuser headers by exclusive parse time (M9 Task 4 analysis) + "${NVFUSER_SRCS_DIR}/polymorphic_value.h" # 1675s (27.9m) + "${NVFUSER_ROOT}/lib/dynamic_type/src/dynamic_type/type_traits.h" # 473.6s (7.9m) + "${NVFUSER_SRCS_DIR}/ir/base_nodes.h" # 284.5s (4.7m) + "${NVFUSER_SRCS_DIR}/scheduler/tools/abstract_tensor.h" # 162.1s (2.7m) + "${NVFUSER_SRCS_DIR}/type.h" # 81.6s (1.4m) + "${NVFUSER_SRCS_DIR}/ir/container.h" # 51.6s (0.9m) + "${NVFUSER_SRCS_DIR}/serde/fusion_cache_generated.h" # 44.1s (0.7m) + "${NVFUSER_SRCS_DIR}/iter_visitor.h" # 38.2s (0.6m) + "${NVFUSER_SRCS_DIR}/ir/internal_nodes.h" # 33.3s (0.6m) + "${NVFUSER_SRCS_DIR}/ir/interface_nodes.h" # 29.6s (0.5m) ) endif() @@ -1130,12 +1141,32 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK) set_property(TARGET ${TEST_NAME} PROPERTY CXX_STANDARD ${NVFUSER_CPP_STANDARD}) target_compile_definitions(${TEST_NAME} PRIVATE USE_GTEST) - # Create separate PCH for test targets (can't reuse from codegen_internal - # due to -fpie flag difference between library and executable) + # PCH for test targets: All test executables share a single PCH to avoid + # redundant compilation. The first test target (test_nvfuser) creates the PCH, + # and all subsequent tests reuse it via REUSE_FROM. 
+ # Note: Can't reuse from codegen_internal due to -fPIC flag difference. if(NVFUSER_USE_POLYMORPHIC_PCH) - target_precompile_headers(${TEST_NAME} PRIVATE - "${NVFUSER_SRCS_DIR}/polymorphic_value.h" - ) + get_property(NVFUSER_TEST_PCH_TARGET GLOBAL PROPERTY NVFUSER_TEST_PCH_TARGET) + if(NOT NVFUSER_TEST_PCH_TARGET) + # First test target: create the PCH with top 10 nvFuser headers + message(STATUS "Creating shared test PCH on target: ${TEST_NAME}") + target_precompile_headers(${TEST_NAME} PRIVATE + "${NVFUSER_SRCS_DIR}/polymorphic_value.h" + "${NVFUSER_ROOT}/lib/dynamic_type/src/dynamic_type/type_traits.h" + "${NVFUSER_SRCS_DIR}/ir/base_nodes.h" + "${NVFUSER_SRCS_DIR}/scheduler/tools/abstract_tensor.h" + "${NVFUSER_SRCS_DIR}/type.h" + "${NVFUSER_SRCS_DIR}/ir/container.h" + "${NVFUSER_SRCS_DIR}/serde/fusion_cache_generated.h" + "${NVFUSER_SRCS_DIR}/iter_visitor.h" + "${NVFUSER_SRCS_DIR}/ir/internal_nodes.h" + "${NVFUSER_SRCS_DIR}/ir/interface_nodes.h" + ) + set_property(GLOBAL PROPERTY NVFUSER_TEST_PCH_TARGET ${TEST_NAME}) + else() + # Subsequent test targets: reuse existing PCH + target_precompile_headers(${TEST_NAME} REUSE_FROM ${NVFUSER_TEST_PCH_TARGET}) + endif() endif() target_include_directories(${TEST_NAME} PRIVATE "${NVFUSER_ROOT}") diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h index 5608153e0ce..10a143516f0 100644 --- a/csrc/multidevice/symmetric_tensor.h +++ b/csrc/multidevice/symmetric_tensor.h @@ -71,12 +71,12 @@ class SymmetricTensor { size_t requested_size_; mutable bool are_remote_tensors_setup_ = false; bool is_multicast_setup_ = false; - CUmemGenericAllocationHandle mcast_handle_{}; - CUdevice cu_dev_{}; + [[maybe_unused]] CUmemGenericAllocationHandle mcast_handle_{}; + [[maybe_unused]] CUdevice cu_dev_{}; void* mc_ptr_{nullptr}; CUdeviceptr mc_base_ptr_{0}; - int exporter_rank_{-1}; - int peer_fd_{-1}; + [[maybe_unused]] int exporter_rank_{-1}; + [[maybe_unused]] int peer_fd_{-1}; bool 
is_contiguous_view_setup_ = false; at::Tensor contiguous_view_; }; From 2da82fbac0cdde2475950e2e4a99affead1e1fb1 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 11 Jan 2026 16:02:24 -0500 Subject: [PATCH 4/4] Fix Clang unused-private-field warning for mc_base_ptr_ --- csrc/multidevice/symmetric_tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h index 10a143516f0..64ce111498c 100644 --- a/csrc/multidevice/symmetric_tensor.h +++ b/csrc/multidevice/symmetric_tensor.h @@ -74,7 +74,7 @@ class SymmetricTensor { [[maybe_unused]] CUmemGenericAllocationHandle mcast_handle_{}; [[maybe_unused]] CUdevice cu_dev_{}; void* mc_ptr_{nullptr}; - CUdeviceptr mc_base_ptr_{0}; + [[maybe_unused]] CUdeviceptr mc_base_ptr_{0}; [[maybe_unused]] int exporter_rank_{-1}; [[maybe_unused]] int peer_fd_{-1}; bool is_contiguous_view_setup_ = false;