From 379e88d8b7e7fad6b810332f6244e34a4726eb09 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Thu, 29 Jan 2026 13:31:45 -0800 Subject: [PATCH 1/3] Print propagation for debugging --- csrc/base.h | 24 ++++++++++++---------- csrc/multidevice/propagation.cpp | 7 +++++++ csrc/preseg_passes/propagate_shardings.cpp | 1 - csrc/scheduler/utils.cpp | 16 +++++++++++++++ csrc/scheduler/utils.h | 2 ++ 5 files changed, 38 insertions(+), 12 deletions(-) diff --git a/csrc/base.h b/csrc/base.h index 88052e9e259..e1d69c621fd 100644 --- a/csrc/base.h +++ b/csrc/base.h @@ -281,25 +281,27 @@ SPECIALIZE_PRINTER(VoidStar); SPECIALIZE_PRINTER(uint32_t); SPECIALIZE_PRINTER(int64_t); SPECIALIZE_PRINTER(uint64_t); -SPECIALIZE_PRINTER(DataType); -SPECIALIZE_PRINTER(MemoryType); -SPECIALIZE_PRINTER(UnaryOpType); + SPECIALIZE_PRINTER(BinaryOpType); -SPECIALIZE_PRINTER(TernaryOpType); -SPECIALIZE_PRINTER(LoadStoreOpType); SPECIALIZE_PRINTER(CircularBufferLoopStage); -SPECIALIZE_PRINTER(tma::TensorMapInterleave); -SPECIALIZE_PRINTER(tma::TensorMapL2Promotion); -SPECIALIZE_PRINTER(tma::TensorMapFloatOOBFill); +SPECIALIZE_PRINTER(DataType); +SPECIALIZE_PRINTER(LoadStoreOpType); +SPECIALIZE_PRINTER(MemoryType); SPECIALIZE_PRINTER(MmaInputSmemSwizzle); -SPECIALIZE_PRINTER(SwizzleType); +SPECIALIZE_PRINTER(ParallelType); SPECIALIZE_PRINTER(Swizzle2DType); SPECIALIZE_PRINTER(SwizzleMode); +SPECIALIZE_PRINTER(SwizzleType); +SPECIALIZE_PRINTER(TernaryOpType); +SPECIALIZE_PRINTER(UnaryOpType); +SPECIALIZE_PRINTER(std::optional); +SPECIALIZE_PRINTER(std::vector); SPECIALIZE_PRINTER(std::vector); SPECIALIZE_PRINTER(std::vector); -SPECIALIZE_PRINTER(std::vector); SPECIALIZE_PRINTER(std::vector); -SPECIALIZE_PRINTER(std::optional); +SPECIALIZE_PRINTER(tma::TensorMapFloatOOBFill); +SPECIALIZE_PRINTER(tma::TensorMapInterleave); +SPECIALIZE_PRINTER(tma::TensorMapL2Promotion); #undef SPECIALIZE_PRINTER diff --git a/csrc/multidevice/propagation.cpp b/csrc/multidevice/propagation.cpp index d5732333bc6..2c24fc9a0d2 100644 --- a/csrc/multidevice/propagation.cpp +++ b/csrc/multidevice/propagation.cpp @@ -12,6 +12,7 @@ #include #include +#include "base.h" #include "ir/interface_nodes.h" #include "ir/internal_base_nodes.h" #include "ir/internal_nodes.h" @@ -255,6 +256,12 @@ void shardLoopLike( TensorView* tv, const std::unordered_set& selected_parallel_types, PropagateDirection direction) { + if (isDebugDumpEnabled(DebugDumpOption::PreSegmenterLogging)) { + debug() << "Propagating shardings from " << ref->toString() << " to " + << tv->toString() << " in " << direction << " for " + << toDelimitedString(selected_parallel_types) << std::endl; + } + std::unordered_set device_or_stream_ids; const std::unordered_map ref2target = getRef2TargetMap(ref, tv, direction); diff --git a/csrc/preseg_passes/propagate_shardings.cpp b/csrc/preseg_passes/propagate_shardings.cpp index d0f326f992d..b6867d238c9 100644 --- a/csrc/preseg_passes/propagate_shardings.cpp +++ b/csrc/preseg_passes/propagate_shardings.cpp @@ -13,7 +13,6 @@ #include "ir/iostream.h" #include "ir/utils.h" #include "multidevice/propagation.h" -#include "multidevice/utils.h" #include "scheduler/utils.h" namespace nvfuser::preseg_passes { diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp index e0bc874dcbb..9da73acb46c 100644 --- a/csrc/scheduler/utils.cpp +++ b/csrc/scheduler/utils.cpp @@ -43,6 +43,22 @@ #include #include +namespace nvfuser { + +std::ostream& operator<<(std::ostream& os, PropagateDirection direction) { + switch (direction) { + case PropagateDirection::kForward: + os << "Forward"; + break; + case PropagateDirection::kBackward: + os << "Backward"; + break; + } + return os; +} + +} // namespace nvfuser + namespace nvfuser::scheduler_utils { // Minimal PTX code for a no-op kernel, used for occupancy queries diff --git a/csrc/scheduler/utils.h b/csrc/scheduler/utils.h index 305cbfcdf67..b1eb8d50c33 100644 --- a/csrc/scheduler/utils.h +++ b/csrc/scheduler/utils.h @@ -29,6 +29,8 @@ class HeuristicDataCache; //! BoundedDirectionalTransformPropagator. enum class PropagateDirection { kBackward = 0, kForward }; +std::ostream& operator<<(std::ostream& os, PropagateDirection direction); + namespace scheduler_utils { // Assume any only half of the register file is available to spend on buffers, From 0ee71013413bcce2e04f2cc1f8b20de86ffe4ef1 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Fri, 30 Jan 2026 15:53:52 -0800 Subject: [PATCH 2/3] Use TransformPropagator's debugging option --- csrc/multidevice/propagation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/multidevice/propagation.cpp b/csrc/multidevice/propagation.cpp index 2c24fc9a0d2..180797af2e3 100644 --- a/csrc/multidevice/propagation.cpp +++ b/csrc/multidevice/propagation.cpp @@ -256,7 +256,7 @@ void shardLoopLike( TensorView* tv, const std::unordered_set& selected_parallel_types, PropagateDirection direction) { - if (isDebugDumpEnabled(DebugDumpOption::PreSegmenterLogging)) { + if (isDebugDumpEnabled(DebugDumpOption::TransformPropagator)) { debug() << "Propagating shardings from " << ref->toString() << " to " << tv->toString() << " in " << direction << " for " << toDelimitedString(selected_parallel_types) << std::endl; From 080fdea4d34173ae4cf83abe94c372c9a4e9c5ad Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Mon, 2 Feb 2026 21:45:43 -0800 Subject: [PATCH 3/3] Update csrc/multidevice/propagation.cpp Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- csrc/multidevice/propagation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/multidevice/propagation.cpp b/csrc/multidevice/propagation.cpp index 5985f1235d2..2e372b63f34 100644 --- a/csrc/multidevice/propagation.cpp +++ b/csrc/multidevice/propagation.cpp @@ -258,7 +258,7 @@ void shardLoopLike( PropagateDirection direction) { if (isDebugDumpEnabled(DebugDumpOption::TransformPropagator)) { debug() << "Propagating shardings from " << ref->toString() << " to " - << tv->toString() << " in " << direction << " for " + << target->toString() << " in " << direction << " for " << toDelimitedString(selected_parallel_types) << std::endl; }