From cd122c8d063ecb19f8d32a5331312bba552f33dd Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Tue, 27 Jan 2026 20:53:37 -0800
Subject: [PATCH 01/10] working setup of a swizzled stream id

---
 csrc/host_ir/evaluator.cpp           | 15 ++++--
 csrc/host_ir/ir.cpp                  | 33 +++++++++++++
 csrc/host_ir/ir.h                    | 39 ++++++++++++++++
 csrc/host_ir/ops.cpp                 | 26 +++++++++++
 csrc/host_ir/ops.h                   |  4 ++
 csrc/multidevice/utils.cpp           |  3 ++
 tests/cpp/test_host_ir_evaluator.cpp | 69 ++++++++++++++++++++++++++++
 7 files changed, 185 insertions(+), 4 deletions(-)

diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
index 2ceedfddc40..fee623992f8 100644
--- a/csrc/host_ir/evaluator.cpp
+++ b/csrc/host_ir/evaluator.cpp
@@ -806,14 +806,21 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   IterDomain* stream_id = *i;
 
   auto in_tensor = getKnownConcreteValue(shard->in()).as<at::Tensor>();
-  auto stream_index =
-      expr_evaluator_.evaluate(shard->stream_index()).as<int64_t>();
+  auto index = expr_evaluator_.evaluate(shard->stream_index()).as<int64_t>();
+
+  if (stream_id->definition() != nullptr) {
+    NVF_CHECK(stream_id->definition()->isA<Swizzle>());
+    auto* swizzle = stream_id->definition()->as<Swizzle>();
+    int64_t offset = swizzle->offset()->evaluate().as<int64_t>();
+    index += offset;
+  }
+
   at::Tensor out_tensor =
       in_tensor
           .chunk(
-              stream_id->extent()->evaluate().as<int64_t>(),
+              expr_evaluator_.evaluate(stream_id->extent()).as<int64_t>(),
               getShardedLogicalAxis(out_tv, ParallelType::Stream))
-          .at(stream_index);
+          .at(index);
 
   expr_evaluator_.bind(out_tv, out_tensor);
 }
diff --git a/csrc/host_ir/ir.cpp b/csrc/host_ir/ir.cpp
index 198601355fb..b29df6248f9 100644
--- a/csrc/host_ir/ir.cpp
+++ b/csrc/host_ir/ir.cpp
@@ -503,4 +503,37 @@ std::string ForLoop::toInlineString(int indent_size) const {
       index, iter_domain->start(), iter_domain->stop());
 }
 
+Swizzle::Swizzle(
+    IrBuilderPasskey passkey,
+    IterDomain* in,
+    IterDomain* out,
+    Val* offset)
+    : Expr(passkey, {in}, {out}, {offset}) {
+  NVF_ERROR(passkey.ir_container_ != nullptr);
+  NVF_ERROR(
+      passkey.ir_container_->isA<HostIrContainer>(),
+      this,
+      "must be registered in a HostIrContainer");
+  NVF_ERROR(in != nullptr);
+  NVF_ERROR(out != nullptr);
+  NVF_ERROR(offset != nullptr);
+}
+
+NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle)
+
+std::string Swizzle::toString(int indent_size) const {
+  std::stringstream ss;
+  indent(ss, indent_size) << out()->toString() << " = Swizzle("
+                          << in()->toString()
+                          << ", offset=" << offset()->toString() << std::endl;
+  return ss.str();
+}
+
+std::string Swizzle::toInlineString(int indent_size) const {
+  std::stringstream ss;
+  indent(ss, indent_size) << "Swizzle(" << in()->toInlineString()
+                          << ", offset=" << offset()->toInlineString() << ")";
+  return ss.str();
+}
+
 } // namespace nvfuser::hir
diff --git a/csrc/host_ir/ir.h b/csrc/host_ir/ir.h
index d01263f2e0d..7c2584c57c4 100644
--- a/csrc/host_ir/ir.h
+++ b/csrc/host_ir/ir.h
@@ -569,4 +569,43 @@ class ForLoop : public Expr {
   }
 };
 
+class Swizzle : public Expr {
+ public:
+  using Expr::Expr;
+
+  Swizzle(
+      IrBuilderPasskey passkey,
+      IterDomain* in,
+      IterDomain* out,
+      Val* offset);
+
+  Swizzle(const Swizzle& other) = delete;
+  Swizzle& operator=(const Swizzle& other) = delete;
+  Swizzle(Swizzle&& other) = delete;
+  Swizzle& operator=(Swizzle&& other) = delete;
+
+  NVFUSER_DECLARE_CLONE_AND_CREATE
+
+  std::string toString(int indent_size = 0) const override;
+  std::string toInlineString(int indent_size = 0) const override;
+  const char* getOpString() const override {
+    return "hir::Swizzle";
+  }
+
+  // Input iterdomain to be swizzled
+  IterDomain* in() const {
+    return inputs().at(0)->as<IterDomain>();
+  }
+
+  // Output swizzled iterdomain
+  IterDomain* out() const {
+    return outputs().at(0)->as<IterDomain>();
+  }
+
+  // Swizzle offset parameter
+  Val* offset() const {
+    return attributeVal(0);
+  }
+};
+
 } // namespace nvfuser::hir
diff --git a/csrc/host_ir/ops.cpp b/csrc/host_ir/ops.cpp
index 05fd42e2764..d862822cd97 100644
--- a/csrc/host_ir/ops.cpp
+++ b/csrc/host_ir/ops.cpp
@@ -24,6 +24,32 @@
 
 namespace nvfuser::hir {
 
+IterDomain* swizzle(IterDomain* in, Val* offset) {
+  NVF_ERROR(in != nullptr, "Input IterDomain cannot be null");
+  NVF_ERROR(offset != nullptr, "Swizzle offset parameter cannot be null");
+
+  // Create output IterDomain with same properties as input
+  auto* out = IterDomainBuilder(in).build();
+
+  // Create the Swizzle expression
+  IrBuilder::create<Swizzle>(in, out, offset);
+
+  return out;
+}
+
+TensorView* swizzle(TensorView* in, int64_t axis, Val* offset) {
+  NVF_ERROR(in != nullptr);
+  NVF_ERROR(offset != nullptr);
+
+  IterDomain* out_id = swizzle(in->axis(axis), offset);
+  std::vector<IterDomain*> loop_domain = in->getLoopDomain();
+  loop_domain.erase(loop_domain.begin() + axis);
+  loop_domain.insert(loop_domain.begin() + axis, out_id);
+  in->setLoopDomain(loop_domain);
+
+  return in;
+}
+
 TensorView* shardByStream(TensorView* source, Val* stream_index, Expr* e) {
   NVF_ERROR(
       getShardedIterDomain(
diff --git a/csrc/host_ir/ops.h b/csrc/host_ir/ops.h
index 66c90082427..9a3257874e2 100644
--- a/csrc/host_ir/ops.h
+++ b/csrc/host_ir/ops.h
@@ -20,6 +20,10 @@
 
 namespace nvfuser::hir {
 
+IterDomain* swizzle(IterDomain* in, Val* offset);
+
+TensorView* swizzle(TensorView* in, int64_t axis, Val* offset);
+
 // Creates a ShardByStream without needing the destination TensorView. Returns
 // the destination TensorView. `e` is the Expr from which we propagate the loop
 // domain from. `source` must be either an input or an output of `e`. The
diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp
index 21d459889b8..6fc9933226d 100644
--- a/csrc/multidevice/utils.cpp
+++ b/csrc/multidevice/utils.cpp
@@ -14,6 +14,7 @@
 #include
 
 #include "compute_at_map.h"
+#include "host_ir/ir.h"
 #include "ir/internal_base_nodes.h"
 #include "ir/internal_nodes.h"
 #include "transform_replay.h"
@@ -178,6 +179,8 @@ int64_t getProducingLogicalAxis(const TensorView* tv, IterDomain* id) {
       // When `unshardedSizes` is given a local tensor of shape [1, 1], it's
       // unclear the global shape is [1, D] or [D, 1] or even [2, D/2], etc.
       id = merge->outer();
+    } else if (auto* swizzle = dynamic_cast<hir::Swizzle*>(def)) {
+      id = swizzle->in();
     } else {
       NVF_THROW(
           "Unexpected transforms from logical to a DID-parallel allocation "
diff --git a/tests/cpp/test_host_ir_evaluator.cpp b/tests/cpp/test_host_ir_evaluator.cpp
index aa4f933f65e..2be4c60d3ed 100644
--- a/tests/cpp/test_host_ir_evaluator.cpp
+++ b/tests/cpp/test_host_ir_evaluator.cpp
@@ -16,6 +16,7 @@
 #include "fusion.h"
 #include "host_ir/container.h"
 #include "host_ir/evaluator.h"
+#include "host_ir/ops.h"
 #include "ir/builder.h"
 #include "ir/interface_nodes.h"
 #include "ops/alias.h"
@@ -222,4 +223,72 @@ TEST_F(HostIrEvaluatorTest, AddInLoop) {
       << out_tensor << " vs " << expected_out_tensor;
 }
 
+TEST_F(HostIrEvaluatorTest, SwizzleCopy) {
+  constexpr int64_t c = 3;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
+  at::Tensor in_tensor = at::randn({c * 5}, options);
+
+  auto hic = std::make_unique<HostIrContainer>();
+  FusionGuard fg(hic.get());
+  {
+    TensorView* in_tv = makeContigTensor(1);
+    TensorView* out_tv = set(in_tv);
+    hic->addInput(in_tv);
+    hic->addOutput(out_tv);
+
+    for (auto* tv : {in_tv, out_tv}) {
+      tv->setMemoryType(MemoryType::Global);
+      tv->outer_split(0, c);
+    }
+    auto* allocate_out = IrBuilder::create<kir::Allocate>(
+        out_tv, MemoryType::Global, std::vector<Val*>({}), /*zero_init=*/true);
+
+    Val* offset = IrBuilder::create<Val>(1, DataType::Index);
+    in_tv = swizzle(in_tv, 0, offset);
+    out_tv = swizzle(out_tv, 0, offset);
+    in_tv->axis(0)->parallelize(ParallelType::Stream);
+    out_tv->axis(0)->parallelize(ParallelType::Stream);
+
+    auto* stream_index = IrBuilder::create<Val>(DataType::Index);
+    auto* for_loop = IrBuilder::create<ForLoop>(
+        stream_index,
+        /*start=*/hic->zeroVal(DataType::Index),
+        /*stop=*/IrBuilder::create<Val>(c - 1, DataType::Index));
+
+    TensorView* in_shard =
+        ops::newValLike(in_tv, *in_tv->getDataType())->as<TensorView>();
+    TensorView* out_shard =
+        ops::newValLike(out_tv, *out_tv->getDataType())->as<TensorView>();
+
+    for (auto* tv : {in_shard, out_shard}) {
+      tv->outer_split(0, c);
+      tv = swizzle(tv, 0, offset);
+      tv->axis(0)->parallelize(ParallelType::Stream);
+      tv->setAllocationDomain(tv->getLoopDomain(), true);
+    }
+
+    IrBuilder::create<ShardByStream>(in_shard, in_tv, stream_index);
+    IrBuilder::create<ShardByStream>(out_shard, out_tv, stream_index);
+    auto* copy = IrBuilder::create<LoadStoreOp>(
+        LoadStoreOpType::Set, out_shard, in_shard);
+
+    for_loop->body().pushBack(in_shard->definition());
+    for_loop->body().pushBack(out_shard->definition());
+    for_loop->body().pushBack(copy);
+
+    hic->pushBackTopLevelExprs(allocate_out);
+    hic->pushBackTopLevelExprs(for_loop);
+  }
+
+  HostIrEvaluator hie(std::move(hic));
+  KernelArgumentHolder ins(in_tensor);
+  ins.setCacheId(0);
+  KernelArgumentHolder outs = hie.runWithInputs(ins);
+  auto out_tensor = outs[0].as<at::Tensor>();
+  auto expected_out_tensor = in_tensor;
+  expected_out_tensor.chunk(c, 0)[0].zero_();
+  EXPECT_TRUE(at::allclose(out_tensor, expected_out_tensor));
+}
+
 } // namespace nvfuser::hir

From d62548409ab032ffb87f3dd7ff533e7790a9c4e2 Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Wed, 28 Jan 2026 14:01:41 -0800
Subject: [PATCH 02/10] swizzle version with parallel type

---
 csrc/host_ir/evaluator.cpp             | 13 ++++-
 csrc/host_ir/ir.cpp                    | 11 ++--
 csrc/host_ir/ir.h                      |  7 +--
 csrc/host_ir/ops.cpp                   | 23 ++------
 csrc/host_ir/ops.h                     |  4 +-
 tests/cpp/test_host_ir_evaluator.cpp   | 68 ----------------------
 tests/cpp/test_multidevice_host_ir.cpp | 78 ++++++++++++++++++++++++++
 7 files changed, 104 insertions(+), 100 deletions(-)

diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
index fee623992f8..0ea068c751a 100644
--- a/csrc/host_ir/evaluator.cpp
+++ b/csrc/host_ir/evaluator.cpp
@@ -811,8 +811,17 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   if (stream_id->definition() != nullptr) {
     NVF_CHECK(stream_id->definition()->isA<Swizzle>());
     auto* swizzle = stream_id->definition()->as<Swizzle>();
-    int64_t offset = swizzle->offset()->evaluate().as<int64_t>();
-    index += offset;
+    ParallelType pt = swizzle->pt();
+
+    auto mesh = out_tv->getDeviceMesh();
+    // Find the index of the current device in the slice of mesh corresponding
+    // to the parallel type
+    auto team_size = mesh.size(pt);
+    at::Tensor md_index =
+        mesh.multiDimensionalIndexOf(communicator_->deviceId());
+    auto pt_axis = mesh.parallelTypeToAxis(pt);
+    int64_t team_index = md_index[pt_axis].item<int64_t>();
+    index = (index + team_index) % team_size;
   }
 
   at::Tensor out_tensor =
diff --git a/csrc/host_ir/ir.cpp b/csrc/host_ir/ir.cpp
index b29df6248f9..19a709d1d70 100644
--- a/csrc/host_ir/ir.cpp
+++ b/csrc/host_ir/ir.cpp
@@ -507,8 +507,8 @@ Swizzle::Swizzle(
     IrBuilderPasskey passkey,
     IterDomain* in,
     IterDomain* out,
-    Val* offset)
-    : Expr(passkey, {in}, {out}, {offset}) {
+    ParallelType pt)
+    : Expr(passkey, {in}, {out}, {}) {
   NVF_ERROR(passkey.ir_container_ != nullptr);
   NVF_ERROR(
       passkey.ir_container_->isA<HostIrContainer>(),
@@ -516,7 +516,7 @@ Swizzle::Swizzle(
       "must be registered in a HostIrContainer");
   NVF_ERROR(in != nullptr);
   NVF_ERROR(out != nullptr);
-  NVF_ERROR(offset != nullptr);
+  addDataAttribute(pt);
 }
 
 NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle)
@@ -524,15 +524,14 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle)
 std::string Swizzle::toString(int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << out()->toString() << " = Swizzle("
-                          << in()->toString()
-                          << ", offset=" << offset()->toString() << std::endl;
+                          << in()->toString() << ", pt=" << pt() << std::endl;
   return ss.str();
 }
 
 std::string Swizzle::toInlineString(int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << "Swizzle(" << in()->toInlineString()
-                          << ", offset=" << offset()->toInlineString() << ")";
+                          << ", pt=" << pt() << ")";
   return ss.str();
 }
diff --git a/csrc/host_ir/ir.h b/csrc/host_ir/ir.h
index 7c2584c57c4..9b75de0372e 100644
--- a/csrc/host_ir/ir.h
+++ b/csrc/host_ir/ir.h
@@ -577,7 +577,7 @@ class Swizzle : public Expr {
       IrBuilderPasskey passkey,
       IterDomain* in,
      IterDomain* out,
-      Val* offset);
+      ParallelType pt);
 
   Swizzle(const Swizzle& other) = delete;
   Swizzle& operator=(const Swizzle& other) = delete;
@@ -602,9 +602,8 @@ class Swizzle : public Expr {
     return outputs().at(0)->as<IterDomain>();
   }
 
-  // Swizzle offset parameter
-  Val* offset() const {
-    return attributeVal(0);
+  ParallelType pt() const {
+    return attribute<ParallelType>(0);
   }
 };
diff --git a/csrc/host_ir/ops.cpp b/csrc/host_ir/ops.cpp
index d862822cd97..00532eec851 100644
--- a/csrc/host_ir/ops.cpp
+++ b/csrc/host_ir/ops.cpp
@@ -24,27 +24,16 @@
 
 namespace nvfuser::hir {
 
-IterDomain* swizzle(IterDomain* in, Val* offset) {
-  NVF_ERROR(in != nullptr, "Input IterDomain cannot be null");
-  NVF_ERROR(offset != nullptr, "Swizzle offset parameter cannot be null");
-
-  // Create output IterDomain with same properties as input
-  auto* out = IterDomainBuilder(in).build();
-
-  // Create the Swizzle expression
-  IrBuilder::create<Swizzle>(in, out, offset);
-
-  return out;
-}
-
-TensorView* swizzle(TensorView* in, int64_t axis, Val* offset) {
+TensorView* swizzle(TensorView* in, int64_t axis, ParallelType pt) {
   NVF_ERROR(in != nullptr);
-  NVF_ERROR(offset != nullptr);
 
-  IterDomain* out_id = swizzle(in->axis(axis), offset);
+  IterDomain* swizzle_in = in->axis(axis);
+  IterDomain* swizzle_out = IterDomainBuilder(swizzle_in).build();
+  IrBuilder::create<Swizzle>(swizzle_in, swizzle_out, pt);
+
   std::vector<IterDomain*> loop_domain = in->getLoopDomain();
   loop_domain.erase(loop_domain.begin() + axis);
-  loop_domain.insert(loop_domain.begin() + axis, out_id);
+  loop_domain.insert(loop_domain.begin() + axis, swizzle_out);
   in->setLoopDomain(loop_domain);
 
   return in;
diff --git a/csrc/host_ir/ops.h b/csrc/host_ir/ops.h
index 9a3257874e2..765e58d4e4b 100644
--- a/csrc/host_ir/ops.h
+++ b/csrc/host_ir/ops.h
@@ -20,9 +20,7 @@
 
 namespace nvfuser::hir {
 
-IterDomain* swizzle(IterDomain* in, Val* offset);
-
-TensorView* swizzle(TensorView* in, int64_t axis, Val* offset);
+TensorView* swizzle(TensorView* in, int64_t axis, ParallelType pt);
 
 // Creates a ShardByStream without needing the destination TensorView. Returns
 // the destination TensorView. `e` is the Expr from which we propagate the loop
diff --git a/tests/cpp/test_host_ir_evaluator.cpp b/tests/cpp/test_host_ir_evaluator.cpp
index 2be4c60d3ed..19846d4c403 100644
--- a/tests/cpp/test_host_ir_evaluator.cpp
+++ b/tests/cpp/test_host_ir_evaluator.cpp
@@ -223,72 +223,4 @@ TEST_F(HostIrEvaluatorTest, AddInLoop) {
       << out_tensor << " vs " << expected_out_tensor;
 }
 
-TEST_F(HostIrEvaluatorTest, SwizzleCopy) {
-  constexpr int64_t c = 3;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
-  at::Tensor in_tensor = at::randn({c * 5}, options);
-
-  auto hic = std::make_unique<HostIrContainer>();
-  FusionGuard fg(hic.get());
-  {
-    TensorView* in_tv = makeContigTensor(1);
-    TensorView* out_tv = set(in_tv);
-    hic->addInput(in_tv);
-    hic->addOutput(out_tv);
-
-    for (auto* tv : {in_tv, out_tv}) {
-      tv->setMemoryType(MemoryType::Global);
-      tv->outer_split(0, c);
-    }
-    auto* allocate_out = IrBuilder::create<kir::Allocate>(
-        out_tv, MemoryType::Global, std::vector<Val*>({}), /*zero_init=*/true);
-
-    Val* offset = IrBuilder::create<Val>(1, DataType::Index);
-    in_tv = swizzle(in_tv, 0, offset);
-    out_tv = swizzle(out_tv, 0, offset);
-    in_tv->axis(0)->parallelize(ParallelType::Stream);
-    out_tv->axis(0)->parallelize(ParallelType::Stream);
-
-    auto* stream_index = IrBuilder::create<Val>(DataType::Index);
-    auto* for_loop = IrBuilder::create<ForLoop>(
-        stream_index,
-        /*start=*/hic->zeroVal(DataType::Index),
-        /*stop=*/IrBuilder::create<Val>(c - 1, DataType::Index));
-
-    TensorView* in_shard =
-        ops::newValLike(in_tv, *in_tv->getDataType())->as<TensorView>();
-    TensorView* out_shard =
-        ops::newValLike(out_tv, *out_tv->getDataType())->as<TensorView>();
-
-    for (auto* tv : {in_shard, out_shard}) {
-      tv->outer_split(0, c);
-      tv = swizzle(tv, 0, offset);
-      tv->axis(0)->parallelize(ParallelType::Stream);
-      tv->setAllocationDomain(tv->getLoopDomain(), true);
-    }
-
-    IrBuilder::create<ShardByStream>(in_shard, in_tv, stream_index);
-    IrBuilder::create<ShardByStream>(out_shard, out_tv, stream_index);
-    auto* copy = IrBuilder::create<LoadStoreOp>(
-        LoadStoreOpType::Set, out_shard, in_shard);
-
-    for_loop->body().pushBack(in_shard->definition());
-    for_loop->body().pushBack(out_shard->definition());
-    for_loop->body().pushBack(copy);
-
-    hic->pushBackTopLevelExprs(allocate_out);
-    hic->pushBackTopLevelExprs(for_loop);
-  }
-
-  HostIrEvaluator hie(std::move(hic));
-  KernelArgumentHolder ins(in_tensor);
-  ins.setCacheId(0);
-  KernelArgumentHolder outs = hie.runWithInputs(ins);
-  auto out_tensor = outs[0].as<at::Tensor>();
-  auto expected_out_tensor = in_tensor;
-  expected_out_tensor.chunk(c, 0)[0].zero_();
-  EXPECT_TRUE(at::allclose(out_tensor, expected_out_tensor));
-}
-
 } // namespace nvfuser::hir
diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp
index 579f3d8f661..5da45d67446 100644
--- a/tests/cpp/test_multidevice_host_ir.cpp
+++ b/tests/cpp/test_multidevice_host_ir.cpp
@@ -10,6 +10,7 @@
 #include "fusion.h"
 #include "host_ir/container.h"
 #include "host_ir/evaluator.h"
+#include "host_ir/ops.h"
 #include "host_ir/pass/stream_parallel_type.h"
 #include "ir/all_nodes.h"
 #include "multidevice/symmetric_tensor.h"
@@ -507,6 +508,83 @@ TEST_F(MultiDeviceHostIrTest, SymmetricContiguousView) {
       << "Output tensor does not match expected values";
 }
 
+TEST_F(MultiDeviceTest, SwizzleWithParallelType) {
+  const int64_t d = communicator_->size();
+  const int64_t my_rank = communicator_->deviceId();
+  auto mesh = DeviceMesh::createForNumDevices(d);
+
+  auto hic = std::make_unique<HostIrContainer>();
+  FusionGuard fg(hic.get());
+  {
+    TensorView* in_tv = makeContigTensor(2);
+    TensorView* out_tv = set(in_tv);
+    hic->addInput(in_tv);
+    hic->addOutput(out_tv);
+
+    for (auto* tv : {in_tv, out_tv}) {
+      tv->setMemoryType(MemoryType::Global);
+      tv->setDeviceMesh(mesh);
+      tv->outer_split(1, d);
+      tv->axis(1)->parallelize(ParallelType::DIDx);
+      tv->setAllocationDomain(tv->getLoopDomain(), true);
+    }
+    auto* allocate_out = IrBuilder::create<kir::Allocate>(
+        out_tv, MemoryType::Global, std::vector<Val*>({}), /*zero_init=*/true);
+
+    for (auto* tv : {in_tv, out_tv}) {
+      tv->outer_split(0, d);
+      tv = hir::swizzle(tv, 0, ParallelType::DIDx);
+      tv->axis(0)->parallelize(ParallelType::Stream);
+    }
+
+    auto* stream_index = IrBuilder::create<Val>(DataType::Index);
+    auto* for_loop = IrBuilder::create<ForLoop>(
+        stream_index,
+        /*start=*/hic->zeroVal(DataType::Index),
+        /*stop=*/IrBuilder::create<Val>(d - 1, DataType::Index));
+
+    TensorView* in_shard =
+        ops::newValLike(in_tv, *in_tv->getDataType())->as<TensorView>();
+    TensorView* out_shard =
+        ops::newValLike(out_tv, *out_tv->getDataType())->as<TensorView>();
+
+    for (auto* tv : {in_shard, out_shard}) {
+      tv->setDeviceMesh(mesh);
+      tv->outer_split(1, d);
+      tv->axis(1)->parallelize(ParallelType::DIDx);
+      tv->outer_split(0, d);
+      tv = hir::swizzle(tv, 0, ParallelType::DIDx);
+      tv->axis(0)->parallelize(ParallelType::Stream);
+      tv->setAllocationDomain(tv->getLoopDomain(), true);
+    }
+
+    IrBuilder::create<ShardByStream>(in_shard, in_tv, stream_index);
+    IrBuilder::create<ShardByStream>(out_shard, out_tv, stream_index);
+    auto* copy = IrBuilder::create<LoadStoreOp>(
+        LoadStoreOpType::Set, out_shard, in_shard);
+
+    for_loop->body().pushBack(in_shard->definition());
+    for_loop->body().pushBack(out_shard->definition());
+    for_loop->body().pushBack(copy);
+
+    hic->pushBackTopLevelExprs(allocate_out);
+    hic->pushBackTopLevelExprs(for_loop);
+  }
+
+  HostIrEvaluator hie(std::move(hic));
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
+  at::Tensor unsharded_in = at::randn({d * 3, d * 5}, options);
+  at::Tensor sharded_in = shardTensor1D(unsharded_in, 1, mesh);
+
+  KernelArgumentHolder ins(sharded_in);
+  ins.setCacheId(0);
+  KernelArgumentHolder outs = hie.runWithInputs(ins);
+  at::Tensor out = outs[0].as<at::Tensor>();
+  at::Tensor expected_out = sharded_in;
+  expected_out.chunk(d, 0)[(my_rank + d - 1) % d].zero_();
+  EXPECT_TRUE(at::allclose(out, expected_out)) << out << " vs " << expected_out;
+}
+
 } // namespace hir
 } // namespace nvfuser

From 606c9a8537818538a5e042473fc71671b9467de1 Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Fri, 30 Jan 2026 13:24:02 -0800
Subject: [PATCH 03/10] add cyclic shift comment

---
 csrc/host_ir/evaluator.cpp | 8 +++++++-
 csrc/host_ir/ir.h          | 2 --
 2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
index 0ea068c751a..aea22620301 100644
--- a/csrc/host_ir/evaluator.cpp
+++ b/csrc/host_ir/evaluator.cpp
@@ -809,6 +809,11 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   auto index = expr_evaluator_.evaluate(shard->stream_index()).as<int64_t>();
 
   if (stream_id->definition() != nullptr) {
+    // If the stream axis is defined by a swizzle, the input to
+    // the swizzle is the index into the `in_tensor`.
+    // Currently, we use cyclic shift swizzle to compute the index:
+    // in_index = (out_index (stream index) + device_id) % num_devices
+
     NVF_CHECK(stream_id->definition()->isA<Swizzle>());
     auto* swizzle = stream_id->definition()->as<Swizzle>();
     ParallelType pt = swizzle->pt();
@@ -827,7 +832,8 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   at::Tensor out_tensor =
       in_tensor
           .chunk(
-              expr_evaluator_.evaluate(stream_id->extent()).as<int64_t>(),
+              stream_id->extent()->evaluate().as<int64_t>(),
+              index,
               getShardedLogicalAxis(out_tv, ParallelType::Stream))
           .at(index);
 
diff --git a/csrc/host_ir/ir.h b/csrc/host_ir/ir.h
index 9b75de0372e..61c2c962432 100644
--- a/csrc/host_ir/ir.h
+++ b/csrc/host_ir/ir.h
@@ -592,12 +592,10 @@ class Swizzle : public Expr {
     return "hir::Swizzle";
   }
 
-  // Input iterdomain to be swizzled
   IterDomain* in() const {
     return inputs().at(0)->as<IterDomain>();
   }
 
-  // Output swizzled iterdomain
   IterDomain* out() const {
     return outputs().at(0)->as<IterDomain>();
   }

From 41163693f496df1861fcbf5a02c8594bb0955774 Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Fri, 30 Jan 2026 13:25:37 -0800
Subject: [PATCH 04/10] unused import

---
 tests/cpp/test_host_ir_evaluator.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/cpp/test_host_ir_evaluator.cpp b/tests/cpp/test_host_ir_evaluator.cpp
index 19846d4c403..aa4f933f65e 100644
--- a/tests/cpp/test_host_ir_evaluator.cpp
+++ b/tests/cpp/test_host_ir_evaluator.cpp
@@ -16,7 +16,6 @@
 #include "fusion.h"
 #include "host_ir/container.h"
 #include "host_ir/evaluator.h"
-#include "host_ir/ops.h"
 #include "ir/builder.h"
 #include "ir/interface_nodes.h"
 #include "ops/alias.h"

From fc87e4970bcd2ac45379b83c5a9f0be4e8b20f33 Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Mon, 2 Feb 2026 18:49:32 -0800
Subject: [PATCH 05/10] move swizzle to ir_nodes

---
 csrc/host_ir/evaluator.cpp             | 12 +++++----
 csrc/host_ir/ir.cpp                    | 32 -----------------------
 csrc/host_ir/ir.h                      | 36 --------------------------
 csrc/host_ir/ops.cpp                   | 15 -----------
 csrc/ir/interface_nodes.h              |  3 +++
 csrc/ir/internal_base_nodes.cpp        | 16 ++++++++++++
 csrc/ir/internal_base_nodes.h          |  4 +++
 csrc/ir/internal_nodes.cpp             | 25 ++++++++++++++++++
 csrc/ir/internal_nodes.h               | 32 +++++++++++++++++++++++
 csrc/multidevice/utils.cpp             |  2 +-
 csrc/tensor_view.cpp                   |  5 ++++
 tests/cpp/test_multidevice_host_ir.cpp |  4 +--
 12 files changed, 95 insertions(+), 91 deletions(-)

diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
index aea22620301..bdde8d54fdc 100644
--- a/csrc/host_ir/evaluator.cpp
+++ b/csrc/host_ir/evaluator.cpp
@@ -813,10 +813,13 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
     // the swizzle is the index into the `in_tensor`.
     // Currently, we use cyclic shift swizzle to compute the index:
     // in_index = (out_index (stream index) + device_id) % num_devices
-
-    NVF_CHECK(stream_id->definition()->isA<Swizzle>());
-    auto* swizzle = stream_id->definition()->as<Swizzle>();
-    ParallelType pt = swizzle->pt();
+    // TODO(prmishra): In the future, the swizzle compute should be done outside
+    // of `shardByStream` such that `add` and `mod` are in the HostIrContainer
+    // similar to
+    // https://github.com/NVIDIA/Fuser/blob/0a6adb140d440cc1b6d5f21dfd05874f9699b2c6/csrc/swizzle.h#L26-L31.
+    NVF_CHECK(stream_id->definition()->isA<Swizzle1D>());
+    auto* swizzle = stream_id->definition()->as<Swizzle1D>();
+    ParallelType pt = swizzle->parallelType();
 
     auto mesh = out_tv->getDeviceMesh();
     // Find the index of the current device in the slice of mesh corresponding
@@ -833,7 +836,6 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   at::Tensor out_tensor =
       in_tensor
           .chunk(
               stream_id->extent()->evaluate().as<int64_t>(),
-              index,
               getShardedLogicalAxis(out_tv, ParallelType::Stream))
           .at(index);
 
diff --git a/csrc/host_ir/ir.cpp b/csrc/host_ir/ir.cpp
index 19a709d1d70..198601355fb 100644
--- a/csrc/host_ir/ir.cpp
+++ b/csrc/host_ir/ir.cpp
@@ -503,36 +503,4 @@ std::string ForLoop::toInlineString(int indent_size) const {
       index, iter_domain->start(), iter_domain->stop());
 }
 
-Swizzle::Swizzle(
-    IrBuilderPasskey passkey,
-    IterDomain* in,
-    IterDomain* out,
-    ParallelType pt)
-    : Expr(passkey, {in}, {out}, {}) {
-  NVF_ERROR(passkey.ir_container_ != nullptr);
-  NVF_ERROR(
-      passkey.ir_container_->isA<HostIrContainer>(),
-      this,
-      "must be registered in a HostIrContainer");
-  NVF_ERROR(in != nullptr);
-  NVF_ERROR(out != nullptr);
-  addDataAttribute(pt);
-}
-
-NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle)
-
-std::string Swizzle::toString(int indent_size) const {
-  std::stringstream ss;
-  indent(ss, indent_size) << out()->toString() << " = Swizzle("
-                          << in()->toString() << ", pt=" << pt() << std::endl;
-  return ss.str();
-}
-
-std::string Swizzle::toInlineString(int indent_size) const {
-  std::stringstream ss;
-  indent(ss, indent_size) << "Swizzle(" << in()->toInlineString()
-                          << ", pt=" << pt() << ")";
-  return ss.str();
-}
-
 } // namespace nvfuser::hir
diff --git a/csrc/host_ir/ir.h b/csrc/host_ir/ir.h
index 61c2c962432..d01263f2e0d 100644
--- a/csrc/host_ir/ir.h
+++ b/csrc/host_ir/ir.h
@@ -569,40 +569,4 @@ class ForLoop : public Expr {
   }
 };
 
-class Swizzle : public Expr {
- public:
-  using Expr::Expr;
-
-  Swizzle(
-      IrBuilderPasskey passkey,
-      IterDomain* in,
-      IterDomain* out,
-      ParallelType pt);
-
-  Swizzle(const Swizzle& other) = delete;
-  Swizzle& operator=(const Swizzle& other) = delete;
-  Swizzle(Swizzle&& other) = delete;
-  Swizzle& operator=(Swizzle&& other) = delete;
-
-  NVFUSER_DECLARE_CLONE_AND_CREATE
-
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
-  const char* getOpString() const override {
-    return "hir::Swizzle";
-  }
-
-  IterDomain* in() const {
-    return inputs().at(0)->as<IterDomain>();
-  }
-
-  IterDomain* out() const {
-    return outputs().at(0)->as<IterDomain>();
-  }
-
-  ParallelType pt() const {
-    return attribute<ParallelType>(0);
-  }
-};
-
 } // namespace nvfuser::hir
diff --git a/csrc/host_ir/ops.cpp b/csrc/host_ir/ops.cpp
index 00532eec851..05fd42e2764 100644
--- a/csrc/host_ir/ops.cpp
+++ b/csrc/host_ir/ops.cpp
@@ -24,21 +24,6 @@
 
 namespace nvfuser::hir {
 
-TensorView* swizzle(TensorView* in, int64_t axis, ParallelType pt) {
-  NVF_ERROR(in != nullptr);
-
-  IterDomain* swizzle_in = in->axis(axis);
-  IterDomain* swizzle_out = IterDomainBuilder(swizzle_in).build();
-  IrBuilder::create<Swizzle>(swizzle_in, swizzle_out, pt);
-
-  std::vector<IterDomain*> loop_domain = in->getLoopDomain();
-  loop_domain.erase(loop_domain.begin() + axis);
-  loop_domain.insert(loop_domain.begin() + axis, swizzle_out);
-  in->setLoopDomain(loop_domain);
-
-  return in;
-}
-
 TensorView* shardByStream(TensorView* source, Val* stream_index, Expr* e) {
   NVF_ERROR(
       getShardedIterDomain(
diff --git a/csrc/ir/interface_nodes.h b/csrc/ir/interface_nodes.h
index 1b338d84eaf..58f35cacd9f 100644
--- a/csrc/ir/interface_nodes.h
+++ b/csrc/ir/interface_nodes.h
@@ -646,6 +646,9 @@ class NVF_API TensorView : public Val {
   //! to the 2 given indices.
   TensorView* swizzle(SwizzleType swizzle_type, int64_t x, int64_t y);
 
+  //! Swizzle the iterdomain corresponding to the given index.
+  TensorView* swizzle1d(int64_t x, ParallelType pt);
+
   //! Resize an IterDomain by expanding both the left and right sides
   //! by given widths. The resulting IterDomain has an extent of
   //! (left_expansion + axis->extent() + right_expansion).
diff --git a/csrc/ir/internal_base_nodes.cpp b/csrc/ir/internal_base_nodes.cpp
index 77b84123ed1..f81fb34994c 100644
--- a/csrc/ir/internal_base_nodes.cpp
+++ b/csrc/ir/internal_base_nodes.cpp
@@ -576,6 +576,12 @@ std::pair<IterDomain*, IterDomain*> IterDomain::swizzle(
   return std::make_pair(out_x, out_y);
 }
 
+IterDomain* IterDomain::swizzle1d(IterDomain* in, ParallelType pt) {
+  IterDomain* out = IterDomainBuilder(in).build();
+  IrBuilder::createInContainer<Swizzle1D>(in->container(), out, in, pt);
+  return out;
+}
+
 IterDomain* IterDomain::resize(
     IterDomain* in,
     Val* left_expansion,
@@ -1856,6 +1862,16 @@ void TensorDomain::swizzle(
   loop_domain_.insert(loop_domain_.begin() + y, axis_out_y);
 }
 
+void TensorDomain::swizzle1d(int64_t x, ParallelType pt) {
+  x = wrapDim(x);
+
+  IterDomain* swizzle_in = axis(x);
+  IterDomain* swizzle_out = IterDomain::swizzle1d(swizzle_in, pt);
+
+  loop_domain_.erase(loop_domain_.begin() + x);
+  loop_domain_.insert(loop_domain_.begin() + x, swizzle_out);
+}
+
 void TensorDomain::resize(
     int64_t axis,
     Val* left_expansion,
diff --git a/csrc/ir/internal_base_nodes.h b/csrc/ir/internal_base_nodes.h
index 84505694ceb..238d4c22bf4 100644
--- a/csrc/ir/internal_base_nodes.h
+++ b/csrc/ir/internal_base_nodes.h
@@ -391,6 +391,8 @@ class NVF_API IterDomain : public Val {
       IterDomain* in_y,
       SwizzleMode swizzle_mode = SwizzleMode::Data);
 
+  static IterDomain* swizzle1d(IterDomain* in, ParallelType pt);
+
 protected:
   friend TensorDomain;
   friend ReplayTransformations;
@@ -835,6 +837,8 @@ class NVF_API TensorDomain : public Val {
      int64_t y,
      SwizzleMode swizzle_mode = SwizzleMode::Data);
 
+  void swizzle1d(int64_t x, ParallelType pt);
+
   // Resize an axis by left_expansion and right_expansion
   void resize(
       int64_t axis,
diff --git a/csrc/ir/internal_nodes.cpp b/csrc/ir/internal_nodes.cpp
index 3e1e18d9589..b028651d039 100644
--- a/csrc/ir/internal_nodes.cpp
+++ b/csrc/ir/internal_nodes.cpp
@@ -2808,6 +2808,31 @@ std::string Swizzle2D::toInlineString(int indent_size) const {
 
 NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle2D)
 
+Swizzle1D::Swizzle1D(
+    IrBuilderPasskey passkey,
+    IterDomain* out,
+    IterDomain* in,
+    ParallelType pt)
+    : Expr(passkey) {
+  addOutput(out);
+  addInput(in);
+  addDataAttribute(pt);
+}
+
+NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle1D)
+
+std::string Swizzle1D::toString(int indent_size) const {
+  std::stringstream ss;
+  indent(ss, indent_size) << out()->toString() << " = Swizzle1D("
+                          << in()->toString()
+                          << ", parallelType=" << parallelType() << std::endl;
+  return ss.str();
+}
+
+std::string Swizzle1D::toInlineString(int indent_size) const {
+  NVF_THROW("Swizzle1D can not be printed inline");
+}
+
 Resize::Resize(
     IrBuilderPasskey passkey,
     IterDomain* out,
diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index f0eafd35000..617cf30879c 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -2089,6 +2089,38 @@ class Swizzle2D : public Expr {
   }
 };
 
+class Swizzle1D : public Expr {
+ public:
+  using Expr::Expr;
+
+  Swizzle1D(
+      IrBuilderPasskey passkey,
+      IterDomain* out,
+      IterDomain* in,
+      ParallelType pt);
+
+  NVFUSER_DECLARE_CLONE_AND_CREATE
+
+  const char* getOpString() const override {
+    return "Swizzle1D";
+  }
+
+  std::string toString(int indent_size = 0) const override;
+  std::string toInlineString(int indent_size = 0) const override;
+
+  IterDomain* in() const {
+    return inputs().at(0)->as<IterDomain>();
+  }
+
+  IterDomain* out() const {
+    return outputs().at(0)->as<IterDomain>();
+  }
+
+  ParallelType parallelType() const {
+    return attribute<ParallelType>(0);
+  }
+};
+
 //! IterDomain expression to resize
 class Resize : public Expr {
  public:
diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp
index 6fc9933226d..0581955639d 100644
--- a/csrc/multidevice/utils.cpp
+++ b/csrc/multidevice/utils.cpp
@@ -179,7 +179,7 @@ int64_t getProducingLogicalAxis(const TensorView* tv, IterDomain* id) {
     // When `unshardedSizes` is given a local tensor of shape [1, 1], it's
     // unclear the global shape is [1, D] or [D, 1] or even [2, D/2], etc.
     id = merge->outer();
-  } else if (auto* swizzle = dynamic_cast<hir::Swizzle*>(def)) {
+  } else if (auto* swizzle = dynamic_cast<Swizzle1D*>(def)) {
     id = swizzle->in();
   } else {
     NVF_THROW(
diff --git a/csrc/tensor_view.cpp b/csrc/tensor_view.cpp
index 6de9a30d84e..51e63a268a5 100644
--- a/csrc/tensor_view.cpp
+++ b/csrc/tensor_view.cpp
@@ -769,6 +769,11 @@ TensorView* TensorView::swizzle(
   return this;
 }
 
+TensorView* TensorView::swizzle1d(int64_t x, ParallelType pt) {
+  domain()->swizzle1d(x, pt);
+  return this;
+}
+
 TensorView* TensorView::rFactor(const std::vector<int64_t>& axes) {
   NVF_ERROR(
       !container()->isA<kir::Kernel>(),
diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp
index 5da45d67446..a669c806004 100644
--- a/tests/cpp/test_multidevice_host_ir.cpp
+++ b/tests/cpp/test_multidevice_host_ir.cpp
@@ -533,7 +533,7 @@ TEST_F(MultiDeviceTest, SwizzleWithParallelType) {
 
     for (auto* tv : {in_tv, out_tv}) {
       tv->outer_split(0, d);
-      tv = hir::swizzle(tv, 0, ParallelType::DIDx);
+      tv->swizzle1d(0, ParallelType::DIDx);
       tv->axis(0)->parallelize(ParallelType::Stream);
     }
 
@@ -553,7 +553,7 @@ TEST_F(MultiDeviceTest, SwizzleWithParallelType) {
       tv->outer_split(1, d);
      tv->axis(1)->parallelize(ParallelType::DIDx);
       tv->outer_split(0, d);
-      tv = hir::swizzle(tv, 0, ParallelType::DIDx);
+      tv->swizzle1d(0, ParallelType::DIDx);
       tv->axis(0)->parallelize(ParallelType::Stream);
       tv->setAllocationDomain(tv->getLoopDomain(), true);
     }

From 1557cd02f72a37f7239ef4ef5d2a026a90e0b880 Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Mon, 2 Feb 2026 18:52:22 -0800
Subject: [PATCH 06/10] unused imports

---
 csrc/host_ir/ops.h                     | 2 --
 csrc/multidevice/utils.cpp             | 1 -
 tests/cpp/test_multidevice_host_ir.cpp | 1 -
 3 files changed, 4 deletions(-)

diff --git a/csrc/host_ir/ops.h b/csrc/host_ir/ops.h
index 765e58d4e4b..66c90082427 100644
--- a/csrc/host_ir/ops.h
+++ b/csrc/host_ir/ops.h
@@ -20,8 +20,6 @@
 
 namespace nvfuser::hir {
 
-TensorView* swizzle(TensorView* in, int64_t axis, ParallelType pt);
-
 // Creates a ShardByStream without needing the destination TensorView. Returns
 // the destination TensorView. `e` is the Expr from which we propagate the loop
 // domain from. `source` must be either an input or an output of `e`. The
diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp
index 0581955639d..beb7283c5a1 100644
--- a/csrc/multidevice/utils.cpp
+++ b/csrc/multidevice/utils.cpp
@@ -14,7 +14,6 @@
 #include
 
 #include "compute_at_map.h"
-#include "host_ir/ir.h"
 #include "ir/internal_base_nodes.h"
 #include "ir/internal_nodes.h"
 #include "transform_replay.h"
diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp
index a669c806004..a14b225c2f2 100644
--- a/tests/cpp/test_multidevice_host_ir.cpp
+++ b/tests/cpp/test_multidevice_host_ir.cpp
@@ -10,7 +10,6 @@
 #include "fusion.h"
 #include "host_ir/container.h"
 #include "host_ir/evaluator.h"
-#include "host_ir/ops.h"
 #include "host_ir/pass/stream_parallel_type.h"
 #include "ir/all_nodes.h"
 #include "multidevice/symmetric_tensor.h"

From c99f003a0d3777ce1ad392b09fdc5a577f435b1a Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Mon, 2 Feb 2026 19:02:09 -0800
Subject: [PATCH 07/10] comments

---
 csrc/ir/interface_nodes.h | 5 ++++-
 csrc/ir/internal_nodes.h  | 6 ++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/csrc/ir/interface_nodes.h b/csrc/ir/interface_nodes.h
index fd9f0de5286..017c0a7db8e 100644
--- a/csrc/ir/interface_nodes.h
+++ b/csrc/ir/interface_nodes.h
@@ -646,7 +646,10 @@ class NVF_API TensorView : public Val {
   //! to the 2 given indices.
   TensorView* swizzle(SwizzleType swizzle_type, int64_t x, int64_t y);
 
-  //! Swizzle the iterdomain corresponding to the given index.
+  //! Swizzle1D is currently only used and handled in HostIr
+  //! It computes the `in` id to the swizzle as a function of the device id
+  //! (corresponding to the parallel type) and `out` id. See
+  //! `HostIrEvaluator::handle(ShardByStream)` for usage.
   TensorView* swizzle1d(int64_t x, ParallelType pt);
 
   //! Resize an IterDomain by expanding both the left and right sides
diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index 9eff92b7417..a3ef4782df8 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -1995,8 +1995,10 @@ class Swizzle : public Expr {
   }
 };
 
-//! Applies 2D swizzles on a rectangular tile defined by 2 iterdomains.
-
+// Swizzle1D is currently only used and handled in HostIr.
+// The main use case is to compute the indexing for ring-based overlap, where
+// `out` is stream-parallel and `in` is a function of the device id and stream
+// index. See `HostIrEvaluator::handle(ShardByStream)` for usage.
 class Swizzle1D : public Expr {
  public:
   using Expr::Expr;

From a4d7b8e83d20be341a93e17eaf650dc74d4ff38f Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Mon, 2 Feb 2026 19:04:14 -0800
Subject: [PATCH 08/10] validate parallel type

---
 csrc/tensor_view.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/csrc/tensor_view.cpp b/csrc/tensor_view.cpp
index 150b947687c..a25be606b98 100644
--- a/csrc/tensor_view.cpp
+++ b/csrc/tensor_view.cpp
@@ -769,6 +769,10 @@ TensorView* TensorView::swizzle(
 }
 
 TensorView* TensorView::swizzle1d(int64_t x, ParallelType pt) {
+  NVF_CHECK(
+      deviceParallelTypes().contains(pt),
+      "Swizzle1D only supports device parallel types, given: ",
+      pt);
   domain()->swizzle1d(x, pt);
   return this;
 }

From 102a5f46ab551a195856ccd3850cf5c5ab93c75a Mon Sep 17 00:00:00 2001
From: Priya Mishra <52657555+Priya2698@users.noreply.github.com>
Date: Mon, 2 Feb 2026 19:14:37 -0800
Subject: [PATCH 09/10] Update csrc/ir/internal_nodes.cpp

Co-authored-by: greptile-apps[bot]
 <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 csrc/ir/internal_nodes.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/ir/internal_nodes.cpp b/csrc/ir/internal_nodes.cpp
index ff57913c071..407771386c6 100644
--- a/csrc/ir/internal_nodes.cpp
+++ b/csrc/ir/internal_nodes.cpp
@@ -2788,7 +2788,7 @@ std::string Swizzle1D::toString(int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << out()->toString() << " = Swizzle1D("
                           << in()->toString()
-                          << ", parallelType=" << parallelType() << std::endl;
+                          << ", parallelType=" << parallelType() << ")" << std::endl;
   return ss.str();
 }

From b543dec2f07f8cc697af6adbebdcd66266e0b19d Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Mon, 2 Feb 2026 21:55:17 -0800
Subject: [PATCH 10/10] lintrunner, condition error

---
 csrc/host_ir/evaluator.cpp | 3 +--
 csrc/ir/internal_nodes.cpp | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
index bdde8d54fdc..2396767d5b0 100644
--- a/csrc/host_ir/evaluator.cpp
+++ b/csrc/host_ir/evaluator.cpp
@@ -808,7 +808,7 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   auto in_tensor = getKnownConcreteValue(shard->in()).as<at::Tensor>();
   auto index = expr_evaluator_.evaluate(shard->stream_index()).as<int64_t>();
 
-  if (stream_id->definition() != nullptr) {
+  if (stream_id->definition()->isA<Swizzle1D>()) {
     // If the stream axis is defined by a swizzle, the input to
     // the swizzle is the index into the `in_tensor`.
     // Currently, we use cyclic shift swizzle to compute the index:
@@ -817,7 +817,6 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
     // of `shardByStream` such that `add` and `mod` are in the HostIrContainer
     // similar to
     // https://github.com/NVIDIA/Fuser/blob/0a6adb140d440cc1b6d5f21dfd05874f9699b2c6/csrc/swizzle.h#L26-L31.
-    NVF_CHECK(stream_id->definition()->isA<Swizzle1D>());
     auto* swizzle = stream_id->definition()->as<Swizzle1D>();
     ParallelType pt = swizzle->parallelType();
 
diff --git a/csrc/ir/internal_nodes.cpp b/csrc/ir/internal_nodes.cpp
index 407771386c6..0a3666bd6aa 100644
--- a/csrc/ir/internal_nodes.cpp
+++ b/csrc/ir/internal_nodes.cpp
@@ -2788,7 +2788,8 @@ std::string Swizzle1D::toString(int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << out()->toString() << " = Swizzle1D("
                           << in()->toString()
-                          << ", parallelType=" << parallelType() << ")" << std::endl;
+                          << ", parallelType=" << parallelType() << ")"
+                          << std::endl;
   return ss.str();
 }
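
---
Reviewer note (appended by the editor, not part of the patch series): the core
of these commits is the cyclic-shift indexing that
`HostIrEvaluator::handle(ShardByStream*)` performs when the Stream axis is
defined by a `Swizzle1D`: in_index = (stream_index + team_index) % team_size.
The sketch below is a standalone C++ illustration of that formula only;
`swizzledChunkIndex` and `kTeamSize` are hypothetical names, and the real
evaluator derives team_index from the device's position in the DeviceMesh
rather than from a plain device id.

    #include <cstdint>
    #include <cstdio>

    // Hypothetical helper, not an nvFuser API: the chunk read by device
    // `team_index` at stream-loop iteration `stream_index`.
    int64_t swizzledChunkIndex(
        int64_t stream_index, int64_t team_index, int64_t team_size) {
      return (stream_index + team_index) % team_size;
    }

    int main() {
      constexpr int64_t kTeamSize = 4; // e.g. a 4-device DIDx mesh axis
      for (int64_t stream_index = 0; stream_index < kTeamSize; ++stream_index) {
        for (int64_t team_index = 0; team_index < kTeamSize; ++team_index) {
          std::printf(
              "%lld ",
              static_cast<long long>(
                  swizzledChunkIndex(stream_index, team_index, kTeamSize)));
        }
        // Each printed row is a permutation of {0..3}: at every stream
        // iteration the devices touch pairwise-distinct chunks, which is the
        // access pattern needed for ring-based communication/compute overlap.
        std::printf("\n");
      }
      return 0;
    }

This is why the swizzle carries a ParallelType rather than a constant offset
after PATCH 02: the per-device shift falls out of the device's coordinate
along that mesh axis instead of being baked in by the scheduler.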