From cd122c8d063ecb19f8d32a5331312bba552f33dd Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Tue, 27 Jan 2026 20:53:37 -0800
Subject: [PATCH 01/10] working setup of a swizzled stream id

---
 csrc/host_ir/evaluator.cpp           | 15 ++++--
 csrc/host_ir/ir.cpp                  | 33 +++++++++++++
 csrc/host_ir/ir.h                    | 39 ++++++++++++++++
 csrc/host_ir/ops.cpp                 | 26 +++++++++++
 csrc/host_ir/ops.h                   |  4 ++
 csrc/multidevice/utils.cpp           |  3 ++
 tests/cpp/test_host_ir_evaluator.cpp | 69 ++++++++++++++++++++++++++++
 7 files changed, 185 insertions(+), 4 deletions(-)

diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
index 2ceedfddc40..fee623992f8 100644
--- a/csrc/host_ir/evaluator.cpp
+++ b/csrc/host_ir/evaluator.cpp
@@ -806,14 +806,21 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   IterDomain* stream_id = *i;
 
   auto in_tensor = getKnownConcreteValue(shard->in()).as<at::Tensor>();
-  auto stream_index =
-      expr_evaluator_.evaluate(shard->stream_index()).as<int64_t>();
+  auto index = expr_evaluator_.evaluate(shard->stream_index()).as<int64_t>();
+
+  if (stream_id->definition() != nullptr) {
+    NVF_CHECK(stream_id->definition()->isA<Swizzle>());
+    auto* swizzle = stream_id->definition()->as<Swizzle>();
+    int64_t offset = swizzle->offset()->evaluate().as<int64_t>();
+    index += offset;
+  }
+
   at::Tensor out_tensor =
       in_tensor
           .chunk(
-              stream_id->extent()->evaluate().as<int64_t>(),
+              expr_evaluator_.evaluate(stream_id->extent()).as<int64_t>(),
               getShardedLogicalAxis(out_tv, ParallelType::Stream))
-          .at(stream_index);
+          .at(index);
 
   expr_evaluator_.bind(out_tv, out_tensor);
 }
diff --git a/csrc/host_ir/ir.cpp b/csrc/host_ir/ir.cpp
index 198601355fb..b29df6248f9 100644
--- a/csrc/host_ir/ir.cpp
+++ b/csrc/host_ir/ir.cpp
@@ -503,4 +503,37 @@ std::string ForLoop::toInlineString(int indent_size) const {
       index, iter_domain->start(), iter_domain->stop());
 }
 
+Swizzle::Swizzle(
+    IrBuilderPasskey passkey,
+    IterDomain* in,
+    IterDomain* out,
+    Val* offset)
+    : Expr(passkey, {in}, {out}, {offset}) {
+  NVF_ERROR(passkey.ir_container_ != nullptr);
+  NVF_ERROR(
+      passkey.ir_container_->isA<HostIrContainer>(),
+      this,
+      "must be registered in a HostIrContainer");
+  NVF_ERROR(in != nullptr);
+  NVF_ERROR(out != nullptr);
+  NVF_ERROR(offset != nullptr);
+}
+
+NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle)
+
+std::string Swizzle::toString(int indent_size) const {
+  std::stringstream ss;
+  indent(ss, indent_size) << out()->toString() << " = Swizzle("
+                          << in()->toString()
+                          << ", offset=" << offset()->toString() << std::endl;
+  return ss.str();
+}
+
+std::string Swizzle::toInlineString(int indent_size) const {
+  std::stringstream ss;
+  indent(ss, indent_size) << "Swizzle(" << in()->toInlineString()
+                          << ", offset=" << offset()->toInlineString() << ")";
+  return ss.str();
+}
+
 } // namespace nvfuser::hir
diff --git a/csrc/host_ir/ir.h b/csrc/host_ir/ir.h
index d01263f2e0d..7c2584c57c4 100644
--- a/csrc/host_ir/ir.h
+++ b/csrc/host_ir/ir.h
@@ -569,4 +569,43 @@ class ForLoop : public Expr {
   }
 };
 
+class Swizzle : public Expr {
+ public:
+  using Expr::Expr;
+
+  Swizzle(
+      IrBuilderPasskey passkey,
+      IterDomain* in,
+      IterDomain* out,
+      Val* offset);
+
+  Swizzle(const Swizzle& other) = delete;
+  Swizzle& operator=(const Swizzle& other) = delete;
+  Swizzle(Swizzle&& other) = delete;
+  Swizzle& operator=(Swizzle&& other) = delete;
+
+  NVFUSER_DECLARE_CLONE_AND_CREATE
+
+  std::string toString(int indent_size = 0) const override;
+  std::string toInlineString(int indent_size = 0) const override;
+  const char* getOpString() const override {
+    return "hir::Swizzle";
+  }
+
+  // Input iterdomain to be swizzled
+  IterDomain* in() const {
+    return inputs().at(0)->as<IterDomain>();
+  }
+
+  // Output swizzled iterdomain
+  IterDomain* out() const {
+    return outputs().at(0)->as<IterDomain>();
+  }
+
+  // Swizzle offset parameter
+  Val* offset() const {
+    return attributeVal(0);
+  }
+};
+
 } // namespace nvfuser::hir
diff --git a/csrc/host_ir/ops.cpp b/csrc/host_ir/ops.cpp
index 05fd42e2764..d862822cd97 100644
--- a/csrc/host_ir/ops.cpp
+++ b/csrc/host_ir/ops.cpp
@@ -24,6 +24,32 @@
 
 namespace nvfuser::hir {
 
+IterDomain* swizzle(IterDomain* in, Val* offset) {
+  NVF_ERROR(in != nullptr, "Input IterDomain cannot be null");
+  NVF_ERROR(offset != nullptr, "Swizzle offset parameter cannot be null");
+
+  // Create output IterDomain with same properties as input
+  auto* out = IterDomainBuilder(in).build();
+
+  // Create the Swizzle expression
+  IrBuilder::create<Swizzle>(in, out, offset);
+
+  return out;
+}
+
+TensorView* swizzle(TensorView* in, int64_t axis, Val* offset) {
+  NVF_ERROR(in != nullptr);
+  NVF_ERROR(offset != nullptr);
+
+  IterDomain* out_id = swizzle(in->axis(axis), offset);
+  std::vector<IterDomain*> loop_domain = in->getLoopDomain();
+  loop_domain.erase(loop_domain.begin() + axis);
+  loop_domain.insert(loop_domain.begin() + axis, out_id);
+  in->setLoopDomain(loop_domain);
+
+  return in;
+}
+
 TensorView* shardByStream(TensorView* source, Val* stream_index, Expr* e) {
   NVF_ERROR(
       getShardedIterDomain(
diff --git a/csrc/host_ir/ops.h b/csrc/host_ir/ops.h
index 66c90082427..9a3257874e2 100644
--- a/csrc/host_ir/ops.h
+++ b/csrc/host_ir/ops.h
@@ -20,6 +20,10 @@
 
 namespace nvfuser::hir {
 
+IterDomain* swizzle(IterDomain* in, Val* offset);
+
+TensorView* swizzle(TensorView* in, int64_t axis, Val* offset);
+
 // Creates a ShardByStream without needing the destination TensorView. Returns
 // the destination TensorView. `e` is the Expr from which we propagate the loop
 // domain from. `source` must be either an input or an output of `e`. The
diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp
index 21d459889b8..6fc9933226d 100644
--- a/csrc/multidevice/utils.cpp
+++ b/csrc/multidevice/utils.cpp
@@ -14,6 +14,7 @@
 #include
 
 #include "compute_at_map.h"
+#include "host_ir/ir.h"
 #include "ir/internal_base_nodes.h"
 #include "ir/internal_nodes.h"
 #include "transform_replay.h"
@@ -178,6 +179,8 @@ int64_t getProducingLogicalAxis(const TensorView* tv, IterDomain* id) {
       // When `unshardedSizes` is given a local tensor of shape [1, 1], it's
       // unclear the global shape is [1, D] or [D, 1] or even [2, D/2], etc.
       id = merge->outer();
+    } else if (auto* swizzle = dynamic_cast<hir::Swizzle*>(def)) {
+      id = swizzle->in();
     } else {
       NVF_THROW(
           "Unexpected transforms from logical to a DID-parallel allocation "
diff --git a/tests/cpp/test_host_ir_evaluator.cpp b/tests/cpp/test_host_ir_evaluator.cpp
index aa4f933f65e..2be4c60d3ed 100644
--- a/tests/cpp/test_host_ir_evaluator.cpp
+++ b/tests/cpp/test_host_ir_evaluator.cpp
@@ -16,6 +16,7 @@
 #include "fusion.h"
 #include "host_ir/container.h"
 #include "host_ir/evaluator.h"
+#include "host_ir/ops.h"
 #include "ir/builder.h"
 #include "ir/interface_nodes.h"
 #include "ops/alias.h"
@@ -222,4 +223,72 @@ TEST_F(HostIrEvaluatorTest, AddInLoop) {
       << out_tensor << " vs " << expected_out_tensor;
 }
 
+TEST_F(HostIrEvaluatorTest, SwizzleCopy) {
+  constexpr int64_t c = 3;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
+  at::Tensor in_tensor = at::randn({c * 5}, options);
+
+  auto hic = std::make_unique<HostIrContainer>();
+  FusionGuard fg(hic.get());
+  {
+    TensorView* in_tv = makeContigTensor(1);
+    TensorView* out_tv = set(in_tv);
+    hic->addInput(in_tv);
+    hic->addOutput(out_tv);
+
+    for (auto* tv : {in_tv, out_tv}) {
+      tv->setMemoryType(MemoryType::Global);
+      tv->outer_split(0, c);
+    }
+    auto* allocate_out = IrBuilder::create<kir::Allocate>(
+        out_tv, MemoryType::Global, std::vector<Val*>({}), /*zero_init=*/true);
+
+    Val* offset = IrBuilder::create<Val>(1, DataType::Index);
+    in_tv = swizzle(in_tv, 0, offset);
+    out_tv = swizzle(out_tv, 0, offset);
+    in_tv->axis(0)->parallelize(ParallelType::Stream);
+    out_tv->axis(0)->parallelize(ParallelType::Stream);
+
+    auto* stream_index = IrBuilder::create<Val>(DataType::Index);
+    auto* for_loop = IrBuilder::create<ForLoop>(
+        stream_index,
+        /*start=*/hic->zeroVal(DataType::Index),
+        /*stop=*/IrBuilder::create<Val>(c - 1, DataType::Index));
+
+    TensorView* in_shard =
+        ops::newValLike(in_tv, *in_tv->getDataType())->as<TensorView>();
+    TensorView* out_shard =
+        ops::newValLike(out_tv, *out_tv->getDataType())->as<TensorView>();
+
+    for (auto* tv : {in_shard, out_shard}) {
+      tv->outer_split(0, c);
+      tv = swizzle(tv, 0, offset);
+      tv->axis(0)->parallelize(ParallelType::Stream);
+      tv->setAllocationDomain(tv->getLoopDomain(), true);
+    }
+
+    IrBuilder::create<ShardByStream>(in_shard, in_tv, stream_index);
+    IrBuilder::create<ShardByStream>(out_shard, out_tv, stream_index);
+    auto* copy = IrBuilder::create<LoadStoreOp>(
+        LoadStoreOpType::Set, out_shard, in_shard);
+
+    for_loop->body().pushBack(in_shard->definition());
+    for_loop->body().pushBack(out_shard->definition());
+    for_loop->body().pushBack(copy);
+
+    hic->pushBackTopLevelExprs(allocate_out);
+    hic->pushBackTopLevelExprs(for_loop);
+  }
+
+  HostIrEvaluator hie(std::move(hic));
+  KernelArgumentHolder ins(in_tensor);
+  ins.setCacheId(0);
+  KernelArgumentHolder outs = hie.runWithInputs(ins);
+  auto out_tensor = outs[0].as<at::Tensor>();
+  auto expected_out_tensor = in_tensor;
+  expected_out_tensor.chunk(c, 0)[0].zero_();
+  EXPECT_TRUE(at::allclose(out_tensor, expected_out_tensor));
+}
+
 } // namespace nvfuser::hir

From d62548409ab032ffb87f3dd7ff533e7790a9c4e2 Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Wed, 28 Jan 2026 14:01:41 -0800
Subject: [PATCH 02/10] swizzle version with parallel type

---
 csrc/host_ir/evaluator.cpp             | 13 ++++-
 csrc/host_ir/ir.cpp                    | 11 ++--
 csrc/host_ir/ir.h                      |  7 +--
 csrc/host_ir/ops.cpp                   | 23 ++------
 csrc/host_ir/ops.h                     |  4 +-
 tests/cpp/test_host_ir_evaluator.cpp   | 68 ----------------------
 tests/cpp/test_multidevice_host_ir.cpp | 78 ++++++++++++++++++++++++++
 7 files changed, 104 insertions(+), 100 deletions(-)

diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
index fee623992f8..0ea068c751a 100644
--- a/csrc/host_ir/evaluator.cpp
+++ b/csrc/host_ir/evaluator.cpp
@@ -811,8 +811,17 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   if (stream_id->definition() != nullptr) {
     NVF_CHECK(stream_id->definition()->isA<Swizzle>());
     auto* swizzle = stream_id->definition()->as<Swizzle>();
-    int64_t offset = swizzle->offset()->evaluate().as<int64_t>();
-    index += offset;
+    ParallelType pt = swizzle->pt();
+
+    auto mesh = out_tv->getDeviceMesh();
+    // Find the index of the current device in the slice of mesh corresponding
+    // to the parallel type
+    auto team_size = mesh.size(pt);
+    at::Tensor md_index =
+        mesh.multiDimensionalIndexOf(communicator_->deviceId());
+    auto pt_axis = mesh.parallelTypeToAxis(pt);
+    int64_t team_index = md_index[pt_axis].item<int64_t>();
+    index = (index + team_index) % team_size;
   }
 
   at::Tensor out_tensor =
diff --git a/csrc/host_ir/ir.cpp b/csrc/host_ir/ir.cpp
index b29df6248f9..19a709d1d70 100644
--- a/csrc/host_ir/ir.cpp
+++ b/csrc/host_ir/ir.cpp
@@ -507,8 +507,8 @@ Swizzle::Swizzle(
     IrBuilderPasskey passkey,
     IterDomain* in,
     IterDomain* out,
-    Val* offset)
-    : Expr(passkey, {in}, {out}, {offset}) {
+    ParallelType pt)
+    : Expr(passkey, {in}, {out}, {}) {
   NVF_ERROR(passkey.ir_container_ != nullptr);
   NVF_ERROR(
       passkey.ir_container_->isA<HostIrContainer>(),
@@ -516,7 +516,7 @@ Swizzle::Swizzle(
       "must be registered in a HostIrContainer");
   NVF_ERROR(in != nullptr);
   NVF_ERROR(out != nullptr);
-  NVF_ERROR(offset != nullptr);
+  addDataAttribute(pt);
 }
 
 NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle)
@@ -524,15 +524,14 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle)
 std::string Swizzle::toString(int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << out()->toString() << " = Swizzle("
-                          << in()->toString()
-                          << ", offset=" << offset()->toString() << std::endl;
+                          << in()->toString() << ", pt=" << pt() << std::endl;
   return ss.str();
 }
 
 std::string Swizzle::toInlineString(int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << "Swizzle(" << in()->toInlineString()
-                          << ", offset=" << offset()->toInlineString() << ")";
+                          << ", pt=" << pt() << ")";
   return ss.str();
 }
diff --git a/csrc/host_ir/ir.h b/csrc/host_ir/ir.h
index 7c2584c57c4..9b75de0372e 100644
--- a/csrc/host_ir/ir.h
+++ b/csrc/host_ir/ir.h
@@ -577,7 +577,7 @@ class Swizzle : public Expr {
       IrBuilderPasskey passkey,
       IterDomain* in,
      IterDomain* out,
-      Val* offset);
+      ParallelType pt);
 
   Swizzle(const Swizzle& other) = delete;
   Swizzle& operator=(const Swizzle& other) = delete;
@@ -602,9 +602,8 @@ class Swizzle : public Expr {
     return outputs().at(0)->as<IterDomain>();
   }
 
-  // Swizzle offset parameter
-  Val* offset() const {
-    return attributeVal(0);
+  ParallelType pt() const {
+    return attribute<ParallelType>(0);
   }
 };
diff --git a/csrc/host_ir/ops.cpp b/csrc/host_ir/ops.cpp
index d862822cd97..00532eec851 100644
--- a/csrc/host_ir/ops.cpp
+++ b/csrc/host_ir/ops.cpp
@@ -24,27 +24,16 @@
 
 namespace nvfuser::hir {
 
-IterDomain* swizzle(IterDomain* in, Val* offset) {
-  NVF_ERROR(in != nullptr, "Input IterDomain cannot be null");
-  NVF_ERROR(offset != nullptr, "Swizzle offset parameter cannot be null");
-
-  // Create output IterDomain with same properties as input
-  auto* out = IterDomainBuilder(in).build();
-
-  // Create the Swizzle expression
-  IrBuilder::create<Swizzle>(in, out, offset);
-
-  return out;
-}
-
-TensorView* swizzle(TensorView* in, int64_t axis, Val* offset) {
+TensorView* swizzle(TensorView* in, int64_t axis, ParallelType pt) {
   NVF_ERROR(in != nullptr);
-  NVF_ERROR(offset != nullptr);
 
-  IterDomain* out_id = swizzle(in->axis(axis), offset);
+  IterDomain* swizzle_in = in->axis(axis);
+  IterDomain* swizzle_out = IterDomainBuilder(swizzle_in).build();
+  IrBuilder::create<Swizzle>(swizzle_in, swizzle_out, pt);
+
   std::vector<IterDomain*> loop_domain = in->getLoopDomain();
   loop_domain.erase(loop_domain.begin() + axis);
-  loop_domain.insert(loop_domain.begin() + axis, out_id);
+  loop_domain.insert(loop_domain.begin() + axis, swizzle_out);
   in->setLoopDomain(loop_domain);
 
   return in;
diff --git a/csrc/host_ir/ops.h b/csrc/host_ir/ops.h
index 9a3257874e2..765e58d4e4b 100644
--- a/csrc/host_ir/ops.h
+++ b/csrc/host_ir/ops.h
@@ -20,9 +20,7 @@
 
 namespace nvfuser::hir {
 
-IterDomain* swizzle(IterDomain* in, Val* offset);
-
-TensorView* swizzle(TensorView* in, int64_t axis, Val* offset);
+TensorView* swizzle(TensorView* in, int64_t axis, ParallelType pt);
 
 // Creates a ShardByStream without needing the destination TensorView. Returns
 // the destination TensorView. `e` is the Expr from which we propagate the loop
diff --git a/tests/cpp/test_host_ir_evaluator.cpp b/tests/cpp/test_host_ir_evaluator.cpp
index 2be4c60d3ed..19846d4c403 100644
--- a/tests/cpp/test_host_ir_evaluator.cpp
+++ b/tests/cpp/test_host_ir_evaluator.cpp
@@ -223,72 +223,4 @@ TEST_F(HostIrEvaluatorTest, AddInLoop) {
       << out_tensor << " vs " << expected_out_tensor;
 }
 
-TEST_F(HostIrEvaluatorTest, SwizzleCopy) {
-  constexpr int64_t c = 3;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
-  at::Tensor in_tensor = at::randn({c * 5}, options);
-
-  auto hic = std::make_unique<HostIrContainer>();
-  FusionGuard fg(hic.get());
-  {
-    TensorView* in_tv = makeContigTensor(1);
-    TensorView* out_tv = set(in_tv);
-    hic->addInput(in_tv);
-    hic->addOutput(out_tv);
-
-    for (auto* tv : {in_tv, out_tv}) {
-      tv->setMemoryType(MemoryType::Global);
-      tv->outer_split(0, c);
-    }
-    auto* allocate_out = IrBuilder::create<kir::Allocate>(
-        out_tv, MemoryType::Global, std::vector<Val*>({}), /*zero_init=*/true);
-
-    Val* offset = IrBuilder::create<Val>(1, DataType::Index);
-    in_tv = swizzle(in_tv, 0, offset);
-    out_tv = swizzle(out_tv, 0, offset);
-    in_tv->axis(0)->parallelize(ParallelType::Stream);
-    out_tv->axis(0)->parallelize(ParallelType::Stream);
-
-    auto* stream_index = IrBuilder::create<Val>(DataType::Index);
-    auto* for_loop = IrBuilder::create<ForLoop>(
-        stream_index,
-        /*start=*/hic->zeroVal(DataType::Index),
-        /*stop=*/IrBuilder::create<Val>(c - 1, DataType::Index));
-
-    TensorView* in_shard =
-        ops::newValLike(in_tv, *in_tv->getDataType())->as<TensorView>();
-    TensorView* out_shard =
-        ops::newValLike(out_tv, *out_tv->getDataType())->as<TensorView>();
-
-    for (auto* tv : {in_shard, out_shard}) {
-      tv->outer_split(0, c);
-      tv = swizzle(tv, 0, offset);
-      tv->axis(0)->parallelize(ParallelType::Stream);
-      tv->setAllocationDomain(tv->getLoopDomain(), true);
-    }
-
-    IrBuilder::create<ShardByStream>(in_shard, in_tv, stream_index);
-    IrBuilder::create<ShardByStream>(out_shard, out_tv, stream_index);
-    auto* copy = IrBuilder::create<LoadStoreOp>(
-        LoadStoreOpType::Set, out_shard, in_shard);
-
-    for_loop->body().pushBack(in_shard->definition());
-    for_loop->body().pushBack(out_shard->definition());
-    for_loop->body().pushBack(copy);
-
-    hic->pushBackTopLevelExprs(allocate_out);
-    hic->pushBackTopLevelExprs(for_loop);
-  }
-
-  HostIrEvaluator hie(std::move(hic));
-  KernelArgumentHolder ins(in_tensor);
-  ins.setCacheId(0);
-  KernelArgumentHolder outs = hie.runWithInputs(ins);
-  auto out_tensor = outs[0].as<at::Tensor>();
-  auto expected_out_tensor = in_tensor;
-  expected_out_tensor.chunk(c, 0)[0].zero_();
-  EXPECT_TRUE(at::allclose(out_tensor, expected_out_tensor));
-}
-
 } // namespace nvfuser::hir
diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp
index 579f3d8f661..5da45d67446 100644
--- a/tests/cpp/test_multidevice_host_ir.cpp
+++ b/tests/cpp/test_multidevice_host_ir.cpp
@@ -10,6 +10,7 @@
 #include "fusion.h"
 #include "host_ir/container.h"
 #include "host_ir/evaluator.h"
+#include "host_ir/ops.h"
 #include "host_ir/pass/stream_parallel_type.h"
 #include "ir/all_nodes.h"
 #include "multidevice/symmetric_tensor.h"
@@ -507,6 +508,83 @@ TEST_F(MultiDeviceHostIrTest, SymmetricContiguousView) {
       << "Output tensor does not match expected values";
 }
 
+TEST_F(MultiDeviceTest, SwizzleWithParallelType) {
+  const int64_t d = communicator_->size();
+  const int64_t my_rank = communicator_->deviceId();
+  auto mesh = DeviceMesh::createForNumDevices(d);
+
+  auto hic = std::make_unique<HostIrContainer>();
+  FusionGuard fg(hic.get());
+  {
+    TensorView* in_tv = makeContigTensor(2);
+    TensorView* out_tv = set(in_tv);
+    hic->addInput(in_tv);
+    hic->addOutput(out_tv);
+
+    for (auto* tv : {in_tv, out_tv}) {
+      tv->setMemoryType(MemoryType::Global);
+      tv->setDeviceMesh(mesh);
+      tv->outer_split(1, d);
+      tv->axis(1)->parallelize(ParallelType::DIDx);
+      tv->setAllocationDomain(tv->getLoopDomain(), true);
+    }
+    auto* allocate_out = IrBuilder::create<kir::Allocate>(
+        out_tv, MemoryType::Global, std::vector<Val*>({}), /*zero_init=*/true);
+
+    for (auto* tv : {in_tv, out_tv}) {
+      tv->outer_split(0, d);
+      tv = hir::swizzle(tv, 0, ParallelType::DIDx);
+      tv->axis(0)->parallelize(ParallelType::Stream);
+    }
+
+    auto* stream_index = IrBuilder::create<Val>(DataType::Index);
+    auto* for_loop = IrBuilder::create<ForLoop>(
+        stream_index,
+        /*start=*/hic->zeroVal(DataType::Index),
+        /*stop=*/IrBuilder::create<Val>(d - 1, DataType::Index));
+
+    TensorView* in_shard =
+        ops::newValLike(in_tv, *in_tv->getDataType())->as<TensorView>();
+    TensorView* out_shard =
+        ops::newValLike(out_tv, *out_tv->getDataType())->as<TensorView>();
+
+    for (auto* tv : {in_shard, out_shard}) {
+      tv->setDeviceMesh(mesh);
+      tv->outer_split(1, d);
+      tv->axis(1)->parallelize(ParallelType::DIDx);
+      tv->outer_split(0, d);
+      tv = hir::swizzle(tv, 0, ParallelType::DIDx);
+      tv->axis(0)->parallelize(ParallelType::Stream);
+      tv->setAllocationDomain(tv->getLoopDomain(), true);
+    }
+
+    IrBuilder::create<ShardByStream>(in_shard, in_tv, stream_index);
+    IrBuilder::create<ShardByStream>(out_shard, out_tv, stream_index);
+    auto* copy = IrBuilder::create<LoadStoreOp>(
+        LoadStoreOpType::Set, out_shard, in_shard);
+
+    for_loop->body().pushBack(in_shard->definition());
+    for_loop->body().pushBack(out_shard->definition());
+    for_loop->body().pushBack(copy);
+
+    hic->pushBackTopLevelExprs(allocate_out);
+    hic->pushBackTopLevelExprs(for_loop);
+  }
+
+  HostIrEvaluator hie(std::move(hic));
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
+  at::Tensor unsharded_in = at::randn({d * 3, d * 5}, options);
+  at::Tensor sharded_in = shardTensor1D(unsharded_in, 1, mesh);
+
+  KernelArgumentHolder ins(sharded_in);
+  ins.setCacheId(0);
+  KernelArgumentHolder outs = hie.runWithInputs(ins);
+  at::Tensor out = outs[0].as<at::Tensor>();
+  at::Tensor expected_out = sharded_in;
+  expected_out.chunk(d, 0)[(my_rank + d - 1) % d].zero_();
+  EXPECT_TRUE(at::allclose(out, expected_out)) << out << " vs " << expected_out;
+}
+
 } // namespace hir
 } // namespace nvfuser

From 606c9a8537818538a5e042473fc71671b9467de1 Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Fri, 30 Jan 2026 13:24:02 -0800
Subject: [PATCH 03/10] add cyclic shift comment

---
 csrc/host_ir/evaluator.cpp | 8 +++++++-
 csrc/host_ir/ir.h          | 2 --
 2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
index 0ea068c751a..aea22620301 100644
--- a/csrc/host_ir/evaluator.cpp
+++ b/csrc/host_ir/evaluator.cpp
@@ -809,6 +809,11 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   auto index = expr_evaluator_.evaluate(shard->stream_index()).as<int64_t>();
 
   if (stream_id->definition() != nullptr) {
+    // If the stream axis is defined by a swizzle, the input to
+    // the swizzle is the index into the `in_tensor`.
+    // Currently, we use cyclic shift swizzle to compute the index:
+    // in_index = (out_index (stream index) + device_id) % num_devices
+
     NVF_CHECK(stream_id->definition()->isA<Swizzle>());
     auto* swizzle = stream_id->definition()->as<Swizzle>();
     ParallelType pt = swizzle->pt();
@@ -827,7 +832,8 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   at::Tensor out_tensor =
       in_tensor
           .chunk(
-              expr_evaluator_.evaluate(stream_id->extent()).as<int64_t>(),
+              stream_id->extent()->evaluate().as<int64_t>(),
+              index,
               getShardedLogicalAxis(out_tv, ParallelType::Stream))
           .at(index);
 
diff --git a/csrc/host_ir/ir.h b/csrc/host_ir/ir.h
index 9b75de0372e..61c2c962432 100644
--- a/csrc/host_ir/ir.h
+++ b/csrc/host_ir/ir.h
@@ -592,12 +592,10 @@ class Swizzle : public Expr {
     return "hir::Swizzle";
   }
 
-  // Input iterdomain to be swizzled
   IterDomain* in() const {
     return inputs().at(0)->as<IterDomain>();
   }
 
-  // Output swizzled iterdomain
   IterDomain* out() const {
     return outputs().at(0)->as<IterDomain>();
   }

From 41163693f496df1861fcbf5a02c8594bb0955774 Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Fri, 30 Jan 2026 13:25:37 -0800
Subject: [PATCH 04/10] unused import

---
 tests/cpp/test_host_ir_evaluator.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/cpp/test_host_ir_evaluator.cpp b/tests/cpp/test_host_ir_evaluator.cpp
index 19846d4c403..aa4f933f65e 100644
--- a/tests/cpp/test_host_ir_evaluator.cpp
+++ b/tests/cpp/test_host_ir_evaluator.cpp
@@ -16,7 +16,6 @@
 #include "fusion.h"
 #include "host_ir/container.h"
 #include "host_ir/evaluator.h"
-#include "host_ir/ops.h"
 #include "ir/builder.h"
 #include "ir/interface_nodes.h"
 #include "ops/alias.h"

From fc87e4970bcd2ac45379b83c5a9f0be4e8b20f33 Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Mon, 2 Feb 2026 18:49:32 -0800
Subject: [PATCH 05/10] move swizzle to ir_nodes

---
 csrc/host_ir/evaluator.cpp             | 12 +++++----
 csrc/host_ir/ir.cpp                    | 32 -----------------------
 csrc/host_ir/ir.h                      | 36 --------------------------
 csrc/host_ir/ops.cpp                   | 15 -----------
 csrc/ir/interface_nodes.h              |  3 +++
 csrc/ir/internal_base_nodes.cpp        | 16 ++++++++++++
 csrc/ir/internal_base_nodes.h          |  4 +++
 csrc/ir/internal_nodes.cpp             | 25 ++++++++++++++++++
 csrc/ir/internal_nodes.h               | 32 +++++++++++++++++++++++
 csrc/multidevice/utils.cpp             |  2 +-
 csrc/tensor_view.cpp                   |  5 ++++
 tests/cpp/test_multidevice_host_ir.cpp |  4 +--
 12 files changed, 95 insertions(+), 91 deletions(-)

diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
index aea22620301..bdde8d54fdc 100644
--- a/csrc/host_ir/evaluator.cpp
+++ b/csrc/host_ir/evaluator.cpp
@@ -813,10 +813,13 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
     // the swizzle is the index into the `in_tensor`.
     // Currently, we use cyclic shift swizzle to compute the index:
     // in_index = (out_index (stream index) + device_id) % num_devices
-
-    NVF_CHECK(stream_id->definition()->isA<Swizzle>());
-    auto* swizzle = stream_id->definition()->as<Swizzle>();
-    ParallelType pt = swizzle->pt();
+    // TODO(prmishra): In the future, the swizzle compute should be done outside
+    // of `shardByStream` such that `add` and `mod` are in the HostIrContainer
+    // similar to
+    // https://github.com/NVIDIA/Fuser/blob/0a6adb140d440cc1b6d5f21dfd05874f9699b2c6/csrc/swizzle.h#L26-L31.
+    NVF_CHECK(stream_id->definition()->isA<Swizzle1D>());
+    auto* swizzle = stream_id->definition()->as<Swizzle1D>();
+    ParallelType pt = swizzle->parallelType();
 
     auto mesh = out_tv->getDeviceMesh();
     // Find the index of the current device in the slice of mesh corresponding
@@ -833,7 +836,6 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   at::Tensor out_tensor =
       in_tensor
           .chunk(
               stream_id->extent()->evaluate().as<int64_t>(),
-              index,
               getShardedLogicalAxis(out_tv, ParallelType::Stream))
           .at(index);
 
diff --git a/csrc/host_ir/ir.cpp b/csrc/host_ir/ir.cpp
index 19a709d1d70..198601355fb 100644
--- a/csrc/host_ir/ir.cpp
+++ b/csrc/host_ir/ir.cpp
@@ -503,36 +503,4 @@ std::string ForLoop::toInlineString(int indent_size) const {
       index, iter_domain->start(), iter_domain->stop());
 }
 
-Swizzle::Swizzle(
-    IrBuilderPasskey passkey,
-    IterDomain* in,
-    IterDomain* out,
-    ParallelType pt)
-    : Expr(passkey, {in}, {out}, {}) {
-  NVF_ERROR(passkey.ir_container_ != nullptr);
-  NVF_ERROR(
-      passkey.ir_container_->isA<HostIrContainer>(),
-      this,
-      "must be registered in a HostIrContainer");
-  NVF_ERROR(in != nullptr);
-  NVF_ERROR(out != nullptr);
-  addDataAttribute(pt);
-}
-
-NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle)
-
-std::string Swizzle::toString(int indent_size) const {
-  std::stringstream ss;
-  indent(ss, indent_size) << out()->toString() << " = Swizzle("
-                          << in()->toString() << ", pt=" << pt() << std::endl;
-  return ss.str();
-}
-
-std::string Swizzle::toInlineString(int indent_size) const {
-  std::stringstream ss;
-  indent(ss, indent_size) << "Swizzle(" << in()->toInlineString()
-                          << ", pt=" << pt() << ")";
-  return ss.str();
-}
-
 } // namespace nvfuser::hir
diff --git a/csrc/host_ir/ir.h b/csrc/host_ir/ir.h
index 61c2c962432..d01263f2e0d 100644
--- a/csrc/host_ir/ir.h
+++ b/csrc/host_ir/ir.h
@@ -569,40 +569,4 @@ class ForLoop : public Expr {
   }
 };
 
-class Swizzle : public Expr {
- public:
-  using Expr::Expr;
-
-  Swizzle(
-      IrBuilderPasskey passkey,
-      IterDomain* in,
-      IterDomain* out,
-      ParallelType pt);
-
-  Swizzle(const Swizzle& other) = delete;
-  Swizzle& operator=(const Swizzle& other) = delete;
-  Swizzle(Swizzle&& other) = delete;
-  Swizzle& operator=(Swizzle&& other) = delete;
-
-  NVFUSER_DECLARE_CLONE_AND_CREATE
-
-  std::string toString(int indent_size = 0) const override;
-  std::string toInlineString(int indent_size = 0) const override;
-  const char* getOpString() const override {
-    return "hir::Swizzle";
-  }
-
-  IterDomain* in() const {
-    return inputs().at(0)->as<IterDomain>();
-  }
-
-  IterDomain* out() const {
-    return outputs().at(0)->as<IterDomain>();
-  }
-
-  ParallelType pt() const {
-    return attribute<ParallelType>(0);
-  }
-};
-
 } // namespace nvfuser::hir
diff --git a/csrc/host_ir/ops.cpp b/csrc/host_ir/ops.cpp
index 00532eec851..05fd42e2764 100644
--- a/csrc/host_ir/ops.cpp
+++ b/csrc/host_ir/ops.cpp
@@ -24,21 +24,6 @@
 
 namespace nvfuser::hir {
 
-TensorView* swizzle(TensorView* in, int64_t axis, ParallelType pt) {
-  NVF_ERROR(in != nullptr);
-
-  IterDomain* swizzle_in = in->axis(axis);
-  IterDomain* swizzle_out = IterDomainBuilder(swizzle_in).build();
-  IrBuilder::create<Swizzle>(swizzle_in, swizzle_out, pt);
-
-  std::vector<IterDomain*> loop_domain = in->getLoopDomain();
-  loop_domain.erase(loop_domain.begin() + axis);
-  loop_domain.insert(loop_domain.begin() + axis, swizzle_out);
-  in->setLoopDomain(loop_domain);
-
-  return in;
-}
-
 TensorView* shardByStream(TensorView* source, Val* stream_index, Expr* e) {
   NVF_ERROR(
       getShardedIterDomain(
diff --git a/csrc/ir/interface_nodes.h b/csrc/ir/interface_nodes.h
index 1b338d84eaf..58f35cacd9f 100644
--- a/csrc/ir/interface_nodes.h
+++ b/csrc/ir/interface_nodes.h
@@ -646,6 +646,9 @@ class NVF_API TensorView : public Val {
   //! to the 2 given indices.
   TensorView* swizzle(SwizzleType swizzle_type, int64_t x, int64_t y);
 
+  //! Swizzle the iterdomain corresponding to the given index.
+  TensorView* swizzle1d(int64_t x, ParallelType pt);
+
   //! Resize an IterDomain by expanding both the left and right sides
   //! by given widths. The resulting IterDomain has an extent of
   //! (left_expansion + axis->extent() + right_expansion).
diff --git a/csrc/ir/internal_base_nodes.cpp b/csrc/ir/internal_base_nodes.cpp
index 77b84123ed1..f81fb34994c 100644
--- a/csrc/ir/internal_base_nodes.cpp
+++ b/csrc/ir/internal_base_nodes.cpp
@@ -576,6 +576,12 @@ std::pair<IterDomain*, IterDomain*> IterDomain::swizzle(
   return std::make_pair(out_x, out_y);
 }
 
+IterDomain* IterDomain::swizzle1d(IterDomain* in, ParallelType pt) {
+  IterDomain* out = IterDomainBuilder(in).build();
+  IrBuilder::createInContainer<Swizzle1D>(in->container(), out, in, pt);
+  return out;
+}
+
 IterDomain* IterDomain::resize(
     IterDomain* in,
     Val* left_expansion,
@@ -1856,6 +1862,16 @@ void TensorDomain::swizzle(
   loop_domain_.insert(loop_domain_.begin() + y, axis_out_y);
 }
 
+void TensorDomain::swizzle1d(int64_t x, ParallelType pt) {
+  x = wrapDim(x);
+
+  IterDomain* swizzle_in = axis(x);
+  IterDomain* swizzle_out = IterDomain::swizzle1d(swizzle_in, pt);
+
+  loop_domain_.erase(loop_domain_.begin() + x);
+  loop_domain_.insert(loop_domain_.begin() + x, swizzle_out);
+}
+
 void TensorDomain::resize(
     int64_t axis,
     Val* left_expansion,
diff --git a/csrc/ir/internal_base_nodes.h b/csrc/ir/internal_base_nodes.h
index 84505694ceb..238d4c22bf4 100644
--- a/csrc/ir/internal_base_nodes.h
+++ b/csrc/ir/internal_base_nodes.h
@@ -391,6 +391,8 @@ class NVF_API IterDomain : public Val {
       IterDomain* in_y,
       SwizzleMode swizzle_mode = SwizzleMode::Data);
 
+  static IterDomain* swizzle1d(IterDomain* in, ParallelType pt);
+
 protected:
   friend TensorDomain;
   friend ReplayTransformations;
@@ -835,6 +837,8 @@ class NVF_API TensorDomain : public Val {
      int64_t y,
      SwizzleMode swizzle_mode = SwizzleMode::Data);
 
+  void swizzle1d(int64_t x, ParallelType pt);
+
   // Resize an axis by left_expansion and right_expansion
   void resize(
       int64_t axis,
diff --git a/csrc/ir/internal_nodes.cpp b/csrc/ir/internal_nodes.cpp
index 3e1e18d9589..b028651d039 100644
--- a/csrc/ir/internal_nodes.cpp
+++ b/csrc/ir/internal_nodes.cpp
@@ -2808,6 +2808,31 @@ std::string Swizzle2D::toInlineString(int indent_size) const {
 
 NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle2D)
 
+Swizzle1D::Swizzle1D(
+    IrBuilderPasskey passkey,
+    IterDomain* out,
+    IterDomain* in,
+    ParallelType pt)
+    : Expr(passkey) {
+  addOutput(out);
+  addInput(in);
+  addDataAttribute(pt);
+}
+
+NVFUSER_DEFINE_CLONE_AND_CREATE(Swizzle1D)
+
+std::string Swizzle1D::toString(int indent_size) const {
+  std::stringstream ss;
+  indent(ss, indent_size) << out()->toString() << " = Swizzle1D("
+                          << in()->toString()
+                          << ", parallelType=" << parallelType() << std::endl;
+  return ss.str();
+}
+
+std::string Swizzle1D::toInlineString(int indent_size) const {
+  NVF_THROW("Swizzle1D can not be printed inline");
+}
+
 Resize::Resize(
     IrBuilderPasskey passkey,
     IterDomain* out,
diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index f0eafd35000..617cf30879c 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -2089,6 +2089,38 @@ class Swizzle2D : public Expr {
   }
 };
 
+class Swizzle1D : public Expr {
+ public:
+  using Expr::Expr;
+
+  Swizzle1D(
+      IrBuilderPasskey passkey,
+      IterDomain* out,
+      IterDomain* in,
+      ParallelType pt);
+
+  NVFUSER_DECLARE_CLONE_AND_CREATE
+
+  const char* getOpString() const override {
+    return "Swizzle1D";
+  }
+
+  std::string toString(int indent_size = 0) const override;
+  std::string toInlineString(int indent_size = 0) const override;
+
+  IterDomain* in() const {
+    return inputs().at(0)->as<IterDomain>();
+  }
+
+  IterDomain* out() const {
+    return outputs().at(0)->as<IterDomain>();
+  }
+
+  ParallelType parallelType() const {
+    return attribute<ParallelType>(0);
+  }
+};
+
 //! IterDomain expression to resize
 class Resize : public Expr {
  public:
diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp
index 6fc9933226d..0581955639d 100644
--- a/csrc/multidevice/utils.cpp
+++ b/csrc/multidevice/utils.cpp
@@ -179,7 +179,7 @@ int64_t getProducingLogicalAxis(const TensorView* tv, IterDomain* id) {
     // When `unshardedSizes` is given a local tensor of shape [1, 1], it's
     // unclear the global shape is [1, D] or [D, 1] or even [2, D/2], etc.
     id = merge->outer();
-  } else if (auto* swizzle = dynamic_cast<hir::Swizzle*>(def)) {
+  } else if (auto* swizzle = dynamic_cast<Swizzle1D*>(def)) {
     id = swizzle->in();
   } else {
     NVF_THROW(
diff --git a/csrc/tensor_view.cpp b/csrc/tensor_view.cpp
index 6de9a30d84e..51e63a268a5 100644
--- a/csrc/tensor_view.cpp
+++ b/csrc/tensor_view.cpp
@@ -769,6 +769,11 @@ TensorView* TensorView::swizzle(
   return this;
 }
 
+TensorView* TensorView::swizzle1d(int64_t x, ParallelType pt) {
+  domain()->swizzle1d(x, pt);
+  return this;
+}
+
 TensorView* TensorView::rFactor(const std::vector<int64_t>& axes) {
   NVF_ERROR(
       !container()->isA<kir::Kernel>(),
diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp
index 5da45d67446..a669c806004 100644
--- a/tests/cpp/test_multidevice_host_ir.cpp
+++ b/tests/cpp/test_multidevice_host_ir.cpp
@@ -533,7 +533,7 @@ TEST_F(MultiDeviceTest, SwizzleWithParallelType) {
 
     for (auto* tv : {in_tv, out_tv}) {
       tv->outer_split(0, d);
-      tv = hir::swizzle(tv, 0, ParallelType::DIDx);
+      tv->swizzle1d(0, ParallelType::DIDx);
       tv->axis(0)->parallelize(ParallelType::Stream);
     }
 
@@ -553,7 +553,7 @@ TEST_F(MultiDeviceTest, SwizzleWithParallelType) {
       tv->outer_split(1, d);
      tv->axis(1)->parallelize(ParallelType::DIDx);
       tv->outer_split(0, d);
-      tv = hir::swizzle(tv, 0, ParallelType::DIDx);
+      tv->swizzle1d(0, ParallelType::DIDx);
       tv->axis(0)->parallelize(ParallelType::Stream);
       tv->setAllocationDomain(tv->getLoopDomain(), true);
     }

From 1557cd02f72a37f7239ef4ef5d2a026a90e0b880 Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Mon, 2 Feb 2026 18:52:22 -0800
Subject: [PATCH 06/10] unused imports

---
 csrc/host_ir/ops.h                     | 2 --
 csrc/multidevice/utils.cpp             | 1 -
 tests/cpp/test_multidevice_host_ir.cpp | 1 -
 3 files changed, 4 deletions(-)

diff --git a/csrc/host_ir/ops.h b/csrc/host_ir/ops.h
index 765e58d4e4b..66c90082427 100644
--- a/csrc/host_ir/ops.h
+++ b/csrc/host_ir/ops.h
@@ -20,8 +20,6 @@
 
 namespace nvfuser::hir {
 
-TensorView* swizzle(TensorView* in, int64_t axis, ParallelType pt);
-
 // Creates a ShardByStream without needing the destination TensorView. Returns
 // the destination TensorView. `e` is the Expr from which we propagate the loop
 // domain from. `source` must be either an input or an output of `e`. The
diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp
index 0581955639d..beb7283c5a1 100644
--- a/csrc/multidevice/utils.cpp
+++ b/csrc/multidevice/utils.cpp
@@ -14,7 +14,6 @@
 #include
 
 #include "compute_at_map.h"
-#include "host_ir/ir.h"
 #include "ir/internal_base_nodes.h"
 #include "ir/internal_nodes.h"
 #include "transform_replay.h"
diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp
index a669c806004..a14b225c2f2 100644
--- a/tests/cpp/test_multidevice_host_ir.cpp
+++ b/tests/cpp/test_multidevice_host_ir.cpp
@@ -10,7 +10,6 @@
 #include "fusion.h"
 #include "host_ir/container.h"
 #include "host_ir/evaluator.h"
-#include "host_ir/ops.h"
 #include "host_ir/pass/stream_parallel_type.h"
 #include "ir/all_nodes.h"
 #include "multidevice/symmetric_tensor.h"

From c99f003a0d3777ce1ad392b09fdc5a577f435b1a Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Mon, 2 Feb 2026 19:02:09 -0800
Subject: [PATCH 07/10] comments

---
 csrc/ir/interface_nodes.h | 5 ++++-
 csrc/ir/internal_nodes.h  | 6 ++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/csrc/ir/interface_nodes.h b/csrc/ir/interface_nodes.h
index fd9f0de5286..017c0a7db8e 100644
--- a/csrc/ir/interface_nodes.h
+++ b/csrc/ir/interface_nodes.h
@@ -646,7 +646,10 @@ class NVF_API TensorView : public Val {
   //! to the 2 given indices.
   TensorView* swizzle(SwizzleType swizzle_type, int64_t x, int64_t y);
 
-  //! Swizzle the iterdomain corresponding to the given index.
+  //! Swizzle1D is currently only used and handled in HostIr
+  //! It computes the `in` id to the swizzle as a function of the device id
+  //! (corresponding to the parallel type) and `out` id. See
+  //! `HostIrEvaluator::handle(ShardByStream)` for usage.
   TensorView* swizzle1d(int64_t x, ParallelType pt);
 
   //! Resize an IterDomain by expanding both the left and right sides
diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index 9eff92b7417..a3ef4782df8 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -1995,8 +1995,10 @@ class Swizzle : public Expr {
   }
 };
 
-//! Applies 2D swizzles on a rectangular tile defined by 2 iterdomains.
-
+// Swizzle1D is currently only used and handled in HostIr.
+// The main use case is to compute the indexing for ring-based overlap, where
+// `out` is stream-parallel and `in` is a function of the device id and stream
+// index. See `HostIrEvaluator::handle(ShardByStream)` for usage.
 class Swizzle1D : public Expr {
  public:
   using Expr::Expr;

From a4d7b8e83d20be341a93e17eaf650dc74d4ff38f Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Mon, 2 Feb 2026 19:04:14 -0800
Subject: [PATCH 08/10] validate parallel type

---
 csrc/tensor_view.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/csrc/tensor_view.cpp b/csrc/tensor_view.cpp
index 150b947687c..a25be606b98 100644
--- a/csrc/tensor_view.cpp
+++ b/csrc/tensor_view.cpp
@@ -769,6 +769,10 @@ TensorView* TensorView::swizzle(
 }
 
 TensorView* TensorView::swizzle1d(int64_t x, ParallelType pt) {
+  NVF_CHECK(
+      deviceParallelTypes().contains(pt),
+      "Swizzle1D only supports device parallel types, given: ",
+      pt);
   domain()->swizzle1d(x, pt);
   return this;
 }

From 102a5f46ab551a195856ccd3850cf5c5ab93c75a Mon Sep 17 00:00:00 2001
From: Priya Mishra <52657555+Priya2698@users.noreply.github.com>
Date: Mon, 2 Feb 2026 19:14:37 -0800
Subject: [PATCH 09/10] Update csrc/ir/internal_nodes.cpp

Co-authored-by: greptile-apps[bot]
 <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 csrc/ir/internal_nodes.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/ir/internal_nodes.cpp b/csrc/ir/internal_nodes.cpp
index ff57913c071..407771386c6 100644
--- a/csrc/ir/internal_nodes.cpp
+++ b/csrc/ir/internal_nodes.cpp
@@ -2788,7 +2788,7 @@ std::string Swizzle1D::toString(int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << out()->toString() << " = Swizzle1D("
                           << in()->toString()
-                          << ", parallelType=" << parallelType() << std::endl;
+                          << ", parallelType=" << parallelType() << ")" << std::endl;
   return ss.str();
 }

From b543dec2f07f8cc697af6adbebdcd66266e0b19d Mon Sep 17 00:00:00 2001
From: Priya Mishra <26priya11@gmail.com>
Date: Mon, 2 Feb 2026 21:55:17 -0800
Subject: [PATCH 10/10] lintrunner, condition error

---
 csrc/host_ir/evaluator.cpp | 3 +--
 csrc/ir/internal_nodes.cpp | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
index bdde8d54fdc..2396767d5b0 100644
--- a/csrc/host_ir/evaluator.cpp
+++ b/csrc/host_ir/evaluator.cpp
@@ -808,7 +808,7 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   auto in_tensor = getKnownConcreteValue(shard->in()).as<at::Tensor>();
   auto index = expr_evaluator_.evaluate(shard->stream_index()).as<int64_t>();
 
-  if (stream_id->definition() != nullptr) {
+  if (stream_id->definition()->isA<Swizzle1D>()) {
     // If the stream axis is defined by a swizzle, the input to
     // the swizzle is the index into the `in_tensor`.
     // Currently, we use cyclic shift swizzle to compute the index:
@@ -817,7 +817,6 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
     // of `shardByStream` such that `add` and `mod` are in the HostIrContainer
     // similar to
     // https://github.com/NVIDIA/Fuser/blob/0a6adb140d440cc1b6d5f21dfd05874f9699b2c6/csrc/swizzle.h#L26-L31.
-    NVF_CHECK(stream_id->definition()->isA<Swizzle1D>());
     auto* swizzle = stream_id->definition()->as<Swizzle1D>();
     ParallelType pt = swizzle->parallelType();
 
diff --git a/csrc/ir/internal_nodes.cpp b/csrc/ir/internal_nodes.cpp
index 407771386c6..0a3666bd6aa 100644
--- a/csrc/ir/internal_nodes.cpp
+++ b/csrc/ir/internal_nodes.cpp
@@ -2788,7 +2788,8 @@ std::string Swizzle1D::toString(int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << out()->toString() << " = Swizzle1D("
                           << in()->toString()
-                          << ", parallelType=" << parallelType() << ")" << std::endl;
+                          << ", parallelType=" << parallelType() << ")"
+                          << std::endl;
   return ss.str();
 }
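
---
Reviewer note (appended by the editor, not part of the patch series): the core
of these commits is the cyclic-shift indexing that
`HostIrEvaluator::handle(ShardByStream*)` performs when the Stream axis is
defined by a `Swizzle1D`: in_index = (stream_index + team_index) % team_size.
The sketch below is a standalone C++ illustration of that formula only;
`swizzledChunkIndex` and `kTeamSize` are hypothetical names, and the real
evaluator derives team_index from the device's position in the DeviceMesh
rather than from a plain device id.

    #include <cstdint>
    #include <cstdio>

    // Hypothetical helper, not an nvFuser API: the chunk read by device
    // `team_index` at stream-loop iteration `stream_index`.
    int64_t swizzledChunkIndex(
        int64_t stream_index, int64_t team_index, int64_t team_size) {
      return (stream_index + team_index) % team_size;
    }

    int main() {
      constexpr int64_t kTeamSize = 4; // e.g. a 4-device DIDx mesh axis
      for (int64_t stream_index = 0; stream_index < kTeamSize; ++stream_index) {
        for (int64_t team_index = 0; team_index < kTeamSize; ++team_index) {
          std::printf(
              "%lld ",
              static_cast<long long>(
                  swizzledChunkIndex(stream_index, team_index, kTeamSize)));
        }
        // Each printed row is a permutation of {0..3}: at every stream
        // iteration the devices touch pairwise-distinct chunks, which is the
        // access pattern needed for ring-based communication/compute overlap.
        std::printf("\n");
      }
      return 0;
    }

This is why the swizzle carries a ParallelType rather than a constant offset
after PATCH 02: the per-device shift falls out of the device's coordinate
along that mesh axis instead of being baked in by the scheduler.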