@@ -8,34 +8,46 @@ namespace pmpp::ops::cuda
 {
 
 template <typename ScalarT, typename PredT>
-__global__ void reductionKernel(ScalarT* in, ScalarT* out, const PredT& pred)
+__global__ void reductionKernel(const ScalarT* in, uint32_t n, ScalarT* out,
+                                const PredT& pred)
 {
     // Thread index in the block
-    int32_t bTid = threadIdx.x;
-    int32_t i = bTid * 2;
-    for (uint32_t stride = 1; stride < blockDim.x; stride *= 2) {
-        if (bTid % stride == 0) {
-            in[i] = pred(in[i], in[i + stride]);
-        }
+    uint32_t bTid = threadIdx.x;
+    extern __shared__ ScalarT shmem[];
+
+    uint32_t stride = blockDim.x;
+    shmem[bTid] = pred(in[bTid], in[bTid + stride]);
+    stride /= 2;
+
+    for (; stride >= 1; stride /= 2) {
         __syncthreads();
+        if (bTid < stride) {
+            shmem[bTid] = pred(shmem[bTid], shmem[bTid + stride]);
+        }
     }
     if (bTid == 0) {
-        out[blockIdx.x] = in[0];
+        out[0] = shmem[0];
     }
 }
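The rewritten kernel swaps the old divergent, in-place scheme (strided `bTid % stride` updates through global memory) for a convergent tree reduction in dynamically sized shared memory: each thread first folds one pair of inputs while staging into `shmem`, and every following pass halves the number of active threads, keeping the survivors packed into the low warps. Two constraints are implicit: `n` must be a power of two (at most twice the maximum block size), and the host-side `pred` object is only safe to use from the kernel because stateless functors like `std::multiplies<>` never read through the reference. For spot-checking the kernel, a host-side reference fold could look like this (a sketch; `referenceReduction` is not part of the commit):

    // Left fold of pred over in[0..n) on the host, for validating
    // the device result on small inputs.
    template <typename ScalarT, typename PredT>
    ScalarT referenceReduction(const ScalarT* in, uint32_t n, const PredT& pred)
    {
        ScalarT acc = in[0];
        for (uint32_t i = 1; i < n; ++i) {
            acc = pred(acc, in[i]);
        }
        return acc;
    }

Since the tree reduction reassociates operations, floating-point results may differ from this sequential fold by rounding error.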
 
 template <typename ScalarT, typename PredT>
-[[nodiscard]] auto launchReduction(ScalarT* in, int32_t n, const PredT& pred)
-    -> ScalarT
+[[nodiscard]] auto launchReduction(const ScalarT* in, uint32_t n,
+                                   const PredT& pred) -> ScalarT
 {
-    dim3 blockDim = {uint32_t(n), 1, 1};
-    dim3 gridDim = {uint32_t(ceilDiv(n, blockDim.x)), 1, 1};
     ScalarT* d_out;
-    cudaMalloc(&d_out, gridDim.x * sizeof(ScalarT));
-    reductionKernel<<<gridDim, blockDim>>>(in, d_out, pred);
+    cudaMalloc(&d_out, 1 * sizeof(ScalarT));
+
+    uint32_t nThreads = n / 2;
+    dim3 blockDim = {nThreads, 1, 1};
+    dim3 gridDim = {1, 1, 1};
+    uint32_t shmemSize = blockDim.x * sizeof(ScalarT);
+
+    reductionKernel<<<gridDim, blockDim, shmemSize>>>(in, n, d_out, pred);
+
     ScalarT out;
     cudaMemcpy(&out, d_out, sizeof(ScalarT), cudaMemcpyDeviceToHost);
     cudaFree(d_out);
+
     PMPP_DEBUG_CUDA_ERR_CHECK(cudaGetLastError());
 
     return out;
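On the host side, the launcher now runs a single block of n / 2 threads with matching dynamic shared memory, so one launch consumes the entire input. A minimal call site might look like the following sketch (not part of the commit; the size 1024 and `std::plus<>` are arbitrary choices, and the build is assumed to allow standard functors in device code, as the `std::multiplies<>` call below already requires):

    #include <cuda_runtime.h>
    #include <functional>
    #include <vector>

    float sumExample()
    {
        // Stage 1024 ones on the device; 1024 is a power of two and fits
        // the single-block limit (at most 2048 elements with 1024 threads).
        std::vector<float> host(1024, 1.0f);
        float* d_in = nullptr;
        cudaMalloc(&d_in, host.size() * sizeof(float));
        cudaMemcpy(d_in, host.data(), host.size() * sizeof(float),
                   cudaMemcpyHostToDevice);

        float sum = launchReduction(d_in, 1024u, std::plus<>());

        cudaFree(d_in);
        return sum;  // expected: 1024.0f
    }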
@@ -46,13 +58,12 @@ namespace torch_impl
 [[nodiscard]] inline auto mulReduction(const torch::Tensor& in)
     -> torch::Tensor
 {
-    torch::Tensor mutableIn = in.contiguous();
     torch::Tensor result = {};
 
     switch (in.scalar_type()) {
     case torch::kFloat32: {
         result =
-            torch::tensor(launchReduction(mutableIn.mutable_data_ptr<fp32_t>(),
+            torch::tensor(launchReduction(in.const_data_ptr<fp32_t>(),
                                           in.numel(), std::multiplies<>()),
                           in.options());
         break;
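Dropping the `mutableIn` copy follows directly from the kernel change: the input is now read through a const pointer and never written, so `const_data_ptr` on the incoming tensor suffices. One caveat: without the old `contiguous()` call, the wrapper now assumes the caller already passes a contiguous CUDA tensor. A hypothetical call site (not part of the commit):

    // 1024 elements: contiguous, power of two, within the single-block limit.
    torch::Tensor x = torch::rand(
        {1024}, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA));
    torch::Tensor prod = torch_impl::mulReduction(x);  // product of all elements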