Skip to content

Commit 1bdfb5d

Browse files
committed
[FEAT] Update NCU Profile Script and RelWithDebInfo Build Mode
1 parent 5ad8466 commit 1bdfb5d

File tree

17 files changed

+231
-30
lines changed

17 files changed

+231
-30
lines changed

.clangd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ Diagnostics:
4646
readability-identifier-naming.PrivateMemberCase: aNy_CasE
4747
readability-identifier-naming.PublicMemberCase: aNy_CasE
4848
readability-identifier-naming.NamespaceCase: lower_case
49-
readability-identifier-naming.EnumCase: camelBack
49+
readability-identifier-naming.EnumCase: CamelCase
5050
readability-identifier-naming.ClassCase: CamelCase
5151
readability-identifier-naming.StructCase: CamelCase
5252
readability-identifier-naming.FunctionCase: camelBack

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,4 +160,6 @@ cython_debug/
160160

161161
/pmpp/_torch_ops
162162
vcpkg_installed
163-
/tmp
163+
/tmp
164+
/output/*
165+
!.gitkeep

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
## 1. Environment
77

8+
### 1.1. Method 1: Use Docker Image
9+
810
The simplest way is to use my docker image [jamesnulliu/deeplearning:latest](https://hub.docker.com/r/jamesnulliu/deeplearning) which contains all the software you need to build the project:
911

1012
```bash
@@ -13,6 +15,8 @@ docker pull jamesnulliu/deeplearning:latest
1315

1416
> Check my blog: [Docker Container with Nvidia GPU Support](https://jamesnulliu.github.io/blogs/docker-container-with-nvidia-gpu-support) if you need any help.
1517
18+
### 1.2. Method 2: Setup Environment Manually
19+
1620
Or if you are planning to set up your own environment, here are some tips:
1721

1822
You should install all of the software with the corresponding versions listed below:
@@ -27,7 +31,7 @@ You should install all the softwares with corresponding versions listed bellow:
2731

2832
**🎯Miniconda**
2933

30-
Managing python environments with miniconda is always a good choice. Check [the official website](https://docs.anaconda.com/miniconda/install/#quick-command-line-install) for an installation guide.
34+
Managing python environments with miniconda is always a good choice. Check [the official website](https://www.anaconda.com/docs/getting-started/miniconda/install#quickstart-install-instructions) for an installation guide.
3135

3236
After installation, if you do not intend to install all the packages in `base` environment, create a new conda environment named `PMPP` (or whatever you like) and activate it:
3337

configs/lib-tests.yml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
OpTest:
2+
VecAdd:
3+
- nElems: 64
4+
- nElems: 1024
5+
- nElems: 2048
26
Conv2D:
37
- inputHeight: 32
48
inputWidth: 32
@@ -13,4 +17,8 @@ OpTest:
1317
# # [NOTE] The following calculation results would be wrong?
1418
# - nInputs: 64
1519
# - nInputs: 128
16-
# - nInputs: 512
20+
# - nInputs: 512
21+
PrefixSum:
22+
- nInputs: 32
23+
- nInputs: 64
24+
- nInputs: 128

csrc/cmake/compilers/cuda-compiler-configs.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
# - `CMAKE_CUDA_STANDARD`: CUDA Standard. Default: 20.
88
# ==================================================================================================
99

10-
include(${PROJECT_SOURCE_DIR}/cmake/utils/logging.cmake)
10+
include(${CMAKE_CURRENT_LIST_DIR}/../utils/logging.cmake)
1111

1212
enable_language(CUDA)
1313

@@ -24,4 +24,5 @@ log_info("CMAKE_CUDA_STANDARD: ${CMAKE_CUDA_STANDARD}")
2424

2525
string(APPEND CMAKE_CUDA_FLAGS " --expt-relaxed-constexpr")
2626
string(APPEND CMAKE_CUDA_FLAGS_RELEASE " -O3")
27+
string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -O3 -lineinfo")
2728
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -lineinfo")

csrc/cmake/compilers/cxx-compiler-configs.cmake

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,15 @@
1111
# - `STACK_SIZE`: Stack size for the executable. Default: 1048576 (1MB).
1212
# ==================================================================================================
1313

14-
include(${PROJECT_SOURCE_DIR}/cmake/utils/common.cmake)
14+
include(${CMAKE_CURRENT_LIST_DIR}/../utils/common.cmake)
1515

1616
enable_language(CXX)
1717

18+
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
19+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
20+
1821
set_default_values(
19-
CMAKE_EXPORT_COMPILE_COMMANDS ON
2022
CMAKE_CXX_STANDARD 20
21-
CMAKE_CXX_STANDARD_REQUIRED ON
2223
CMAKE_CXX_SCAN_FOR_MODULES OFF
2324
STACK_SIZE 1048576
2425
)
@@ -31,13 +32,15 @@ log_info("STACK_SIZE: ${STACK_SIZE}")
3132
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
3233
string(APPEND CMAKE_CXX_FLAGS " /permissive- /Zc:forScope /openmp /Zc:__cplusplus")
3334
string(APPEND CMAKE_CXX_FLAGS_RELEASE " /O2")
35+
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /O2 /Zi")
3436
string(APPEND CMAKE_CXX_FLAGS_DEBUG " /Zi")
3537
# Set stack size
3638
string(APPEND CMAKE_EXE_LINKER_FLAGS " /STACK:${STACK_SIZE}")
3739
# Compiler flags for Clang
3840
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
3941
string(APPEND CMAKE_CXX_FLAGS " -fopenmp -Wall -Wextra -Werror")
4042
string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O3")
43+
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g")
4144
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -g")
4245
# Set stack size
4346
if (WIN32)
@@ -49,6 +52,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
4952
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
5053
string(APPEND CMAKE_CXX_FLAGS " -fopenmp -Wall -Wextra -Werror")
5154
string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O3")
55+
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g")
5256
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -g")
5357
# Set stack size
5458
if (WIN32)

csrc/lib/ops/prefixSum/op.cuh

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
#pragma once

#include "pmpp/pch.hpp"

#include "pmpp/utils/math.hpp"

namespace pmpp::ops::cuda
{

/**
 * @brief Kogge-Stone inclusive prefix sum (scan) within each block.
 *
 * Each block scans its own `blockDim.x`-sized slice of `input` in shared
 * memory and writes the per-element inclusive sums to `output`. The scan is
 * local to a block: cross-block composition is handled by `launchPrefixSum`.
 *
 * @param input     Device pointer, `n` elements.
 * @param output    Device pointer, `n` elements (may alias `input`; each
 *                  thread reads its element before any write).
 * @param n         Number of valid elements.
 * @param blockSums Optional device array of `gridDim.x` elements; when
 *                  non-null, each block stores its total here so the caller
 *                  can propagate offsets across blocks.
 */
template <typename ScalarT>
__global__ void koggeStonePrefixSumKernel(const ScalarT* input,
                                          ScalarT* output, uint32_t n,
                                          ScalarT* blockSums = nullptr)
{
    // [FIX] `extern __shared__ ScalarT shmem[]` in a template kernel creates
    // conflicting definitions of the same shared symbol when the kernel is
    // instantiated for more than one ScalarT. Alias an untyped buffer instead.
    extern __shared__ unsigned char shmemRaw[];
    auto* shmem = reinterpret_cast<ScalarT*>(shmemRaw);

    uint32_t btid = threadIdx.x;                           // Block Thread ID
    uint32_t gtid = blockIdx.x * blockDim.x + threadIdx.x; // Global Thread ID

    // Out-of-range lanes contribute the additive identity.
    shmem[btid] = (gtid < n) ? input[gtid] : ScalarT(0);
    __syncthreads();

    for (uint32_t stride = 1; stride < blockDim.x; stride *= 2) {
        // Two-phase read-then-write with barriers between them avoids the
        // read/write race on shmem within an iteration.
        ScalarT tmp = 0;
        if (btid >= stride) {
            tmp = shmem[btid] + shmem[btid - stride];
        }
        __syncthreads();
        if (btid >= stride) {
            shmem[btid] = tmp;
        }
        __syncthreads();
    }

    if (gtid < n) {
        output[gtid] = shmem[btid];
    }
    // Last lane of the block holds the block's total after the scan.
    if (blockSums != nullptr && btid == blockDim.x - 1) {
        blockSums[blockIdx.x] = shmem[btid];
    }
}

/**
 * @brief Adds the inclusive total of all preceding blocks to every element
 *        of each block, turning per-block scans into a global scan.
 *
 * @param output       Device pointer, `n` partially-scanned elements.
 * @param blockOffsets Inclusive scan of the per-block totals
 *                     (`blockOffsets[i-1]` = sum of blocks `0..i-1`).
 * @param n            Number of valid elements.
 */
template <typename ScalarT>
__global__ void addBlockOffsetsKernel(ScalarT* output,
                                      const ScalarT* blockOffsets, uint32_t n)
{
    uint32_t gtid = blockIdx.x * blockDim.x + threadIdx.x;
    if (blockIdx.x > 0 && gtid < n) {
        output[gtid] += blockOffsets[blockIdx.x - 1];
    }
}

/**
 * @brief Inclusive prefix sum of `n` device elements.
 *
 * [FIX] The previous version launched `ceilDiv(n, blockSize)` blocks of a
 * block-local Kogge-Stone kernel, so any `n > blockSize` produced per-block
 * scans instead of a global scan. We now scan per-block totals (recursively,
 * depth O(log_{blockSize} n)) and add them back as offsets.
 *
 * @param d_input  Device pointer, `n` elements.
 * @param d_output Device pointer, `n` elements.
 * @param n        Number of elements; `n == 0` is a no-op.
 */
template <typename ScalarT>
void launchPrefixSum(const ScalarT* d_input, ScalarT* d_output, uint32_t n)
{
    constexpr uint32_t blockSize = 256;
    uint32_t gridSize = ceilDiv(n, blockSize);

    if (gridSize <= 1) {
        koggeStonePrefixSumKernel<<<1, blockSize,
                                    blockSize * sizeof(ScalarT)>>>(
            d_input, d_output, n);
        return;
    }

    ScalarT* d_blockSums = nullptr;
    cudaMalloc(&d_blockSums, gridSize * sizeof(ScalarT));
    koggeStonePrefixSumKernel<<<gridSize, blockSize,
                                blockSize * sizeof(ScalarT)>>>(
        d_input, d_output, n, d_blockSums);
    // Scan the per-block totals in place, then shift each block by the total
    // of everything before it.
    launchPrefixSum(d_blockSums, d_blockSums, gridSize);
    addBlockOffsetsKernel<<<gridSize, blockSize>>>(d_output, d_blockSums, n);
    cudaFree(d_blockSums);
}

namespace torch_impl
{
/// Inclusive prefix sum of a CUDA tensor along dim 0.
/// NOTE(review): assumes `A` is 1-D and contiguous — TODO confirm against
/// callers / add explicit checks if other layouts are expected.
inline auto prefixSum(const torch::Tensor& A) -> torch::Tensor
{
    torch::Tensor B = torch::empty_like(A);
    switch (A.scalar_type()) {
    case torch::kFloat32: {
        pmpp::ops::cuda::launchPrefixSum(
            A.data_ptr<fp32_t>(), B.data_ptr<fp32_t>(),
            static_cast<uint32_t>(A.size(0)));
        break;
    }
    // [CONSISTENCY] The CPU implementation supports kInt32; mirror it here.
    case torch::kInt32: {
        pmpp::ops::cuda::launchPrefixSum(
            A.data_ptr<int32_t>(), B.data_ptr<int32_t>(),
            static_cast<uint32_t>(A.size(0)));
        break;
    }
    default:
        AT_ERROR("Unsupported dtype: ", A.dtype());
    }

    return B;
}
} // namespace torch_impl

} // namespace pmpp::ops::cuda

csrc/lib/ops/prefixSum/op.hpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
#pragma once

#include "pmpp/pch.hpp"

namespace pmpp::ops::cpu
{
/**
 * @brief Sequential inclusive prefix sum (scan).
 *
 * `output[i] = input[0] + ... + input[i]` for every `i < n`.
 *
 * @param input  Source array of `n` elements (not modified).
 * @param output Destination array of `n` elements; may alias `input`.
 * @param n      Element count; `n == 0` is a no-op.
 */
template <typename ScalarT>
void launchPrefixSum(const ScalarT* input, ScalarT* output, size_t n)
{
    // [FIX] Guard the empty case: the previous version unconditionally read
    // input[0], which is out-of-bounds for n == 0.
    if (n == 0) {
        return;
    }
    output[0] = input[0];
    for (size_t i = 1; i < n; ++i) {
        output[i] = output[i - 1] + input[i];
    }
}

namespace torch_impl
{
/// Inclusive prefix sum of a CPU tensor along dim 0.
/// NOTE(review): assumes `A` is 1-D and contiguous — TODO confirm against
/// callers / add explicit checks if other layouts are expected.
inline auto prefixSum(const torch::Tensor& A) -> torch::Tensor
{
    torch::Tensor B = torch::empty_like(A);
    switch (A.scalar_type()) {
    case torch::kFloat32: {
        pmpp::ops::cpu::launchPrefixSum(A.data_ptr<fp32_t>(),
                                        B.data_ptr<fp32_t>(),
                                        static_cast<size_t>(A.size(0)));
        break;
    }
    case torch::kInt32: {
        pmpp::ops::cpu::launchPrefixSum(A.data_ptr<int32_t>(),
                                        B.data_ptr<int32_t>(),
                                        static_cast<size_t>(A.size(0)));
        break;
    }
    default:
        AT_ERROR("Unsupported dtype: ", A.dtype());
    }

    return B;
}
} // namespace torch_impl
} // namespace pmpp::ops::cpu

csrc/lib/ops/torch_bind.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ TORCH_LIBRARY(pmpp, m)
1313
m.def("conv2d(Tensor input, Tensor kernel) -> Tensor");
1414
m.def("alphabet_histogram(Tensor input, int divider) -> Tensor");
1515
m.def("mul_reduction(Tensor input) -> Tensor");
16+
m.def("prefix_sum(Tensor input) -> Tensor");
1617
}
1718

1819
// Register the implementations.
@@ -27,6 +28,7 @@ TORCH_LIBRARY_IMPL(pmpp, CPU, m)
2728
m.impl("alphabet_histogram",
2829
&pmpp::ops::cpu::torch_impl::alphabetHistogram);
2930
m.impl("mul_reduction", &pmpp::ops::cpu::torch_impl::mulReduction);
31+
m.impl("prefix_sum", &pmpp::ops::cpu::torch_impl::prefixSum);
3032
}
3133

3234
TORCH_LIBRARY_IMPL(pmpp, CUDA, m)
@@ -38,4 +40,5 @@ TORCH_LIBRARY_IMPL(pmpp, CUDA, m)
3840
m.impl("alphabet_histogram",
3941
&pmpp::ops::cuda::torch_impl::alphabetHistogram);
4042
m.impl("mul_reduction", &pmpp::ops::cuda::torch_impl::mulReduction);
43+
m.impl("prefix_sum", &pmpp::ops::cuda::torch_impl::prefixSum);
4144
}

csrc/lib/ops/torch_impl.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#include "./cvtRGBtoGray/op.hpp"
99
#include "./matmul/op.cuh"
1010
#include "./matmul/op.hpp"
11+
#include "./prefixSum/op.cuh"
12+
#include "./prefixSum/op.hpp"
1113
#include "./reduction/op.cuh"
1214
#include "./reduction/op.hpp"
1315
#include "./vecAdd/op.cuh"

0 commit comments

Comments
 (0)