Skip to content

Commit 1bdfb5d

Browse files
committed
[FEAT] Update NCU Profile Script and RelWithDebInfo Build Mode
1 parent 5ad8466 commit 1bdfb5d

File tree

17 files changed

+231
-30
lines changed

17 files changed

+231
-30
lines changed

.clangd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ Diagnostics:
4646
readability-identifier-naming.PrivateMemberCase: aNy_CasE
4747
readability-identifier-naming.PublicMemberCase: aNy_CasE
4848
readability-identifier-naming.NamespaceCase: lower_case
49-
readability-identifier-naming.EnumCase: camelBack
49+
readability-identifier-naming.EnumCase: CamelCase
5050
readability-identifier-naming.ClassCase: CamelCase
5151
readability-identifier-naming.StructCase: CamelCase
5252
readability-identifier-naming.FunctionCase: camelBack

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,4 +160,6 @@ cython_debug/
160160

161161
/pmpp/_torch_ops
162162
vcpkg_installed
163-
/tmp
163+
/tmp
164+
/output/*
165+
!.gitkeep

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
## 1. Environment
77

8+
### 1.1. Method 1: Use Docker Image
9+
810
The simplest way is to use my docker image [jamesnulliu/deeplearning:latest](https://hub.docker.com/r/jamesnulliu/deeplearning) which contains all the software you need to build the project:
911

1012
```bash
@@ -13,6 +15,8 @@ docker pull jamesnulliu/deeplearning:latest
1315

1416
> Check my blog: [Docker Container with Nvidia GPU Support](https://jamesnulliu.github.io/blogs/docker-container-with-nvidia-gpu-support) if you need any help.
1517
18+
### 1.2. Method 2: Setup Environment Manually
19+
1620
Or if you are planning to set up your own environment, here are some tips:
1721

1822
You should install all of the software with the corresponding versions listed below:
@@ -27,7 +31,7 @@ You should install all the softwares with corresponding versions listed bellow:
2731

2832
**🎯Miniconda**
2933

30-
Managing python environments with miniconda is always a good choice. Check [the official website](https://docs.anaconda.com/miniconda/install/#quick-command-line-install) for an installation guide.
34+
Managing python environments with miniconda is always a good choice. Check [the official website](https://www.anaconda.com/docs/getting-started/miniconda/install#quickstart-install-instructions) for an installation guide.
3135

3236
After installation, if you do not intend to install all the packages in `base` environment, create a new conda environment named `PMPP` (or whatever you like) and activate it:
3337

configs/lib-tests.yml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
OpTest:
2+
VecAdd:
3+
- nElems: 64
4+
- nElems: 1024
5+
- nElems: 2048
26
Conv2D:
37
- inputHeight: 32
48
inputWidth: 32
@@ -13,4 +17,8 @@ OpTest:
1317
# # [NOTE] The following calculation results would be wrong?
1418
# - nInputs: 64
1519
# - nInputs: 128
16-
# - nInputs: 512
20+
# - nInputs: 512
21+
PrefixSum:
22+
- nInputs: 32
23+
- nInputs: 64
24+
- nInputs: 128

csrc/cmake/compilers/cuda-compiler-configs.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
# - `CMAKE_CUDA_STANDARD`: CUDA Standard. Default: 20.
88
# ==================================================================================================
99

10-
include(${PROJECT_SOURCE_DIR}/cmake/utils/logging.cmake)
10+
include(${CMAKE_CURRENT_LIST_DIR}/../utils/logging.cmake)
1111

1212
enable_language(CUDA)
1313

@@ -24,4 +24,5 @@ log_info("CMAKE_CUDA_STANDARD: ${CMAKE_CUDA_STANDARD}")
2424

2525
string(APPEND CMAKE_CUDA_FLAGS " --expt-relaxed-constexpr")
2626
string(APPEND CMAKE_CUDA_FLAGS_RELEASE " -O3")
27+
string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -O3 -lineinfo")
2728
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -lineinfo")

csrc/cmake/compilers/cxx-compiler-configs.cmake

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,15 @@
1111
# - `STACK_SIZE`: Stack size for the executable. Default: 1048576 (1MB).
1212
# ==================================================================================================
1313

14-
include(${PROJECT_SOURCE_DIR}/cmake/utils/common.cmake)
14+
include(${CMAKE_CURRENT_LIST_DIR}/../utils/common.cmake)
1515

1616
enable_language(CXX)
1717

18+
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
19+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
20+
1821
set_default_values(
19-
CMAKE_EXPORT_COMPILE_COMMANDS ON
2022
CMAKE_CXX_STANDARD 20
21-
CMAKE_CXX_STANDARD_REQUIRED ON
2223
CMAKE_CXX_SCAN_FOR_MODULES OFF
2324
STACK_SIZE 1048576
2425
)
@@ -31,13 +32,15 @@ log_info("STACK_SIZE: ${STACK_SIZE}")
3132
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
3233
string(APPEND CMAKE_CXX_FLAGS " /permissive- /Zc:forScope /openmp /Zc:__cplusplus")
3334
string(APPEND CMAKE_CXX_FLAGS_RELEASE " /O2")
35+
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /O2 /Zi")
3436
string(APPEND CMAKE_CXX_FLAGS_DEBUG " /Zi")
3537
# Set stack size
3638
string(APPEND CMAKE_EXE_LINKER_FLAGS " /STACK:${STACK_SIZE}")
3739
# Compiler flags for Clang
3840
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
3941
string(APPEND CMAKE_CXX_FLAGS " -fopenmp -Wall -Wextra -Werror")
4042
string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O3")
43+
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g")
4144
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -g")
4245
# Set stack size
4346
if (WIN32)
@@ -49,6 +52,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
4952
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
5053
string(APPEND CMAKE_CXX_FLAGS " -fopenmp -Wall -Wextra -Werror")
5154
string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O3")
55+
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g")
5256
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -g")
5357
# Set stack size
5458
if (WIN32)

csrc/lib/ops/prefixSum/op.cuh

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
#pragma once

#include "pmpp/pch.hpp"

#include "pmpp/utils/math.hpp"

namespace pmpp::ops::cuda
{

/**
 * @brief Kogge-Stone inclusive prefix sum (scan) within each block.
 *
 * Each block scans its own `blockDim.x`-sized slice of `input` in shared
 * memory and writes the per-element inclusive sums to `output`. The scan is
 * local to a block: cross-block composition is handled by `launchPrefixSum`.
 *
 * @param input     Device pointer, `n` elements.
 * @param output    Device pointer, `n` elements (may alias `input`; each
 *                  thread reads its element before any write).
 * @param n         Number of valid elements.
 * @param blockSums Optional device array of `gridDim.x` elements; when
 *                  non-null, each block stores its total here so the caller
 *                  can propagate offsets across blocks.
 */
template <typename ScalarT>
__global__ void koggeStonePrefixSumKernel(const ScalarT* input,
                                          ScalarT* output, uint32_t n,
                                          ScalarT* blockSums = nullptr)
{
    // [FIX] `extern __shared__ ScalarT shmem[]` in a template kernel creates
    // conflicting definitions of the same shared symbol when the kernel is
    // instantiated for more than one ScalarT. Alias an untyped buffer instead.
    extern __shared__ unsigned char shmemRaw[];
    auto* shmem = reinterpret_cast<ScalarT*>(shmemRaw);

    uint32_t btid = threadIdx.x;                           // Block Thread ID
    uint32_t gtid = blockIdx.x * blockDim.x + threadIdx.x; // Global Thread ID

    // Out-of-range lanes contribute the additive identity.
    shmem[btid] = (gtid < n) ? input[gtid] : ScalarT(0);
    __syncthreads();

    for (uint32_t stride = 1; stride < blockDim.x; stride *= 2) {
        // Two-phase read-then-write with barriers between them avoids the
        // read/write race on shmem within an iteration.
        ScalarT tmp = 0;
        if (btid >= stride) {
            tmp = shmem[btid] + shmem[btid - stride];
        }
        __syncthreads();
        if (btid >= stride) {
            shmem[btid] = tmp;
        }
        __syncthreads();
    }

    if (gtid < n) {
        output[gtid] = shmem[btid];
    }
    // Last lane of the block holds the block's total after the scan.
    if (blockSums != nullptr && btid == blockDim.x - 1) {
        blockSums[blockIdx.x] = shmem[btid];
    }
}

/**
 * @brief Adds the inclusive total of all preceding blocks to every element
 *        of each block, turning per-block scans into a global scan.
 *
 * @param output       Device pointer, `n` partially-scanned elements.
 * @param blockOffsets Inclusive scan of the per-block totals
 *                     (`blockOffsets[i-1]` = sum of blocks `0..i-1`).
 * @param n            Number of valid elements.
 */
template <typename ScalarT>
__global__ void addBlockOffsetsKernel(ScalarT* output,
                                      const ScalarT* blockOffsets, uint32_t n)
{
    uint32_t gtid = blockIdx.x * blockDim.x + threadIdx.x;
    if (blockIdx.x > 0 && gtid < n) {
        output[gtid] += blockOffsets[blockIdx.x - 1];
    }
}

/**
 * @brief Inclusive prefix sum of `n` device elements.
 *
 * [FIX] The previous version launched `ceilDiv(n, blockSize)` blocks of a
 * block-local Kogge-Stone kernel, so any `n > blockSize` produced per-block
 * scans instead of a global scan. We now scan per-block totals (recursively,
 * depth O(log_{blockSize} n)) and add them back as offsets.
 *
 * @param d_input  Device pointer, `n` elements.
 * @param d_output Device pointer, `n` elements.
 * @param n        Number of elements; `n == 0` is a no-op.
 */
template <typename ScalarT>
void launchPrefixSum(const ScalarT* d_input, ScalarT* d_output, uint32_t n)
{
    constexpr uint32_t blockSize = 256;
    uint32_t gridSize = ceilDiv(n, blockSize);

    if (gridSize <= 1) {
        koggeStonePrefixSumKernel<<<1, blockSize,
                                    blockSize * sizeof(ScalarT)>>>(
            d_input, d_output, n);
        return;
    }

    ScalarT* d_blockSums = nullptr;
    cudaMalloc(&d_blockSums, gridSize * sizeof(ScalarT));
    koggeStonePrefixSumKernel<<<gridSize, blockSize,
                                blockSize * sizeof(ScalarT)>>>(
        d_input, d_output, n, d_blockSums);
    // Scan the per-block totals in place, then shift each block by the total
    // of everything before it.
    launchPrefixSum(d_blockSums, d_blockSums, gridSize);
    addBlockOffsetsKernel<<<gridSize, blockSize>>>(d_output, d_blockSums, n);
    cudaFree(d_blockSums);
}

namespace torch_impl
{
/// Inclusive prefix sum of a CUDA tensor along dim 0.
/// NOTE(review): assumes `A` is 1-D and contiguous — TODO confirm against
/// callers / add explicit checks if other layouts are expected.
inline auto prefixSum(const torch::Tensor& A) -> torch::Tensor
{
    torch::Tensor B = torch::empty_like(A);
    switch (A.scalar_type()) {
    case torch::kFloat32: {
        pmpp::ops::cuda::launchPrefixSum(
            A.data_ptr<fp32_t>(), B.data_ptr<fp32_t>(),
            static_cast<uint32_t>(A.size(0)));
        break;
    }
    // [CONSISTENCY] The CPU implementation supports kInt32; mirror it here.
    case torch::kInt32: {
        pmpp::ops::cuda::launchPrefixSum(
            A.data_ptr<int32_t>(), B.data_ptr<int32_t>(),
            static_cast<uint32_t>(A.size(0)));
        break;
    }
    default:
        AT_ERROR("Unsupported dtype: ", A.dtype());
    }

    return B;
}
} // namespace torch_impl

} // namespace pmpp::ops::cuda

csrc/lib/ops/prefixSum/op.hpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
#pragma once

#include "pmpp/pch.hpp"

namespace pmpp::ops::cpu
{
/**
 * @brief Sequential inclusive prefix sum (scan).
 *
 * `output[i] = input[0] + ... + input[i]` for every `i < n`.
 *
 * @param input  Source array of `n` elements (not modified).
 * @param output Destination array of `n` elements; may alias `input`.
 * @param n      Element count; `n == 0` is a no-op.
 */
template <typename ScalarT>
void launchPrefixSum(const ScalarT* input, ScalarT* output, size_t n)
{
    // [FIX] Guard the empty case: the previous version unconditionally read
    // input[0], which is out-of-bounds for n == 0.
    if (n == 0) {
        return;
    }
    output[0] = input[0];
    for (size_t i = 1; i < n; ++i) {
        output[i] = output[i - 1] + input[i];
    }
}

namespace torch_impl
{
/// Inclusive prefix sum of a CPU tensor along dim 0.
/// NOTE(review): assumes `A` is 1-D and contiguous — TODO confirm against
/// callers / add explicit checks if other layouts are expected.
inline auto prefixSum(const torch::Tensor& A) -> torch::Tensor
{
    torch::Tensor B = torch::empty_like(A);
    switch (A.scalar_type()) {
    case torch::kFloat32: {
        pmpp::ops::cpu::launchPrefixSum(A.data_ptr<fp32_t>(),
                                        B.data_ptr<fp32_t>(),
                                        static_cast<size_t>(A.size(0)));
        break;
    }
    case torch::kInt32: {
        pmpp::ops::cpu::launchPrefixSum(A.data_ptr<int32_t>(),
                                        B.data_ptr<int32_t>(),
                                        static_cast<size_t>(A.size(0)));
        break;
    }
    default:
        AT_ERROR("Unsupported dtype: ", A.dtype());
    }

    return B;
}
} // namespace torch_impl
} // namespace pmpp::ops::cpu

csrc/lib/ops/torch_bind.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ TORCH_LIBRARY(pmpp, m)
1313
m.def("conv2d(Tensor input, Tensor kernel) -> Tensor");
1414
m.def("alphabet_histogram(Tensor input, int divider) -> Tensor");
1515
m.def("mul_reduction(Tensor input) -> Tensor");
16+
m.def("prefix_sum(Tensor input) -> Tensor");
1617
}
1718

1819
// Register the implementations.
@@ -27,6 +28,7 @@ TORCH_LIBRARY_IMPL(pmpp, CPU, m)
2728
m.impl("alphabet_histogram",
2829
&pmpp::ops::cpu::torch_impl::alphabetHistogram);
2930
m.impl("mul_reduction", &pmpp::ops::cpu::torch_impl::mulReduction);
31+
m.impl("prefix_sum", &pmpp::ops::cpu::torch_impl::prefixSum);
3032
}
3133

3234
TORCH_LIBRARY_IMPL(pmpp, CUDA, m)
@@ -38,4 +40,5 @@ TORCH_LIBRARY_IMPL(pmpp, CUDA, m)
3840
m.impl("alphabet_histogram",
3941
&pmpp::ops::cuda::torch_impl::alphabetHistogram);
4042
m.impl("mul_reduction", &pmpp::ops::cuda::torch_impl::mulReduction);
43+
m.impl("prefix_sum", &pmpp::ops::cuda::torch_impl::prefixSum);
4144
}

csrc/lib/ops/torch_impl.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#include "./cvtRGBtoGray/op.hpp"
99
#include "./matmul/op.cuh"
1010
#include "./matmul/op.hpp"
11+
#include "./prefixSum/op.cuh"
12+
#include "./prefixSum/op.hpp"
1113
#include "./reduction/op.cuh"
1214
#include "./reduction/op.hpp"
1315
#include "./vecAdd/op.cuh"

0 commit comments

Comments
 (0)