Skip to content

Commit f6b9e8b

Browse files
author
root
committed
[FEATURE] Reduction Kernel with Multi Blocks (p.s. I am too lazy to implement thread coarsening...)
1 parent 1300f79 commit f6b9e8b

File tree

4 files changed

+31
-19
lines changed

4 files changed

+31
-19
lines changed

configs/lib-tests.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,8 @@ OpTest:
99
- nInputs: 2048
1010
divider: 4
1111
MulReduction:
12-
- nInputs: 32
12+
- nInputs: 32
13+
# # [NOTE] The following calculation results would be wrong?
14+
# - nInputs: 64
15+
# - nInputs: 128
16+
# - nInputs: 512

csrc/include/pmpp/utils/common.cuh

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@
1616
do { \
1717
cudaError_t err_ = (err); \
1818
if (err_ != cudaSuccess) { \
19-
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\"", \
20-
__FILE__, __LINE__, err, cudaGetErrorString(err_), \
21-
#err); \
22-
cudaDeviceReset(); \
23-
throw std::runtime_error("CUDA error"); \
19+
::fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\"", \
20+
__FILE__, __LINE__, err, \
21+
::cudaGetErrorString(err_), #err); \
22+
::cudaDeviceReset(); \
23+
throw ::std::runtime_error("CUDA error"); \
2424
} \
2525
} while (0)
2626
#endif
@@ -52,4 +52,5 @@ __host__ __device__ void initMemory(T* ptr, size_t n, const T& val)
5252
ptr[i] = val;
5353
}
5454
}
55+
5556
} // namespace pmpp::cuda

csrc/include/pmpp/utils/math.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ namespace pmpp
1515
*/
1616
template <typename T1, typename T2>
1717
requires std::is_integral_v<T1> && std::is_integral_v<T2>
18-
constexpr auto ceilDiv(T1 a, T2 b) -> T1
18+
[[nodiscard]] constexpr auto ceilDiv(T1 a, T2 b) -> T1
1919
{
2020
return T1((a + b - 1) / b);
2121
}

csrc/lib/ops/reduction/op.cuh

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,36 +11,43 @@ template <typename ScalarT, typename PredT>
1111
__global__ void reductionKernel(const ScalarT* in, uint32_t n, ScalarT* out,
1212
const PredT& pred)
1313
{
14-
// Thread index in the block
15-
uint32_t bTid = threadIdx.x;
14+
uint32_t stride = blockDim.x;
15+
uint32_t segmentId = blockIdx.x;
16+
uint32_t segmentSize = 2 * stride;
17+
// Block thread index
18+
uint32_t bTidx = threadIdx.x;
19+
// Global thread index
20+
uint32_t gTidx = segmentId * segmentSize + bTidx;
21+
1622
extern __shared__ ScalarT shmem[];
1723

18-
uint32_t stride = blockDim.x;
19-
shmem[bTid] = pred(in[bTid], in[bTid + stride]);
24+
shmem[bTidx] = pred(in[gTidx], in[gTidx + stride]);
2025
stride /= 2;
2126

2227
for (; stride >= 1; stride /= 2) {
2328
__syncthreads();
24-
if (bTid < stride) {
25-
shmem[bTid] = pred(shmem[bTid], shmem[bTid + stride]);
29+
if (bTidx < stride) {
30+
shmem[bTidx] = pred(shmem[bTidx], shmem[bTidx + stride]);
2631
}
2732
}
28-
if (bTid == 0) {
29-
out[0] = shmem[0];
33+
if (bTidx == 0) {
34+
atomicAdd(out, shmem[0]);
3035
}
3136
}
3237

3338
template <typename ScalarT, typename PredT>
3439
[[nodiscard]] auto launchReduction(const ScalarT* in, uint32_t n,
3540
const PredT& pred) -> ScalarT
3641
{
42+
constexpr uint32_t MAX_BLOCK_THREADS = 1024;
43+
3744
ScalarT* d_out;
3845
cudaMalloc(&d_out, 1 * sizeof(ScalarT));
3946

40-
uint32_t nTreads = n / 2;
41-
dim3 blockDim = {nTreads, 1, 1};
42-
dim3 gridDim = {1, 1, 1};
43-
uint32_t shmemSize = blockDim.x * sizeof(ScalarT);
47+
uint32_t stride = std::min(n / 2, MAX_BLOCK_THREADS);
48+
dim3 blockDim = {stride, 1, 1};
49+
dim3 gridDim = {ceilDiv(n, stride * 2), 1, 1};
50+
uint32_t shmemSize = stride * sizeof(ScalarT);
4451

4552
reductionKernel<<<gridDim, blockDim, shmemSize>>>(in, n, d_out, pred);
4653

0 commit comments

Comments
 (0)