Skip to content

Commit 268364d

Browse files
author
root
committed
Update Conv2D
1 parent 5d7b8c1 commit 268364d

File tree

15 files changed

+77
-61
lines changed

15 files changed

+77
-61
lines changed

configs/ctests.yml

Lines changed: 0 additions & 8 deletions
This file was deleted.

configs/lib-tests.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
OpTest:
2+
Conv2D:
3+
- inputHeight: 32
4+
inputWidth: 32
5+
kernelSize: 3

csrc/cmake/config.cmake.in/pmpp-torch-ops-config.cmake.in renamed to csrc/cmake/config.cmake.in/PmppTorchOps-config.cmake.in

File renamed without changes.

csrc/include/pmpp/utils/address.hpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,19 @@ namespace pmpp
1515
/**
1616
* @brief Compute the offset of a multi-dimensional array.
1717
*
18-
* @param args First half is the indexes, second half is the size of each
18+
* @param args First half is the indices, second half is the size of each
1919
* dimension.
2020
* @return std::uint32_t The offset of the multi-dimensional array.
2121
*
22-
* @example computeOffset(1, 2, 3, 4, 5, 6) -> 3*1 + 2*6 + 1*6*5 = 45
22+
* @example
23+
* 1. To calculate the offset of idx (2, 1) in a 2D array of dim (4, 3):
24+
* > offset(2, 1, 4, 3) -> 1*1 + 2*3 = 7
25+
* 2. To calculate the offset of idx (1, 2, 3) in a 3D array of dim
26+
* (4, 5, 6):
27+
* > offset(1, 2, 3, 4, 5, 6) -> 3*1 + 2*6 + 1*6*5 = 45
2328
*/
2429
template <typename OffsetT, typename... ArgsT>
25-
[[nodiscard]] constexpr auto computeOffset(ArgsT... args) -> OffsetT
30+
[[nodiscard]] constexpr auto offset(ArgsT... args) -> OffsetT
2631
{
2732
constexpr std::size_t nArgs = sizeof...(ArgsT);
2833
constexpr std::size_t nDims = nArgs / 2;

csrc/lib/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
set(LIB_NAME "pmpp-torch-ops")
1+
set(LIB_NAME "PmppTorchOps")
22

33
file(GLOB_RECURSE CXX_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
44
file(GLOB_RECURSE CUDA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)

csrc/lib/ops/conv2d/op.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,29 +5,29 @@ namespace pmpp::ops::cpu
55

66
template <>
77
void launchConv2d<fp32_t>(const fp32_t* input, const fp32_t* kernel,
8-
fp32_t* output, int32_t inputHeight,
9-
int32_t inputWidth, int32_t kernelSize)
8+
fp32_t* output, int32_t inHeight, int32_t inWidth,
9+
int32_t kernelSize)
1010
{
11-
for (int32_t i = 0; i < inputHeight; ++i) {
12-
for (int32_t j = 0; j < inputWidth; ++j) {
11+
for (int32_t i = 0; i < inHeight; ++i) {
12+
for (int32_t j = 0; j < inWidth; ++j) {
1313
fp32_t tmp = 0;
1414
int32_t startRow = i - kernelSize / 2 < 0 ? 0 : i - kernelSize / 2;
1515
int32_t startCol = j - kernelSize / 2 < 0 ? 0 : j - kernelSize / 2;
16-
int32_t endRow = i + kernelSize / 2 >= inputHeight
17-
? inputHeight - 1
16+
int32_t endRow = i + kernelSize / 2 >= inHeight
17+
? inHeight - 1
1818
: i + kernelSize / 2;
19-
int32_t endCol = j + kernelSize / 2 >= inputWidth
20-
? inputWidth - 1
19+
int32_t endCol = j + kernelSize / 2 >= inWidth
20+
? inWidth - 1
2121
: j + kernelSize / 2;
2222

2323
for (int32_t k = startRow; k <= endRow; ++k) {
2424
for (int32_t l = startCol; l <= endCol; ++l) {
25-
tmp += input[k * inputWidth + l] *
25+
tmp += input[k * inWidth + l] *
2626
kernel[(k - i + kernelSize / 2) * kernelSize +
2727
(l - j + kernelSize / 2)];
2828
}
2929
}
30-
output[i * inputWidth + j] = tmp;
30+
output[i * inWidth + j] = tmp;
3131
}
3232
}
3333
}

csrc/lib/ops/conv2d/op.cu

Lines changed: 41 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,47 +9,60 @@ namespace pmpp::ops::cuda
99
{
1010

1111
constexpr int32_t MAX_CONV2D_KERNEL_SIZE = 9;
12-
__constant__ fp32_t
13-
CONV2D_KERNEL[MAX_CONV2D_KERNEL_SIZE * MAX_CONV2D_KERNEL_SIZE];
12+
__constant__ fp32_t KERNEL[MAX_CONV2D_KERNEL_SIZE * MAX_CONV2D_KERNEL_SIZE];
1413

15-
template <typename ScalarT, uint32_t IN_TILE_SIZE = 32>
14+
template <typename ScalarT, uint32_t TILE_SIZE = 32>
1615
__global__ void conv2DKernel(const ScalarT* input, const ScalarT* kernel,
17-
ScalarT* output, int32_t inHeight,
18-
int32_t inWidth, int32_t kernelSize)
16+
ScalarT* output, int32_t nRows, int32_t nCols,
17+
int32_t kernelSize)
1918
{
20-
uint32_t OUT_TILE_SIZE = IN_TILE_SIZE - kernelSize / 2 * 2;
19+
// Each block computes (TILE_SIZE, TILE_SIZE) output elements
20+
// Each block contains (TILE_SIZE, TILE_SIZE) threads
21+
// TILE_SIZE must be equal to blockDim.x and blockDim.y
2122

22-
int32_t outRow = blockIdx.x * OUT_TILE_SIZE + threadIdx.x - kernelSize / 2;
23-
int32_t outCol = blockIdx.y * OUT_TILE_SIZE + threadIdx.y - kernelSize / 2;
23+
// Current thread computes element at output[outRow, outCol]
24+
int32_t outRow = blockIdx.x * TILE_SIZE + threadIdx.x;
25+
int32_t outCol = blockIdx.y * TILE_SIZE + threadIdx.y;
2426

25-
// [NOTE] IN_TILE_SIZE must equal to blockDim.x and blockDim.y
26-
__shared__ ScalarT inTile[IN_TILE_SIZE][IN_TILE_SIZE];
27-
28-
if (outRow >= 0 && outRow < inHeight && outCol >= 0 && outCol < inWidth) {
27+
__shared__ ScalarT inTile[TILE_SIZE][TILE_SIZE];
28+
// Load input tile into shared memory
29+
if (outRow < nRows && outCol < nCols) {
2930
inTile[threadIdx.x][threadIdx.y] =
30-
input[computeOffset<uint32_t>(outRow, outCol, inWidth, inHeight)];
31+
input[offset<uint32_t>(outRow, outCol, nRows, nCols)];
3132
} else {
3233
inTile[threadIdx.x][threadIdx.y] = 0.0;
3334
}
3435
__syncthreads();
3536

36-
int32_t outTileRow = threadIdx.x - kernelSize / 2;
37-
int32_t outTileCol = threadIdx.y - kernelSize / 2;
38-
39-
if (outRow >= 0 && outRow < inHeight && outCol >= 0 && outCol < inWidth) {
40-
if (outTileRow >= 0 && outTileRow < OUT_TILE_SIZE && outTileCol >= 0 &&
41-
outTileCol < OUT_TILE_SIZE) {
42-
ScalarT tmp = 0;
43-
for (int32_t kRow = 0; kRow < kernelSize; ++kRow) {
44-
for (int32_t kCol = 0; kCol < kernelSize; ++kCol) {
45-
tmp += CONV2D_KERNEL[computeOffset<uint32_t>(
46-
kRow, kCol, kernelSize, kernelSize)] *
47-
inTile[kRow + outTileRow][kCol + outTileCol];
37+
if (outRow < nRows && outCol < nCols) {
38+
ScalarT tmp = 0;
39+
// To compute one output element, each thread needs to loop over the
40+
// kernel:
41+
for (int32_t kRow = 0; kRow < kernelSize; ++kRow) {
42+
for (int32_t kCol = 0; kCol < kernelSize; ++kCol) {
43+
// Relative kernel index in the input tile
44+
int32_t rkInRow = threadIdx.x - kernelSize / 2 + kRow;
45+
int32_t rkInCol = threadIdx.y - kernelSize / 2 + kCol;
46+
if (rkInRow >= 0 && rkInRow < TILE_SIZE && rkInCol >= 0 &&
47+
rkInCol < TILE_SIZE) {
48+
tmp += inTile[rkInRow][rkInCol] *
49+
KERNEL[offset<uint32_t>(kRow, kCol, kernelSize,
50+
kernelSize)];
51+
} else {
52+
// Boundary
53+
int32_t inRow = outRow - kernelSize / 2 + kRow;
54+
int32_t inCol = outCol - kernelSize / 2 + kCol;
55+
if (inRow >= 0 && inRow < nRows && inCol >= 0 &&
56+
inCol < nCols) {
57+
tmp += input[offset<uint32_t>(inRow, inCol, nRows,
58+
nCols)] *
59+
KERNEL[offset<uint32_t>(kRow, kCol, kernelSize,
60+
kernelSize)];
61+
}
4862
}
4963
}
50-
output[computeOffset<uint32_t>(outRow, outCol, inWidth, inWidth)] =
51-
tmp;
5264
}
65+
output[offset<uint32_t>(outRow, outCol, nRows, nCols)] = tmp;
5366
}
5467
}
5568

@@ -62,7 +75,7 @@ void launchConv2d<fp32_t>(const fp32_t* d_input, const fp32_t* d_kernel,
6275
throw std::runtime_error("Kernel size is too large");
6376
}
6477

65-
cudaMemcpyToSymbol(CONV2D_KERNEL, d_kernel,
78+
cudaMemcpyToSymbol(KERNEL, d_kernel,
6679
kernelSize * kernelSize * sizeof(fp32_t));
6780

6881
dim3 blockDim = {32, 32, 1};

csrc/lib/ops/conv2d/torch_impl.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
namespace pmpp::ops::cpu::torch_impl
77
{
8-
auto conv2D(const torch::Tensor& input, const torch::Tensor& kernel)
8+
auto conv2d(const torch::Tensor& input, const torch::Tensor& kernel)
99
-> torch::Tensor
1010
{
1111
TORCH_CHECK(input.scalar_type() == kernel.scalar_type(),
@@ -39,7 +39,7 @@ auto conv2D(const torch::Tensor& input, const torch::Tensor& kernel)
3939

4040
namespace pmpp::ops::cuda::torch_impl
4141
{
42-
auto conv2D(const torch::Tensor& input, const torch::Tensor& kernel)
42+
auto conv2d(const torch::Tensor& input, const torch::Tensor& kernel)
4343
-> torch::Tensor
4444
{
4545
TORCH_CHECK(input.scalar_type() == kernel.scalar_type(),

csrc/lib/ops/cvtRGBtoGray/op.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ __global__ void cvtRGBtoGrayKernel(uint8_t* outImg, const uint8_t* inImg,
2121
return;
2222
}
2323

24-
auto grayOffset = computeOffset<uint32_t>(row, col, height, width);
24+
auto grayOffset = offset<uint32_t>(row, col, height, width);
2525
uint32_t rgbOffset = grayOffset * N_CHANNELS;
2626

2727
uint8_t r = inImg[rgbOffset];

csrc/lib/ops/torch_bind.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ TORCH_LIBRARY(pmpp, m)
1010
m.def("vector_add(Tensor a, Tensor b) -> Tensor");
1111
m.def("cvt_rgb_to_gray(Tensor img) -> Tensor");
1212
m.def("matmul(Tensor A, Tensor B) -> Tensor");
13-
m.def("conv2D(Tensor input, Tensor kernel) -> Tensor");
13+
m.def("conv2d(Tensor input, Tensor kernel) -> Tensor");
1414
}
1515

1616
// Register the implementations.
@@ -21,13 +21,13 @@ TORCH_LIBRARY_IMPL(pmpp, CPU, m)
2121
m.impl("vector_add", &pmpp::ops::cpu::torch_impl::vectorAdd);
2222
m.impl("cvt_rgb_to_gray", &pmpp::ops::cpu::torch_impl::cvtRGBtoGray);
2323
m.impl("matmul", &pmpp::ops::cpu::torch_impl::matmul);
24-
m.impl("conv2D", &pmpp::ops::cpu::torch_impl::conv2D);
24+
m.impl("conv2d", &pmpp::ops::cpu::torch_impl::conv2d);
2525
}
2626

2727
TORCH_LIBRARY_IMPL(pmpp, CUDA, m)
2828
{
2929
m.impl("vector_add", &pmpp::ops::cuda::torch_impl::vectorAdd);
3030
m.impl("cvt_rgb_to_gray", &pmpp::ops::cuda::torch_impl::cvtRGBtoGray);
3131
m.impl("matmul", &pmpp::ops::cuda::torch_impl::matmul);
32-
m.impl("conv2D", &pmpp::ops::cuda::torch_impl::conv2D);
32+
m.impl("conv2d", &pmpp::ops::cuda::torch_impl::conv2d);
3333
}

0 commit comments

Comments
 (0)