Skip to content

Commit 6fbfaaf

Browse files
committed
Update Unit Test for matmul kernel
1 parent e9eb453 commit 6fbfaaf

File tree

17 files changed

+226
-82
lines changed

17 files changed

+226
-82
lines changed

.clangd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ CompileFlags:
3737
- -arch=*
3838

3939
Diagnostics:
40-
UnusedIncludes: Strict
40+
UnusedIncludes: None
41+
MissingIncludes: None
4142

4243
ClangTidy:
4344
Add: [

.vscode/settings.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
"--header-insertion=never",
2929
"--header-insertion-decorators",
3030
"--background-index",
31-
"-j=8",
31+
// "-j=8",
3232
"--pch-storage=memory",
3333
"--function-arg-placeholders=false",
3434
],

csrc/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
cmake_minimum_required(VERSION 3.30)
33

44
# Project Name and version
5-
project(VSC-CMAKE-CXX-TEMPLATE VERSION 1.0.0)
5+
project(PMPP VERSION 1.0.0)
66

77
# Common utility functions
88
include(${PROJECT_SOURCE_DIR}/cmake/utils/logging.cmake)

csrc/cmake/libraries/libtorch.cmake

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
11
include(${CMAKE_CURRENT_LIST_DIR}/../utils/logging.cmake)
22
include(${CMAKE_CURRENT_LIST_DIR}/../utils/run-python.cmake)
33

4-
set(PY_RESULT)
5-
set(PY_OUTPUT)
6-
set(PY_ERROR)
7-
84
# @see "../utils/python.cmake"
95
run_python(
106
"import torch;print(torch.utils.cmake_prefix_path)"

csrc/include/pmpp/utils/math.hpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,21 @@
44

55
namespace pmpp
66
{
7-
template <typename T1, typename T2>
8-
requires std::is_integral_v<T1> && std::is_integral_v<T2>
7+
/**
8+
* @brief Calculate the ceiling of the division of two integers.
9+
*
10+
* @tparam T1 The type of the dividend.
11+
* @tparam T2 The type of the divisor.
12+
* @param a The dividend.
13+
* @param b The divisor.
14+
* @return The ceiling of the division of `a` by `b`.
15+
*
16+
* @bug I prefer to use concept for restricting T1 and T2 here, but clangd 18
17+
* seems not supporting concepts for cuda yet?
18+
*/
19+
template <typename T1, typename T2,
20+
typename = std::enable_if_t<std::is_integral_v<T1> &&
21+
std::is_integral_v<T2>>>
922
constexpr auto ceilDiv(T1 a, T2 b) -> T1
1023
{
1124
return T1((a + b - 1) / b);

csrc/lib/ops/cvtRGBtoGray/torch_impl.cpp

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,13 @@
1-
#include <ATen/TensorUtils.h>
2-
#include <ATen/ops/zero.h>
31
#include <cstdio>
42
#include <cuda_runtime_api.h>
53
#include <torch/torch.h>
64

5+
#include "../ops.hpp"
76
#include "../torch_impl.hpp"
8-
#include "pmpp/types/cxx_types.hpp"
97

10-
namespace pmpp::ops::cpu
8+
namespace pmpp::ops::cpu::torch_impl
119
{
12-
extern void launchCvtRGBtoGray(uint8_t* picOut, const uint8_t* picIn,
13-
uint32_t nRows, uint32_t nCols);
14-
namespace torch_impl
15-
{
16-
auto cvtRGBtoGrayImpl(const torch::Tensor& img) -> torch::Tensor
10+
auto cvtRGBtoGray(const torch::Tensor& img) -> torch::Tensor
1711
{
1812
TORCH_CHECK(img.scalar_type() == torch::kUInt8,
1913
"Expected in Tensor to have dtype = torch::kUInt8, but have: ",
@@ -32,16 +26,11 @@ auto cvtRGBtoGrayImpl(const torch::Tensor& img) -> torch::Tensor
3226

3327
return imgOut;
3428
}
35-
} // namespace torch_impl
36-
} // namespace pmpp::ops::cpu
29+
} // namespace pmpp::ops::cpu::torch_impl
3730

38-
namespace pmpp::ops::cuda
39-
{
40-
extern void launchCvtRGBtoGray(uint8_t* picOut, const uint8_t* picIn,
41-
uint32_t nRows, uint32_t nCols);
42-
namespace torch_impl
31+
namespace pmpp::ops::cuda::torch_impl
4332
{
44-
auto cvtRGBtoGrayImpl(const torch::Tensor& img) -> torch::Tensor
33+
auto cvtRGBtoGray(const torch::Tensor& img) -> torch::Tensor
4534
{
4635
TORCH_CHECK(img.scalar_type() == torch::kUInt8,
4736
"Expected in Tensor to have dtype = torch::kUInt8, but have: ",
@@ -58,5 +47,4 @@ auto cvtRGBtoGrayImpl(const torch::Tensor& img) -> torch::Tensor
5847
img.data_ptr<uint8_t>(), nRows, nCols);
5948
return imgOut;
6049
}
61-
} // namespace torch_impl
62-
} // namespace pmpp::ops::cuda
50+
} // namespace pmpp::ops::cuda::torch_impl

csrc/lib/ops/matmul/op.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#include "../ops.hpp"
2+
3+
namespace pmpp::ops::cpu
4+
{
5+
void launchMatmul(const fp32_t* A, const fp32_t* B, fp32_t* C, size_t m)
6+
{
7+
for (size_t i = 0; i < m; ++i) {
8+
for (size_t j = 0; j < m; ++j) {
9+
C[i * m + j] = 0;
10+
for (size_t k = 0; k < m; ++k) {
11+
C[i * m + j] += A[i * m + k] * B[k * m + j];
12+
}
13+
}
14+
}
15+
}
16+
} // namespace pmpp::ops::cpu

csrc/lib/ops/matmul/op.cu

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,55 @@
11
#include <cuda_runtime.h>
22

3-
#include "pmpp/types/cxx_types.hpp"
3+
#include "../ops.hpp"
4+
#include "pmpp/utils/math.hpp"
45

56
namespace pmpp::ops::cuda
67
{
78
/**
8-
* Assumes:
9-
* 1. M, N, P are square matrices of size width x width;
10-
* 2. Each thread computes one element;
9+
* @brief Matrix multiplication kernel
10+
*
11+
* @note 1. A, B, C are square matrices of size (m, m);
12+
* 2. Each thread computes 1 element of C and each block computes
13+
* (TILE_SIZE, TILE_SIZE) elements of C, which means block size should
14+
* be (TILE_SIZE, TILE_SIZE);
15+
* @todo Add boundary checks.
1116
*/
12-
template <int32_t TILE_SIZE = 16, typename ScalarT = fp32_t>
13-
__global__ void matMulKernel(ScalarT* M, ScalarT* N, ScalarT* P, int32_t Width)
17+
template <int32_t TILE_SIZE = 32, typename ScalarT = fp32_t>
18+
__global__ void matmulKernel(const ScalarT* A, const ScalarT* B, ScalarT* C,
19+
int32_t m)
1420
{
1521
__shared__ ScalarT Mds[TILE_SIZE][TILE_SIZE];
1622
__shared__ ScalarT Nds[TILE_SIZE][TILE_SIZE];
1723

18-
int32_t Row = blockIdx.y * TILE_SIZE + threadIdx.y;
19-
int32_t Col = blockIdx.x * TILE_SIZE + threadIdx.x;
24+
int32_t row = blockIdx.x * TILE_SIZE + threadIdx.x;
25+
int32_t col = blockIdx.y * TILE_SIZE + threadIdx.y;
2026

21-
fp32_t Pvalue = 0.0F;
22-
for (int32_t ph = 0; ph < Width / TILE_SIZE; ++ph) {
23-
Mds[threadIdx.y][threadIdx.x] =
24-
M[Row * Width + (ph * TILE_SIZE + threadIdx.x)];
25-
Nds[threadIdx.y][threadIdx.x] =
26-
N[(ph * TILE_SIZE + threadIdx.y) * Width + Col];
27+
ScalarT tmp = 0.0F;
28+
for (int32_t ph = 0; ph < m / TILE_SIZE; ++ph) {
29+
Mds[threadIdx.x][threadIdx.y] =
30+
A[row * m + (ph * TILE_SIZE + threadIdx.y)];
31+
Nds[threadIdx.x][threadIdx.y] =
32+
B[(ph * TILE_SIZE + threadIdx.x) * m + col];
2733
__syncthreads();
2834

2935
for (int32_t k = 0; k < TILE_SIZE; ++k) {
30-
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
36+
tmp += Mds[threadIdx.x][k] * Nds[k][threadIdx.y];
3137
}
3238
__syncthreads();
3339
}
3440

35-
P[Row * Width + Col] = Pvalue;
41+
C[row * m + col] = tmp;
42+
}
43+
44+
void launchMatmul(const fp32_t* dA, const fp32_t* dB, fp32_t* dC, size_t m)
45+
{
46+
constexpr uint32_t tileSize = 32;
47+
48+
dim3 blockSize = {tileSize, tileSize};
49+
dim3 gridSize = {uint32_t(ceilDiv(m, tileSize)),
50+
uint32_t(ceilDiv(m, tileSize))};
51+
52+
matmulKernel<tileSize, fp32_t>
53+
<<<gridSize, blockSize>>>(dA, dB, dC, int32_t(m));
3654
}
3755
} // namespace pmpp::ops::cuda

csrc/lib/ops/matmul/torch_impl.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#include "torch/torch.h"
2+
3+
#include "../ops.hpp"
4+
#include "../torch_impl.hpp"
5+
6+
namespace pmpp::ops::cpu::torch_impl
7+
{
8+
auto matmul(const torch::Tensor& A, const torch::Tensor& B) -> torch::Tensor
9+
{
10+
torch::Tensor C = torch::empty({A.size(0), B.size(1)}, A.options());
11+
12+
switch (A.scalar_type()) {
13+
case torch::kFloat32: {
14+
pmpp::ops::cpu::launchMatmul(A.data_ptr<fp32_t>(),
15+
B.data_ptr<fp32_t>(),
16+
C.data_ptr<fp32_t>(), A.size(0));
17+
break;
18+
}
19+
default:
20+
AT_ERROR("Unsupported dtype: ", A.dtype());
21+
}
22+
23+
return C;
24+
}
25+
} // namespace pmpp::ops::cpu::torch_impl
26+
27+
namespace pmpp::ops::cuda::torch_impl
28+
{
29+
auto matmul(const torch::Tensor& A, const torch::Tensor& B) -> torch::Tensor
30+
{
31+
torch::Tensor C = torch::empty({A.size(0), B.size(1)}, A.options());
32+
33+
switch (A.scalar_type()) {
34+
case torch::kFloat32: {
35+
pmpp::ops::cuda::launchMatmul(A.data_ptr<fp32_t>(),
36+
B.data_ptr<fp32_t>(),
37+
C.data_ptr<fp32_t>(), A.size(0));
38+
break;
39+
}
40+
default:
41+
AT_ERROR("Unsupported dtype: ", A.dtype());
42+
}
43+
44+
return C;
45+
}
46+
} // namespace pmpp::ops::cuda::torch_impl

csrc/lib/ops/ops.hpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#pragma once
2+
3+
#include "pmpp/types/cxx_types.hpp"
4+
5+
namespace pmpp::ops::cpu
6+
{
7+
8+
void launchVecAdd(const fp32_t* a, const fp32_t* b, fp32_t* c, size_t n);
9+
10+
void launchCvtRGBtoGray(uint8_t* picOut, const uint8_t* picIn, uint32_t nRows,
11+
uint32_t nCols);
12+
13+
void launchMatmul(const fp32_t* A, const fp32_t* B, fp32_t* C, size_t m);
14+
15+
} // namespace pmpp::ops::cpu
16+
17+
namespace pmpp::ops::cuda
18+
{
19+
20+
void launchVecAdd(const fp32_t* d_A, const fp32_t* d_B, fp32_t* d_C, size_t n);
21+
22+
void launchCvtRGBtoGray(uint8_t* picOut, const uint8_t* picIn, uint32_t nRows,
23+
uint32_t nCols);
24+
25+
void launchMatmul(const fp32_t* dA, const fp32_t* dB, fp32_t* dC, size_t m);
26+
27+
} // namespace pmpp::ops::cuda

0 commit comments

Comments
 (0)