Update Unit Test for matmul kernel

jamesnulliu · jamesnulliu · commit 6fbfaaffc81f · 2025-01-27T19:37:40.000+08:00
diff --git a/.clangd b/.clangd
@@ -37,7 +37,8 @@ CompileFlags:
     - -arch=*
 
 Diagnostics:
-  UnusedIncludes: Strict
+  UnusedIncludes: None
+  MissingIncludes: None
 
   ClangTidy:
     Add: [
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -28,7 +28,7 @@
         "--header-insertion=never",
         "--header-insertion-decorators",
         "--background-index",
-        "-j=8",
+        // "-j=8",
         "--pch-storage=memory",
         "--function-arg-placeholders=false",
     ],
diff --git a/csrc/CMakeLists.txt b/csrc/CMakeLists.txt
@@ -2,7 +2,7 @@
 cmake_minimum_required(VERSION 3.30)
 
 # Project Name and version
-project(VSC-CMAKE-CXX-TEMPLATE VERSION 1.0.0)
+project(PMPP VERSION 1.0.0)
 
 # Common utility functions
 include(${PROJECT_SOURCE_DIR}/cmake/utils/logging.cmake)
diff --git a/csrc/cmake/libraries/libtorch.cmake b/csrc/cmake/libraries/libtorch.cmake
@@ -1,10 +1,6 @@
 include(${CMAKE_CURRENT_LIST_DIR}/../utils/logging.cmake)
 include(${CMAKE_CURRENT_LIST_DIR}/../utils/run-python.cmake)
 
-set(PY_RESULT)
-set(PY_OUTPUT)
-set(PY_ERROR)
-
 # @see "../utils/python.cmake"
 run_python(
     "import torch;print(torch.utils.cmake_prefix_path)"
diff --git a/csrc/include/pmpp/utils/math.hpp b/csrc/include/pmpp/utils/math.hpp
@@ -4,8 +4,21 @@
 
 namespace pmpp
 {
-template <typename T1, typename T2>
-    requires std::is_integral_v<T1> && std::is_integral_v<T2>
+/**
+ * @brief Calculate the ceiling of the division of two integers.
+ *
+ * @tparam T1 The type of the dividend.
+ * @tparam T2 The type of the divisor.
+ * @param a The dividend.
+ * @param b The divisor.
+ * @return The ceiling of the division of `a` by `b`.
+ *
+ * @bug I would prefer to constrain T1 and T2 with concepts here, but clangd 18
+ *      does not seem to support concepts in CUDA code yet.
+ */
+template <typename T1, typename T2,
+          typename = std::enable_if_t<std::is_integral_v<T1> &&
+                                      std::is_integral_v<T2>>>
 constexpr auto ceilDiv(T1 a, T2 b) -> T1
 {
     return T1((a + b - 1) / b);
diff --git a/csrc/lib/ops/cvtRGBtoGray/torch_impl.cpp b/csrc/lib/ops/cvtRGBtoGray/torch_impl.cpp
@@ -1,19 +1,13 @@
-#include <ATen/TensorUtils.h>
-#include <ATen/ops/zero.h>
 #include <cstdio>
 #include <cuda_runtime_api.h>
 #include <torch/torch.h>
 
+#include "../ops.hpp"
 #include "../torch_impl.hpp"
-#include "pmpp/types/cxx_types.hpp"
 
-namespace pmpp::ops::cpu
+namespace pmpp::ops::cpu::torch_impl
 {
-extern void launchCvtRGBtoGray(uint8_t* picOut, const uint8_t* picIn,
-                               uint32_t nRows, uint32_t nCols);
-namespace torch_impl
-{
-auto cvtRGBtoGrayImpl(const torch::Tensor& img) -> torch::Tensor
+auto cvtRGBtoGray(const torch::Tensor& img) -> torch::Tensor
 {
     TORCH_CHECK(img.scalar_type() == torch::kUInt8,
                 "Expected in Tensor to have dtype = torch::kUInt8, but have: ",
@@ -32,16 +26,11 @@ auto cvtRGBtoGrayImpl(const torch::Tensor& img) -> torch::Tensor
 
     return imgOut;
 }
-}  // namespace torch_impl
-}  // namespace pmpp::ops::cpu
+}  // namespace pmpp::ops::cpu::torch_impl
 
-namespace pmpp::ops::cuda
-{
-extern void launchCvtRGBtoGray(uint8_t* picOut, const uint8_t* picIn,
-                               uint32_t nRows, uint32_t nCols);
-namespace torch_impl
+namespace pmpp::ops::cuda::torch_impl
 {
-auto cvtRGBtoGrayImpl(const torch::Tensor& img) -> torch::Tensor
+auto cvtRGBtoGray(const torch::Tensor& img) -> torch::Tensor
 {
     TORCH_CHECK(img.scalar_type() == torch::kUInt8,
                 "Expected in Tensor to have dtype = torch::kUInt8, but have: ",
@@ -58,5 +47,4 @@ auto cvtRGBtoGrayImpl(const torch::Tensor& img) -> torch::Tensor
                                         img.data_ptr<uint8_t>(), nRows, nCols);
     return imgOut;
 }
-}  // namespace torch_impl
-}  // namespace pmpp::ops::cuda
+}  // namespace pmpp::ops::cuda::torch_impl
diff --git a/csrc/lib/ops/matmul/op.cpp b/csrc/lib/ops/matmul/op.cpp
@@ -0,0 +1,28 @@
+#include "../ops.hpp"
+
+namespace pmpp::ops::cpu
+{
+/**
+ * @brief Naive triple-loop matrix multiplication on the host: C = A * B.
+ *
+ * @param A Left operand, read as an (m, m) row-major array.
+ * @param B Right operand, read as an (m, m) row-major array.
+ * @param C Output, written as an (m, m) row-major array.
+ * @param m Number of rows (and columns) of each matrix.
+ */
+void launchMatmul(const fp32_t* A, const fp32_t* B, fp32_t* C, size_t m)
+{
+    for (size_t row = 0; row < m; ++row) {
+        const fp32_t* aRow = A + row * m;
+        fp32_t* cRow = C + row * m;
+        for (size_t col = 0; col < m; ++col) {
+            // Accumulate straight into the output element.
+            fp32_t& out = cRow[col];
+            out = 0;
+            for (size_t k = 0; k < m; ++k) {
+                out += aRow[k] * B[k * m + col];
+            }
+        }
+    }
+}
+}  // namespace pmpp::ops::cpu
diff --git a/csrc/lib/ops/matmul/op.cu b/csrc/lib/ops/matmul/op.cu
@@ -1,37 +1,83 @@
 #include <cuda_runtime.h>
 
-#include "pmpp/types/cxx_types.hpp"
+#include "../ops.hpp"
+#include "pmpp/utils/math.hpp"
 
 namespace pmpp::ops::cuda
 {
 /**
- * Assumes:
- * 1. M, N, P are square matrices of size width x width;
- * 2. Each thread computes one element;
+ * @brief Matrix multiplication kernel
+ *
+ * @note 1. A, B, C are square matrices of size (m, m);
+ *       2. Each thread computes 1 element of C and each block computes
+ *          (TILE_SIZE, TILE_SIZE) elements of C, which means block size should
+ *          be (TILE_SIZE, TILE_SIZE);
+ *       3. m need not be a multiple of TILE_SIZE: tile entries that fall
+ *          outside the matrices are loaded as zero, and threads that map
+ *          outside C skip the final store.
  */
-template <int32_t TILE_SIZE = 16, typename ScalarT = fp32_t>
-__global__ void matMulKernel(ScalarT* M, ScalarT* N, ScalarT* P, int32_t Width)
+template <int32_t TILE_SIZE = 32, typename ScalarT = fp32_t>
+__global__ void matmulKernel(const ScalarT* A, const ScalarT* B, ScalarT* C,
+                             int32_t m)
 {
     __shared__ ScalarT Mds[TILE_SIZE][TILE_SIZE];
     __shared__ ScalarT Nds[TILE_SIZE][TILE_SIZE];
 
-    int32_t Row = blockIdx.y * TILE_SIZE + threadIdx.y;
-    int32_t Col = blockIdx.x * TILE_SIZE + threadIdx.x;
+    int32_t row = blockIdx.x * TILE_SIZE + threadIdx.x;
+    int32_t col = blockIdx.y * TILE_SIZE + threadIdx.y;
 
-    fp32_t Pvalue = 0.0F;
-    for (int32_t ph = 0; ph < Width / TILE_SIZE; ++ph) {
-        Mds[threadIdx.y][threadIdx.x] =
-            M[Row * Width + (ph * TILE_SIZE + threadIdx.x)];
-        Nds[threadIdx.y][threadIdx.x] =
-            N[(ph * TILE_SIZE + threadIdx.y) * Width + Col];
+    // Round the phase count up so the trailing partial tile is processed too.
+    const int32_t nPhases = (m + TILE_SIZE - 1) / TILE_SIZE;
+
+    ScalarT tmp = 0.0F;
+    for (int32_t ph = 0; ph < nPhases; ++ph) {
+        const int32_t aCol = ph * TILE_SIZE + threadIdx.y;
+        const int32_t bRow = ph * TILE_SIZE + threadIdx.x;
+
+        // Zero-fill out-of-range entries; they contribute nothing to the sum.
+        // No thread returns early, so every thread reaches both barriers.
+        Mds[threadIdx.x][threadIdx.y] =
+            (row < m && aCol < m) ? A[row * m + aCol] : ScalarT(0);
+        Nds[threadIdx.x][threadIdx.y] =
+            (bRow < m && col < m) ? B[bRow * m + col] : ScalarT(0);
         __syncthreads();
 
         for (int32_t k = 0; k < TILE_SIZE; ++k) {
-            Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
+            tmp += Mds[threadIdx.x][k] * Nds[k][threadIdx.y];
         }
         __syncthreads();
     }
 
-    P[Row * Width + Col] = Pvalue;
+    // Threads of edge blocks that map outside C must not write.
+    if (row < m && col < m) {
+        C[row * m + col] = tmp;
+    }
+}
+
+/**
+ * @brief Launch matmulKernel to compute C = A * B for (m, m) matrices.
+ *
+ * @param dA Left operand; passed straight to the kernel.
+ * @param dB Right operand; passed straight to the kernel.
+ * @param dC Output; passed straight to the kernel.
+ * @param m Number of rows (and columns) of each matrix.
+ *
+ * @note The launch status is not checked here.
+ */
+void launchMatmul(const fp32_t* dA, const fp32_t* dB, fp32_t* dC, size_t m)
+{
+    constexpr uint32_t tileSize = 32;
+
+    // A grid with a zero dimension is an invalid launch configuration.
+    if (m == 0) {
+        return;
+    }
+
+    dim3 blockSize = {tileSize, tileSize};
+    dim3 gridSize = {uint32_t(ceilDiv(m, tileSize)),
+                     uint32_t(ceilDiv(m, tileSize))};
+
+    matmulKernel<tileSize, fp32_t>
+        <<<gridSize, blockSize>>>(dA, dB, dC, int32_t(m));
 }
 }  // namespace pmpp::ops::cuda
diff --git a/csrc/lib/ops/matmul/torch_impl.cpp b/csrc/lib/ops/matmul/torch_impl.cpp
@@ -0,0 +1,98 @@
+#include "torch/torch.h"
+
+#include "../ops.hpp"
+#include "../torch_impl.hpp"
+
+namespace
+{
+/**
+ * @brief Validate the operands shared by the CPU and CUDA matmul wrappers.
+ *
+ * The launchers take a single size `m` and index A, B and C as (m, m)
+ * arrays, so any other shape would make them read or write out of bounds.
+ *
+ * @param A Left operand.
+ * @param B Right operand.
+ */
+void checkMatmulInputs(const torch::Tensor& A, const torch::Tensor& B)
+{
+    TORCH_CHECK(A.dim() == 2 && B.dim() == 2,
+                "Expected A and B to be 2D tensors, but got ", A.dim(),
+                "D and ", B.dim(), "D.");
+    TORCH_CHECK(A.size(0) == A.size(1) && A.sizes().equals(B.sizes()),
+                "Expected A and B to be square matrices of the same size, ",
+                "but got ", A.sizes(), " and ", B.sizes(), ".");
+    TORCH_CHECK(A.scalar_type() == B.scalar_type(),
+                "Expected A and B to have the same dtype, but got ",
+                A.dtype(), " and ", B.dtype(), ".");
+    TORCH_CHECK(A.device() == B.device(),
+                "Expected A and B to be on the same device, but got ",
+                A.device(), " and ", B.device(), ".");
+}
+}  // namespace
+
+namespace pmpp::ops::cpu::torch_impl
+{
+/**
+ * @brief CPU implementation of the `matmul` operator.
+ *
+ * @param A Left operand, a square float32 matrix.
+ * @param B Right operand, same shape, dtype and device as A.
+ * @return A new tensor holding A * B.
+ */
+auto matmul(const torch::Tensor& A, const torch::Tensor& B) -> torch::Tensor
+{
+    checkMatmulInputs(A, B);
+
+    // The launcher indexes raw pointers as dense row-major storage.
+    const torch::Tensor Ac = A.contiguous();
+    const torch::Tensor Bc = B.contiguous();
+    torch::Tensor C = torch::empty({A.size(0), B.size(1)}, A.options());
+
+    switch (A.scalar_type()) {
+    case torch::kFloat32: {
+        pmpp::ops::cpu::launchMatmul(Ac.data_ptr<fp32_t>(),
+                                     Bc.data_ptr<fp32_t>(),
+                                     C.data_ptr<fp32_t>(), A.size(0));
+        break;
+    }
+    default:
+        TORCH_CHECK(false, "Unsupported dtype: ", A.dtype());
+    }
+
+    return C;
+}
+}  // namespace pmpp::ops::cpu::torch_impl
+
+namespace pmpp::ops::cuda::torch_impl
+{
+/**
+ * @brief CUDA implementation of the `matmul` operator.
+ *
+ * @param A Left operand, a square float32 matrix.
+ * @param B Right operand, same shape, dtype and device as A.
+ * @return A new tensor holding A * B.
+ */
+auto matmul(const torch::Tensor& A, const torch::Tensor& B) -> torch::Tensor
+{
+    checkMatmulInputs(A, B);
+
+    // The launcher indexes raw pointers as dense row-major storage.
+    const torch::Tensor Ac = A.contiguous();
+    const torch::Tensor Bc = B.contiguous();
+    torch::Tensor C = torch::empty({A.size(0), B.size(1)}, A.options());
+
+    switch (A.scalar_type()) {
+    case torch::kFloat32: {
+        pmpp::ops::cuda::launchMatmul(Ac.data_ptr<fp32_t>(),
+                                      Bc.data_ptr<fp32_t>(),
+                                      C.data_ptr<fp32_t>(), A.size(0));
+        break;
+    }
+    default:
+        TORCH_CHECK(false, "Unsupported dtype: ", A.dtype());
+    }
+
+    return C;
+}
+}  // namespace pmpp::ops::cuda::torch_impl
diff --git a/csrc/lib/ops/ops.hpp b/csrc/lib/ops/ops.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "pmpp/types/cxx_types.hpp"
+
+/** @brief Declarations of the CPU operator entry points. */
+namespace pmpp::ops::cpu
+{
+
+/** @brief Vector-add entry point; presumably c = a + b over n elements. */
+void launchVecAdd(const fp32_t* a, const fp32_t* b, fp32_t* c, size_t n);
+
+/** @brief RGB-to-gray entry point; presumably an nRows x nCols image. */
+void launchCvtRGBtoGray(uint8_t* picOut, const uint8_t* picIn, uint32_t nRows,
+                        uint32_t nCols);
+
+/** @brief Square matmul C = A * B; A, B and C are indexed as (m, m). */
+void launchMatmul(const fp32_t* A, const fp32_t* B, fp32_t* C, size_t m);
+
+}  // namespace pmpp::ops::cpu
+
+/** @brief Declarations of the CUDA operator entry points. */
+namespace pmpp::ops::cuda
+{
+
+/** @brief Vector-add launcher; presumably d_C = d_A + d_B over n elements. */
+void launchVecAdd(const fp32_t* d_A, const fp32_t* d_B, fp32_t* d_C, size_t n);
+
+/** @brief RGB-to-gray launcher; presumably an nRows x nCols image. */
+void launchCvtRGBtoGray(uint8_t* picOut, const uint8_t* picIn, uint32_t nRows,
+                        uint32_t nCols);
+
+/** @brief Square matmul dC = dA * dB; the pointers are passed to a kernel. */
+void launchMatmul(const fp32_t* dA, const fp32_t* dB, fp32_t* dC, size_t m);
+
+}  // namespace pmpp::ops::cuda
diff --git a/csrc/lib/ops/torch_bind.cpp b/csrc/lib/ops/torch_bind.cpp
@@ -9,19 +9,24 @@ TORCH_LIBRARY(pmpp, m)
 {
     m.def("vector_add(Tensor a, Tensor b) -> Tensor");
     m.def("cvt_rgb_to_gray(Tensor img) -> Tensor");
+    m.def("matmul(Tensor A, Tensor B) -> Tensor");
 }
 
-// Register the implementation.
+// Register the implementations.
 // @see
 //   https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit?tab=t.0#heading=h.jc288bcufw9a
 TORCH_LIBRARY_IMPL(pmpp, CPU, m)
 {
-    m.impl("vector_add", &pmpp::ops::cpu::torch_impl::vectorAddImpl);
-    m.impl("cvt_rgb_to_gray", &pmpp::ops::cpu::torch_impl::cvtRGBtoGrayImpl);
+    m.impl("vector_add", &pmpp::ops::cpu::torch_impl::vectorAdd);
+    m.impl("cvt_rgb_to_gray", &pmpp::ops::cpu::torch_impl::cvtRGBtoGray);
+    m.impl("matmul", &pmpp::ops::cpu::torch_impl::matmul);
 }
 
+// Same three operators for the CUDA dispatch key; each name corresponds to a
+// schema declared with m.def() in TORCH_LIBRARY above.
 TORCH_LIBRARY_IMPL(pmpp, CUDA, m)
 {
-    m.impl("vector_add", &pmpp::ops::cuda::torch_impl::vectorAddImpl);
-    m.impl("cvt_rgb_to_gray", &pmpp::ops::cuda::torch_impl::cvtRGBtoGrayImpl);
+    m.impl("vector_add", &pmpp::ops::cuda::torch_impl::vectorAdd);
+    m.impl("cvt_rgb_to_gray", &pmpp::ops::cuda::torch_impl::cvtRGBtoGray);
+    m.impl("matmul", &pmpp::ops::cuda::torch_impl::matmul);
 }
diff --git a/csrc/lib/ops/torch_impl.hpp b/csrc/lib/ops/torch_impl.hpp
@@ -4,14 +4,30 @@
 
 namespace pmpp::ops::cpu::torch_impl
 {
-auto vectorAddImpl(const torch::Tensor& A, const torch::Tensor& B)
-    -> torch::Tensor;
-auto cvtRGBtoGrayImpl(const torch::Tensor& img) -> torch::Tensor;
+
+/** @brief CPU implementation of the `vector_add` operator. */
+auto vectorAdd(const torch::Tensor& A,
+               const torch::Tensor& B) -> torch::Tensor;
+
+/** @brief CPU implementation of the `cvt_rgb_to_gray` operator. */
+auto cvtRGBtoGray(const torch::Tensor& img) -> torch::Tensor;
+
+/** @brief CPU implementation of the `matmul` operator. */
+auto matmul(const torch::Tensor& A, const torch::Tensor& B) -> torch::Tensor;
+
 }  // namespace pmpp::ops::cpu::torch_impl
 
 namespace pmpp::ops::cuda::torch_impl
 {
-auto vectorAddImpl(const torch::Tensor& A, const torch::Tensor& B)
-    -> torch::Tensor;
-auto cvtRGBtoGrayImpl(const torch::Tensor& img) -> torch::Tensor;
+
+/** @brief CUDA implementation of the `vector_add` operator. */
+auto vectorAdd(const torch::Tensor& A,
+               const torch::Tensor& B) -> torch::Tensor;
+
+/** @brief CUDA implementation of the `cvt_rgb_to_gray` operator. */
+auto cvtRGBtoGray(const torch::Tensor& img) -> torch::Tensor;
+
+/** @brief CUDA implementation of the `matmul` operator. */
+auto matmul(const torch::Tensor& A, const torch::Tensor& B) -> torch::Tensor;
+
 }  // namespace pmpp::ops::cuda::torch_impl
diff --git a/csrc/lib/ops/vecAdd/torch_impl.cpp b/csrc/lib/ops/vecAdd/torch_impl.cpp
@@ -2,8 +2,8 @@
 #include <torch/torch.h>
 #include <torch/types.h>
 
+#include "../ops.hpp"
 #include "../torch_impl.hpp"
-#include "pmpp/types/cxx_types.hpp"
 
 #define VECTOR_ADD_CHECK(A, B, _Device)                                       \
     do {                                                                      \
@@ -20,15 +20,9 @@
                     A.sizes(), " and ", B.sizes(), ".");                      \
     } while (false)
 
-namespace pmpp::ops::cpu
+namespace pmpp::ops::cpu::torch_impl
 {
-extern void launchVecAdd(const fp32_t* a, const fp32_t* b, fp32_t* c,
-                         size_t n);
-
-namespace torch_impl
-{
-auto vectorAddImpl(const torch::Tensor& A, const torch::Tensor& B)
-    -> torch::Tensor
+auto vectorAdd(const torch::Tensor& A, const torch::Tensor& B) -> torch::Tensor
 {
     VECTOR_ADD_CHECK(A, B, "CPU");
 
@@ -47,18 +41,11 @@ auto vectorAddImpl(const torch::Tensor& A, const torch::Tensor& B)
 
     return C;
 }
-}  // namespace torch_impl
-}  // namespace pmpp::ops::cpu
-
-namespace pmpp::ops::cuda
-{
-extern void launchVecAdd(const fp32_t* d_A, const fp32_t* d_B, fp32_t* d_C,
-                         size_t n);
+}  // namespace pmpp::ops::cpu::torch_impl
 
-namespace torch_impl
+namespace pmpp::ops::cuda::torch_impl
 {
-auto vectorAddImpl(const torch::Tensor& A, const torch::Tensor& B)
-    -> torch::Tensor
+auto vectorAdd(const torch::Tensor& A, const torch::Tensor& B) -> torch::Tensor
 {
     VECTOR_ADD_CHECK(A, B, "CUDA");
 
@@ -77,5 +64,4 @@ auto vectorAddImpl(const torch::Tensor& A, const torch::Tensor& B)
 
     return C;
 }
-}  // namespace torch_impl
-}  // namespace pmpp::ops::cuda
+}  // namespace pmpp::ops::cuda::torch_impl
diff --git a/csrc/test/test_ops/cvtRGBtoGray.cpp b/csrc/test/test_ops/cvtRGBtoGray.cpp
diff --git a/csrc/test/test_ops/matmul.cpp b/csrc/test/test_ops/matmul.cpp
diff --git a/csrc/test/test_ops/vecAdd.cpp b/csrc/test/test_ops/vecAdd.cpp
diff --git a/test/test.py b/test/test.py