
Conversation

@christiangnrd (Member) commented:

No longer a draft, so that benchmarks also run.


github-actions bot commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (`git runic main`) to apply these changes.
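For anyone without the `git runic` helper installed, here is a minimal sketch of applying the formatter directly from Julia. The `Runic.main` entry point and the `--inplace` flag follow Runic.jl's documentation, but treat the exact invocation as an assumption:

```julia
# Hedged sketch: format the repository in place with Runic.jl.
# Assumes Runic.jl is installed and that `Runic.main` accepts
# CLI-style arguments, per Runic's README.
using Runic

# Rewrite every Julia source file under the current directory
# to match Runic's style.
Runic.main(["--inplace", "."])
```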

Suggested changes:

```diff
diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
 
 KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
 
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = mtlfunction(f, tt; name, kwargs...)
-    KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+    return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
 end
 
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+    return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
 end
 
 
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
-    Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+    return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
 end
 function KI.max_work_group_size(::MetalBackend)::Int
-    Int(device().maxThreadsPerThreadgroup.width)
+    return Int(device().maxThreadsPerThreadgroup.width)
 end
 function KI.multiprocessor_count(::MetalBackend)::Int
-    Metal.num_gpu_cores()
+    return Metal.num_gpu_cores()
 end
 
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 72ced3ed..e90f5826 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,8 +66,8 @@ end
     if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
         ## COV_EXCL_START
         function broadcast_cartesian_static(dest, bc, Is)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                 I = @inbounds Is[i]
                 @inbounds dest[I] = bc[I]
@@ -91,8 +91,8 @@ end
        (isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
         ## COV_EXCL_START
         function broadcast_linear(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                  @inbounds dest[i] = bc[i]
                  i += stride
@@ -150,8 +150,8 @@ end
     else
         ## COV_EXCL_START
         function broadcast_cartesian(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                 I = @inbounds CartesianIndices(dest)[i]
                 @inbounds dest[I] = bc[I]
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
         @inbounds global_random_counters()[simdgroupId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId % UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each threads also loops across their inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single item group.
-    kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+    kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+        f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                                                           Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
 
     # how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         # we can cover the dimensions to reduce using a single group
         kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
-               workgroupsize = threads, numworkgroups = groups)
+            workgroupsize = threads, numworkgroups = groups
+        )
     else
         # temporary empty array whose type will match the final partial array
 	    partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         partial_kernel(f, op, init, Val(threads), Val(Rreduce),
                         Val(Rother), Val(UInt64(length(Rother))),
                         Val(grain), Val(shuffle), partial, A;
-                        numworkgroups = partial_groups, workgroupsize = partial_threads)
+            numworkgroups = partial_groups, workgroupsize = partial_threads
+        )
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
     end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index 221ee680..6f9d5a2c 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(()->MetalBackend(), "Metal", Metal, MtlArray, Metal.MtlDevic
     "Convert",           # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
     "SpecialFunctions",  # no equivalent Metal intrinsics for gamma, erf, etc
     "sparse",            # not supported yet
-    "CPU synchronization",
-    "fallback test: callable types",
+            "CPU synchronization",
+            "fallback test: callable types",
 ]))
diff --git a/test/runtests.jl b/test/runtests.jl
index bcf20b51..7336f956 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "Metal.jl") == "Metal.jl"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Metal
```

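For context, a minimal sketch (not part of this PR) of the KernelIntrinsics-style launch path these hunks reformat. The `KI.@kernel ... launch = false` compilation step, the `numworkgroups`/`workgroupsize` keywords, and the grid-stride loop come straight from the diff; `fill_kernel!`, the launch sizes, and the `KI` import path are illustrative assumptions:

```julia
# Hedged sketch, not from this PR: a grid-stride kernel launched through
# the `KI` API touched by the hunks above.
using Metal
import KernelAbstractions as KA
const KI = KA.KernelIntrinsics  # assumed binding for the `KI` alias in the diff

function fill_kernel!(dest, val)
    i = KI.get_global_id().x         # 1-based global thread index
    stride = KI.get_global_size().x  # total number of launched threads
    while 1 <= i <= length(dest)
        @inbounds dest[i] = val
        i += stride                  # grid-stride loop, as in src/broadcast.jl
    end
    return
end

dest = MtlArray{Float32}(undef, 1024)
backend = MetalBackend()

# Compile without launching, then launch with the keyword configuration
# used in the src/mapreduce.jl hunks.
kernel = KI.@kernel backend launch = false fill_kernel!(dest, 3.0f0)
kernel(dest, 3.0f0; numworkgroups = 4, workgroupsize = 256)
```

The src/device/random.jl hunk reflows the same kind of indexing: a 3-D global id is linearized as `globalId = x + (y - 1) * size_x + (z - 1) * size_x * size_y` before being truncated to `UInt32`.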
@christiangnrd force-pushed the kaintr branch 3 times, most recently from 9ac3d49 to 6314372 (October 22, 2025 04:31)
@github-actions bot left a comment

Metal Benchmarks

| Benchmark suite | Current: d9cd725 | Previous: 67d668c | Ratio |
|---|---|---|---|
| latency/precompile | 29827285166 ns | 24820843000 ns | 1.20 |
| latency/ttfp | 2462452167 ns | 2257593833 ns | 1.09 |
| latency/import | 1572945708.5 ns | 1431203750 ns | 1.10 |
| integration/metaldevrt | 839895.5 ns | 834875 ns | 1.01 |
| integration/byval/slices=1 | 1565812.5 ns | 1525666.5 ns | 1.03 |
| integration/byval/slices=3 | 11917584 ns | 8498958 ns | 1.40 |
| integration/byval/reference | 1543041.5 ns | 1538166 ns | 1.00 |
| integration/byval/slices=2 | 2614959 ns | 2552562 ns | 1.02 |
| kernel/indexing | 583875 ns | 593833 ns | 0.98 |
| kernel/indexing_checked | 585167 ns | 575750 ns | 1.02 |
| kernel/launch | 12708 ns | 11250 ns | 1.13 |
| kernel/rand | 564792 ns | 557187.5 ns | 1.01 |
| array/construct | 6459 ns | 6000 ns | 1.08 |
| array/broadcast | 595334 ns | 591209 ns | 1.01 |
| array/random/randn/Float32 | 794792 ns | 836917 ns | 0.95 |
| array/random/randn!/Float32 | 610125 ns | 619542 ns | 0.98 |
| array/random/rand!/Int64 | 547375 ns | 548834 ns | 1.00 |
| array/random/rand!/Float32 | 574209 ns | 593333 ns | 0.97 |
| array/random/rand/Int64 | 747958 ns | 735667 ns | 1.02 |
| array/random/rand/Float32 | 663604.5 ns | 631792 ns | 1.05 |
| array/accumulate/Int64/1d | 1318312.5 ns | 1237125 ns | 1.07 |
| array/accumulate/Int64/dims=1 | 1935416.5 ns | 1795625 ns | 1.08 |
| array/accumulate/Int64/dims=2 | 2295021 ns | 2130458 ns | 1.08 |
| array/accumulate/Int64/dims=1L | 11869167 ns | 11609562.5 ns | 1.02 |
| array/accumulate/Int64/dims=2L | 10109687.5 ns | 9610834 ns | 1.05 |
| array/accumulate/Float32/1d | 1148417 ns | 1111187.5 ns | 1.03 |
| array/accumulate/Float32/dims=1 | 1664667 ns | 1518146 ns | 1.10 |
| array/accumulate/Float32/dims=2 | 2044833 ns | 1836167 ns | 1.11 |
| array/accumulate/Float32/dims=1L | 9982708 ns | 9757375 ns | 1.02 |
| array/accumulate/Float32/dims=2L | 8093916 ns | 7203562.5 ns | 1.12 |
| array/reductions/reduce/Int64/1d | 1333458.5 ns | 1498333 ns | 0.89 |
| array/reductions/reduce/Int64/dims=1 | 1069375 ns | 1076542 ns | 0.99 |
| array/reductions/reduce/Int64/dims=2 | 1300875 ns | 1129417 ns | 1.15 |
| array/reductions/reduce/Int64/dims=1L | 2027541 ns | 2002083.5 ns | 1.01 |
| array/reductions/reduce/Int64/dims=2L | 4224104.5 ns | 4214895.5 ns | 1.00 |
| array/reductions/reduce/Float32/1d | 1039083 ns | 991375 ns | 1.05 |
| array/reductions/reduce/Float32/dims=1 | 833000 ns | 827000 ns | 1.01 |
| array/reductions/reduce/Float32/dims=2 | 878625 ns | 833917 ns | 1.05 |
| array/reductions/reduce/Float32/dims=1L | 1342187.5 ns | 1305125 ns | 1.03 |
| array/reductions/reduce/Float32/dims=2L | 1892708 ns | 1788375 ns | 1.06 |
| array/reductions/mapreduce/Int64/1d | 1348084 ns | 1549292 ns | 0.87 |
| array/reductions/mapreduce/Int64/dims=1 | 1083417 ns | 1085333 ns | 1.00 |
| array/reductions/mapreduce/Int64/dims=2 | 1311583 ns | 1201959 ns | 1.09 |
| array/reductions/mapreduce/Int64/dims=1L | 2039458.5 ns | 2019583 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=2L | 4298583 ns | 3628521 ns | 1.18 |
| array/reductions/mapreduce/Float32/1d | 1062917 ns | 1036542 ns | 1.03 |
| array/reductions/mapreduce/Float32/dims=1 | 833333 ns | 819667 ns | 1.02 |
| array/reductions/mapreduce/Float32/dims=2 | 895792 ns | 843917 ns | 1.06 |
| array/reductions/mapreduce/Float32/dims=1L | 1310209 ns | 1280500 ns | 1.02 |
| array/reductions/mapreduce/Float32/dims=2L | 1864250 ns | 1784500 ns | 1.04 |
| array/private/copyto!/gpu_to_gpu | 636708 ns | 635375 ns | 1.00 |
| array/private/copyto!/cpu_to_gpu | 795875 ns | 786625 ns | 1.01 |
| array/private/copyto!/gpu_to_cpu | 788458 ns | 773833 ns | 1.02 |
| array/private/iteration/findall/int | 1715417 ns | 1620458 ns | 1.06 |
| array/private/iteration/findall/bool | 1497000 ns | 1430125 ns | 1.05 |
| array/private/iteration/findfirst/int | 2107187 ns | 2024937.5 ns | 1.04 |
| array/private/iteration/findfirst/bool | 2093208 ns | 2010916 ns | 1.04 |
| array/private/iteration/scalar | 5242458 ns | 5600375 ns | 0.94 |
| array/private/iteration/logical | 2653437.5 ns | 2504521 ns | 1.06 |
| array/private/iteration/findmin/1d | 2296958 ns | 2209917 ns | 1.04 |
| array/private/iteration/findmin/2d | 1568792 ns | 1498584 ns | 1.05 |
| array/private/copy | 574375 ns | 558312.5 ns | 1.03 |
| array/shared/copyto!/gpu_to_gpu | 84875 ns | 82042 ns | 1.03 |
| array/shared/copyto!/cpu_to_gpu | 82875 ns | 79750 ns | 1.04 |
| array/shared/copyto!/gpu_to_cpu | 82792 ns | 82125 ns | 1.01 |
| array/shared/iteration/findall/int | 1732166 ns | 1600354 ns | 1.08 |
| array/shared/iteration/findall/bool | 1519708 ns | 1452458 ns | 1.05 |
| array/shared/iteration/findfirst/int | 1699416.5 ns | 1621520.5 ns | 1.05 |
| array/shared/iteration/findfirst/bool | 1693250 ns | 1607916.5 ns | 1.05 |
| array/shared/iteration/scalar | 210208.5 ns | 202916 ns | 1.04 |
| array/shared/iteration/logical | 2655208 ns | 2386416.5 ns | 1.11 |
| array/shared/iteration/findmin/1d | 1906334 ns | 1799396 ns | 1.06 |
| array/shared/iteration/findmin/2d | 1562916 ns | 1500416.5 ns | 1.04 |
| array/shared/copy | 240250 ns | 230791 ns | 1.04 |
| array/permutedims/4d | 2641229 ns | 2358000 ns | 1.12 |
| array/permutedims/2d | 1162709 ns | 1133208 ns | 1.03 |
| array/permutedims/3d | 1671375 ns | 1645604 ns | 1.02 |
| metal/synchronization/stream | 19500 ns | 18500 ns | 1.05 |
| metal/synchronization/context | 20250 ns | 19625 ns | 1.03 |

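The Ratio column is Current divided by Previous, so values above 1.00 indicate a slowdown relative to the previous commit. A quick sanity check of that reading, using the numbers from the latency/precompile row:

```julia
# Ratio = Current / Previous, rounded to two digits.
current  = 29827285166  # ns, commit d9cd725
previous = 24820843000  # ns, commit 67d668c
round(current / previous; digits = 2)  # 1.2, matching the 1.20 in the table
```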
This comment was automatically generated by a workflow using github-action-benchmark.

@christiangnrd force-pushed the kaintr branch 3 times, most recently from 22e754e to 68db9c2 (October 22, 2025 13:38)
@christiangnrd force-pushed the kaintr branch 2 times, most recently from 2b8dce1 to 0e76668 (November 2, 2025 21:16)
@christiangnrd force-pushed the kaintr branch 9 times, most recently from db9a7dc to c802ccc (November 6, 2025 22:27)
@christiangnrd force-pushed the kaintr branch 6 times, most recently from 4b8f026 to ce67b4c (November 18, 2025 01:36)