-
Notifications
You must be signed in to change notification settings - Fork 48
[Do not merge] Test KernelIntrinsics
#688
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
christiangnrd
wants to merge
9
commits into
main
Choose a base branch
from
kaintr
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Contributor
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.

diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
kern = mtlfunction(f, tt; name, kwargs...)
- KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+ return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
end
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)
- obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+ return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
end
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
- Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+ return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
end
function KI.max_work_group_size(::MetalBackend)::Int
- Int(device().maxThreadsPerThreadgroup.width)
+ return Int(device().maxThreadsPerThreadgroup.width)
end
function KI.multiprocessor_count(::MetalBackend)::Int
- Metal.num_gpu_cores()
+ return Metal.num_gpu_cores()
end
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 72ced3ed..e90f5826 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,8 +66,8 @@ end
if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
## COV_EXCL_START
function broadcast_cartesian_static(dest, bc, Is)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
I = @inbounds Is[i]
@inbounds dest[I] = bc[I]
@@ -91,8 +91,8 @@ end
(isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
## COV_EXCL_START
function broadcast_linear(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
@inbounds dest[i] = bc[i]
i += stride
@@ -150,8 +150,8 @@ end
else
## COV_EXCL_START
function broadcast_cartesian(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
I = @inbounds CartesianIndices(dest)[i]
@inbounds dest[I] = bc[I]
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
@inbounds global_random_counters()[simdgroupId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId % UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we might not be able to launch all those threads to reduce each slice in one go.
# that's why each threads also loops across their inputs, processing multiple values
# so that we can span the entire reduction dimension using a single item group.
- kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+ kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+ f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
# how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we can cover the dimensions to reduce using a single group
kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
- workgroupsize = threads, numworkgroups = groups)
+ workgroupsize = threads, numworkgroups = groups
+ )
else
# temporary empty array whose type will match the final partial array
partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
partial_kernel(f, op, init, Val(threads), Val(Rreduce),
Val(Rother), Val(UInt64(length(Rother))),
Val(grain), Val(shuffle), partial, A;
- numworkgroups = partial_groups, workgroupsize = partial_threads)
+ numworkgroups = partial_groups, workgroupsize = partial_threads
+ )
GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index 221ee680..6f9d5a2c 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(()->MetalBackend(), "Metal", Metal, MtlArray, Metal.MtlDevic
"Convert", # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
"SpecialFunctions", # no equivalent Metal intrinsics for gamma, erf, etc
"sparse", # not supported yet
- "CPU synchronization",
- "fallback test: callable types",
+ "CPU synchronization",
+ "fallback test: callable types",
]))
diff --git a/test/runtests.jl b/test/runtests.jl
index bcf20b51..7336f956 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
@static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "Metal.jl") == "Metal.jl"
using Pkg
- Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+ Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
end
using Metal
9ac3d49 to
6314372
Compare
Contributor
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Metal Benchmarks
Details
| Benchmark suite | Current: d9cd725 | Previous: 67d668c | Ratio |
|---|---|---|---|
latency/precompile |
29827285166 ns |
24820843000 ns |
1.20 |
latency/ttfp |
2462452167 ns |
2257593833 ns |
1.09 |
latency/import |
1572945708.5 ns |
1431203750 ns |
1.10 |
integration/metaldevrt |
839895.5 ns |
834875 ns |
1.01 |
integration/byval/slices=1 |
1565812.5 ns |
1525666.5 ns |
1.03 |
integration/byval/slices=3 |
11917584 ns |
8498958 ns |
1.40 |
integration/byval/reference |
1543041.5 ns |
1538166 ns |
1.00 |
integration/byval/slices=2 |
2614959 ns |
2552562 ns |
1.02 |
kernel/indexing |
583875 ns |
593833 ns |
0.98 |
kernel/indexing_checked |
585167 ns |
575750 ns |
1.02 |
kernel/launch |
12708 ns |
11250 ns |
1.13 |
kernel/rand |
564792 ns |
557187.5 ns |
1.01 |
array/construct |
6459 ns |
6000 ns |
1.08 |
array/broadcast |
595334 ns |
591209 ns |
1.01 |
array/random/randn/Float32 |
794792 ns |
836917 ns |
0.95 |
array/random/randn!/Float32 |
610125 ns |
619542 ns |
0.98 |
array/random/rand!/Int64 |
547375 ns |
548834 ns |
1.00 |
array/random/rand!/Float32 |
574209 ns |
593333 ns |
0.97 |
array/random/rand/Int64 |
747958 ns |
735667 ns |
1.02 |
array/random/rand/Float32 |
663604.5 ns |
631792 ns |
1.05 |
array/accumulate/Int64/1d |
1318312.5 ns |
1237125 ns |
1.07 |
array/accumulate/Int64/dims=1 |
1935416.5 ns |
1795625 ns |
1.08 |
array/accumulate/Int64/dims=2 |
2295021 ns |
2130458 ns |
1.08 |
array/accumulate/Int64/dims=1L |
11869167 ns |
11609562.5 ns |
1.02 |
array/accumulate/Int64/dims=2L |
10109687.5 ns |
9610834 ns |
1.05 |
array/accumulate/Float32/1d |
1148417 ns |
1111187.5 ns |
1.03 |
array/accumulate/Float32/dims=1 |
1664667 ns |
1518146 ns |
1.10 |
array/accumulate/Float32/dims=2 |
2044833 ns |
1836167 ns |
1.11 |
array/accumulate/Float32/dims=1L |
9982708 ns |
9757375 ns |
1.02 |
array/accumulate/Float32/dims=2L |
8093916 ns |
7203562.5 ns |
1.12 |
array/reductions/reduce/Int64/1d |
1333458.5 ns |
1498333 ns |
0.89 |
array/reductions/reduce/Int64/dims=1 |
1069375 ns |
1076542 ns |
0.99 |
array/reductions/reduce/Int64/dims=2 |
1300875 ns |
1129417 ns |
1.15 |
array/reductions/reduce/Int64/dims=1L |
2027541 ns |
2002083.5 ns |
1.01 |
array/reductions/reduce/Int64/dims=2L |
4224104.5 ns |
4214895.5 ns |
1.00 |
array/reductions/reduce/Float32/1d |
1039083 ns |
991375 ns |
1.05 |
array/reductions/reduce/Float32/dims=1 |
833000 ns |
827000 ns |
1.01 |
array/reductions/reduce/Float32/dims=2 |
878625 ns |
833917 ns |
1.05 |
array/reductions/reduce/Float32/dims=1L |
1342187.5 ns |
1305125 ns |
1.03 |
array/reductions/reduce/Float32/dims=2L |
1892708 ns |
1788375 ns |
1.06 |
array/reductions/mapreduce/Int64/1d |
1348084 ns |
1549292 ns |
0.87 |
array/reductions/mapreduce/Int64/dims=1 |
1083417 ns |
1085333 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2 |
1311583 ns |
1201959 ns |
1.09 |
array/reductions/mapreduce/Int64/dims=1L |
2039458.5 ns |
2019583 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=2L |
4298583 ns |
3628521 ns |
1.18 |
array/reductions/mapreduce/Float32/1d |
1062917 ns |
1036542 ns |
1.03 |
array/reductions/mapreduce/Float32/dims=1 |
833333 ns |
819667 ns |
1.02 |
array/reductions/mapreduce/Float32/dims=2 |
895792 ns |
843917 ns |
1.06 |
array/reductions/mapreduce/Float32/dims=1L |
1310209 ns |
1280500 ns |
1.02 |
array/reductions/mapreduce/Float32/dims=2L |
1864250 ns |
1784500 ns |
1.04 |
array/private/copyto!/gpu_to_gpu |
636708 ns |
635375 ns |
1.00 |
array/private/copyto!/cpu_to_gpu |
795875 ns |
786625 ns |
1.01 |
array/private/copyto!/gpu_to_cpu |
788458 ns |
773833 ns |
1.02 |
array/private/iteration/findall/int |
1715417 ns |
1620458 ns |
1.06 |
array/private/iteration/findall/bool |
1497000 ns |
1430125 ns |
1.05 |
array/private/iteration/findfirst/int |
2107187 ns |
2024937.5 ns |
1.04 |
array/private/iteration/findfirst/bool |
2093208 ns |
2010916 ns |
1.04 |
array/private/iteration/scalar |
5242458 ns |
5600375 ns |
0.94 |
array/private/iteration/logical |
2653437.5 ns |
2504521 ns |
1.06 |
array/private/iteration/findmin/1d |
2296958 ns |
2209917 ns |
1.04 |
array/private/iteration/findmin/2d |
1568792 ns |
1498584 ns |
1.05 |
array/private/copy |
574375 ns |
558312.5 ns |
1.03 |
array/shared/copyto!/gpu_to_gpu |
84875 ns |
82042 ns |
1.03 |
array/shared/copyto!/cpu_to_gpu |
82875 ns |
79750 ns |
1.04 |
array/shared/copyto!/gpu_to_cpu |
82792 ns |
82125 ns |
1.01 |
array/shared/iteration/findall/int |
1732166 ns |
1600354 ns |
1.08 |
array/shared/iteration/findall/bool |
1519708 ns |
1452458 ns |
1.05 |
array/shared/iteration/findfirst/int |
1699416.5 ns |
1621520.5 ns |
1.05 |
array/shared/iteration/findfirst/bool |
1693250 ns |
1607916.5 ns |
1.05 |
array/shared/iteration/scalar |
210208.5 ns |
202916 ns |
1.04 |
array/shared/iteration/logical |
2655208 ns |
2386416.5 ns |
1.11 |
array/shared/iteration/findmin/1d |
1906334 ns |
1799396 ns |
1.06 |
array/shared/iteration/findmin/2d |
1562916 ns |
1500416.5 ns |
1.04 |
array/shared/copy |
240250 ns |
230791 ns |
1.04 |
array/permutedims/4d |
2641229 ns |
2358000 ns |
1.12 |
array/permutedims/2d |
1162709 ns |
1133208 ns |
1.03 |
array/permutedims/3d |
1671375 ns |
1645604 ns |
1.02 |
metal/synchronization/stream |
19500 ns |
18500 ns |
1.05 |
metal/synchronization/context |
20250 ns |
19625 ns |
1.03 |
This comment was automatically generated by workflow using github-action-benchmark.
22e754e to
68db9c2
Compare
2b8dce1 to
0e76668
Compare
db9a7dc to
c802ccc
Compare
4b8f026 to
ce67b4c
Compare
ce67b4c to
03bb0dd
Compare
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Not a draft to also run benchmarks