From 9c17a13c48f872865553061d119534aec44e1545 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Fri, 3 Feb 2023 19:55:58 +0200 Subject: [PATCH 1/4] Add AMDGPU extension --- Project.toml | 9 +++++- docs/src/index.md | 4 ++- ext/AMDGPUExt/AMDGPUExt.jl | 61 ++++++++++++++++++++++++++++++++++++ ext/AMDGPUExt/activations.jl | 16 ++++++++++ ext/AMDGPUExt/conv.jl | 50 +++++++++++++++++++++++++++++ ext/AMDGPUExt/pool.jl | 43 +++++++++++++++++++++++++ ext/AMDGPUExt/softmax.jl | 11 +++++++ test/amd/activations.jl | 10 ++++++ test/amd/batched_mul.jl | 34 ++++++++++++++++++++ test/amd/batched_repr.jl | 43 +++++++++++++++++++++++++ test/amd/conv.jl | 9 ++++++ test/amd/pool.jl | 11 +++++++ test/amd/runtests.jl | 54 +++++++++++++++++++++++++++++++ test/amd/softmax.jl | 17 ++++++++++ test/amd/storage_type.jl | 13 ++++++++ test/runtests.jl | 13 ++++++++ 16 files changed, 396 insertions(+), 2 deletions(-) create mode 100644 ext/AMDGPUExt/AMDGPUExt.jl create mode 100644 ext/AMDGPUExt/activations.jl create mode 100644 ext/AMDGPUExt/conv.jl create mode 100644 ext/AMDGPUExt/pool.jl create mode 100644 ext/AMDGPUExt/softmax.jl create mode 100644 test/amd/activations.jl create mode 100644 test/amd/batched_mul.jl create mode 100644 test/amd/batched_repr.jl create mode 100644 test/amd/conv.jl create mode 100644 test/amd/pool.jl create mode 100644 test/amd/runtests.jl create mode 100644 test/amd/softmax.jl create mode 100644 test/amd/storage_type.jl diff --git a/Project.toml b/Project.toml index 753a0faf2..b24c66814 100644 --- a/Project.toml +++ b/Project.toml @@ -11,7 +11,14 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Requires = "ae029012-a4dd-5104-9daa-d747884805df" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +[weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" + +[extensions] +AMDGPUExt = "AMDGPU" + [compat] +AMDGPU = "0.4.5" Adapt = "2, 3.2" ChainRulesCore = "1.13" Requires = "0.5, 1.0" @@ -32,4 +39,4 @@ UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["ChainRulesTestUtils", "CUDA", "Documenter", "FiniteDifferences", "ForwardDiff", "Logging", "NNlibCUDA", "Random", "StableRNGs", "Test", "UnicodePlots", "Zygote"] +test = ["ChainRulesTestUtils", "AMDGPU", "CUDA", "Documenter", "FiniteDifferences", "ForwardDiff", "Logging", "NNlibCUDA", "Random", "StableRNGs", "Test", "UnicodePlots", "Zygote"] diff --git a/docs/src/index.md b/docs/src/index.md index 0eea8ddbb..63168abe7 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -4,4 +4,6 @@ For use with automatic differentiation, this package defines gradients using [ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl). These will be seen by various packages including [Zygote.jl](https://github.com/FluxML/Zygote.jl). -To use these functions with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) you will need [NNlibCUDA.jl](https://github.com/FluxML/NNlibCUDA.jl) as well. +To use these functions with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) +or [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) you will need to load them +and NNlib in the same Julia session. 
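For illustration, once this extension is in place, a session on Julia 1.9+ with a working ROCm/MIOpen stack might look roughly like the following (array sizes and functions are arbitrary examples, not anything mandated by the patch):

    using AMDGPU, NNlib        # loading both packages activates the AMDGPUExt extension
    x = ROCArray(rand(Float32, 16, 16, 4, 2))
    y = relu.(x)               # single-argument broadcast is intercepted and runs via MIOpen
    z = softmax(y; dims=1)     # dispatches to the MIOpen-backed NNlib.softmax method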
diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl new file mode 100644 index 000000000..f141c6f8a --- /dev/null +++ b/ext/AMDGPUExt/AMDGPUExt.jl @@ -0,0 +1,61 @@ +module AMDGPUExt + +using Adapt +using AMDGPU +using AMDGPU.MIOpen +using ChainRulesCore +using NNlib +using NNlib: BatchedAdjoint, BatchedTranspose, BatchedAdjOrTrans +using NNlib: DenseConvDims, PoolDims + +const MIOPENFloat = Union{Float16, Float32} + +const ROCBatchedAdjoint{T} = BatchedAdjoint{T, <: ROCArray{T}} +const ROCBatchedTranspose{T} = BatchedTranspose{T, <: ROCArray{T}} +const ROCBatchedAdjOrTrans{T} = Union{ROCBatchedAdjoint{T}, ROCBatchedTranspose{T}} +const WrappedROCBatchedAdjOrTrans{T, N} = Adapt.WrappedArray{T, N, ROCBatchedAdjOrTrans{T}, ROCBatchedAdjOrTrans{T}} +const AnyROCBatchedAdjOrTrans = Union{ROCBatchedAdjOrTrans, WrappedROCBatchedAdjOrTrans} + +function Base.convert(::Type{T}, b::AnyROCBatchedAdjOrTrans) where {T <: Array} + Base.convert(T, adapt(Array, b)) +end + +function Base.Array{T, N}(b::AnyROCBatchedAdjOrTrans) where {T, N} + Array{T, N}(adapt(Array, b)) +end + +Base.collect(b::AnyROCBatchedAdjOrTrans) = collect(adapt(Array, b)) + +function Base.show( + io::IO, mime::MIME{Symbol("text/plain")}, x::AnyROCBatchedAdjOrTrans, +) + show(io, mime, adapt(Array, x)) +end + +Base.show(io::IO, x::AnyROCBatchedAdjOrTrans) = show(io, adapt(Array, x)) + +Base.display(x::AnyROCBatchedAdjOrTrans) = display(adapt(Array, x)) + +function NNlib._batched_gemm!( + ::Type{<: ROCArray}, transA::Char, transB::Char, α, A, B, β, C, +) + AMDGPU.rocBLAS.gemm_batched!(transA, transB, α, A, B, β, C) +end + +function nnlib_padding(dims) + pd = NNlib.padding(dims) + if !all(pd[1:2:end] .== pd[2:2:end]) + @warn """ + MIOpen does not support asymmetric padding, defaulting to symmetric choice: + $pd -> $(pd[1:2:end]). + """ maxlog=1 + end + pd[1:2:end] +end + +include("conv.jl") +include("pool.jl") +include("softmax.jl") +include("activations.jl") + +end diff --git a/ext/AMDGPUExt/activations.jl b/ext/AMDGPUExt/activations.jl new file mode 100644 index 000000000..1563bb45e --- /dev/null +++ b/ext/AMDGPUExt/activations.jl @@ -0,0 +1,16 @@ +for (f, op) in [ + NNlib.relu => MIOpen.relu, + NNlib.relu6 => x -> MIOpen.clippedrelu(x, 6), + NNlib.softplus => MIOpen.softrelu, + NNlib.σ => MIOpen.sigmoid, + Base.tanh => MIOpen.tanh, + # TODO define for leakyrelu, elu, etc.? 
+] + @eval function Base.materialize( + bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{ROCArray{<:MIOPENFloat}}} + ) + return $op(bc.args[1]) + end +end + +Base.broadcasted(::typeof(identity), x::ROCArray{T}) where {T<:MIOPENFloat} = x diff --git a/ext/AMDGPUExt/conv.jl b/ext/AMDGPUExt/conv.jl new file mode 100644 index 000000000..b0cebff87 --- /dev/null +++ b/ext/AMDGPUExt/conv.jl @@ -0,0 +1,50 @@ +function NNlib.conv!( + y::ROCArray{T, N}, x::ROCArray{T, N}, w::ROCArray{T, N}, cdims::DenseConvDims, +) where {T <: MIOPENFloat, N} + NNlib.flipkernel(cdims) || throw(ArgumentError( + "MIOpen supports only cross-correlation as its convolution implementation.")) + + nd = max(0, 4 - N) + ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd) + MIOpen.convolution!( + NNlib.insert_singleton_spatial_dimension(y, nd), + NNlib.insert_singleton_spatial_dimension(x, nd), + NNlib.insert_singleton_spatial_dimension(w, nd); + padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims), + dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims)) + return y +end + +function NNlib.∇conv_data!( + dx::ROCArray{T, N}, dy::ROCArray{T, N}, w::ROCArray{T, N}, cdims::DenseConvDims, +) where {T <: MIOPENFloat, N} + NNlib.flipkernel(cdims) || throw(ArgumentError( + "MIOpen supports only cross-correlation as its convolution implementation.")) + + nd = max(0, 4 - N) + ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd) + MIOpen.∇convolution_data!( + NNlib.insert_singleton_spatial_dimension(dx, nd), + NNlib.insert_singleton_spatial_dimension(dy, nd), + NNlib.insert_singleton_spatial_dimension(w, nd); + padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims), + dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims)) + return dx +end + +function NNlib.∇conv_filter!( + dw::ROCArray{T, N}, x::ROCArray{T, N}, dy::ROCArray{T, N}, cdims::DenseConvDims, +) where {T <: MIOPENFloat, N} + NNlib.flipkernel(cdims) || throw(ArgumentError( + "MIOpen supports only cross-correlation as its convolution implementation.")) + + nd = max(0, 4 - N) + ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd) + MIOpen.∇convolution_weight!( + NNlib.insert_singleton_spatial_dimension(dw, nd), + NNlib.insert_singleton_spatial_dimension(dy, nd), + NNlib.insert_singleton_spatial_dimension(x, nd); + padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims), + dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims)) + return dw +end diff --git a/ext/AMDGPUExt/pool.jl b/ext/AMDGPUExt/pool.jl new file mode 100644 index 000000000..5549bab1c --- /dev/null +++ b/ext/AMDGPUExt/pool.jl @@ -0,0 +1,43 @@ +for poolname in (:maxpool, :meanpool) + @eval function NNlib.$(poolname)( + x::ROCArray{T, N}, pdims::PoolDims, + ) where {T <: MIOPENFloat, N} + y = similar(x, NNlib.output_size(pdims)..., NNlib.channels_out(pdims), size(x, N)) + nd = max(0, 4 - N) + npdims = NNlib.insert_singleton_spatial_dimension(pdims, nd) + MIOpen.$(Symbol("$(poolname)!"))( + NNlib.insert_singleton_spatial_dimension(y, nd), + NNlib.insert_singleton_spatial_dimension(x, nd); + dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims), + stride=NNlib.stride(npdims), do_backward=false) + return y + end + + @eval function ChainRulesCore.rrule( + ::typeof(NNlib.$(poolname)), x::ROCArray{T, N}, pdims::PoolDims, + ) where {T <: MIOPENFloat, N} + y = similar(x, NNlib.output_size(pdims)..., NNlib.channels_out(pdims), size(x, N)) + nd = max(0, 4 - N) + npdims = NNlib.insert_singleton_spatial_dimension(pdims, nd) + + # `workspace` is 
used in the pullback. + _, workspace = MIOpen.$(Symbol("$(poolname)!"))( + NNlib.insert_singleton_spatial_dimension(y, nd), + NNlib.insert_singleton_spatial_dimension(x, nd); + dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims), + stride=NNlib.stride(npdims)) + + function _pooling_pullback(Δ) + dx = similar(x) + MIOpen.$(Symbol("∇$(poolname)!"))( + NNlib.insert_singleton_spatial_dimension(dx, nd), + NNlib.insert_singleton_spatial_dimension(unthunk(Δ), nd), + NNlib.insert_singleton_spatial_dimension(y, nd), + NNlib.insert_singleton_spatial_dimension(x, nd); + dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims), + stride=NNlib.stride(npdims), workspace) + return NoTangent(), dx, NoTangent() + end + y, _pooling_pullback + end +end diff --git a/ext/AMDGPUExt/softmax.jl b/ext/AMDGPUExt/softmax.jl new file mode 100644 index 000000000..de75f9748 --- /dev/null +++ b/ext/AMDGPUExt/softmax.jl @@ -0,0 +1,11 @@ +for fname in (:softmax, :logsoftmax) + @eval function NNlib.$(fname)(x::ROCArray{T}; dims = 1) where T <: MIOPENFloat + MIOpen.$(fname)(x; dims) + end + + @eval function NNlib.$(Symbol("∇$(fname)"))( + dy::ROCArray{T, N}, x::ROCArray{T, N}, y::ROCArray{T, N}; dims = 1, + ) where {T <: MIOPENFloat, N} + MIOpen.$(Symbol("∇$(fname)!"))(dy, y; dims) + end +end diff --git a/test/amd/activations.jl b/test/amd/activations.jl new file mode 100644 index 000000000..2abb0c272 --- /dev/null +++ b/test/amd/activations.jl @@ -0,0 +1,10 @@ +@testset "Compare CPU & GPU" begin + for (T, atol) in ((Float16, 1f-2), (Float32, 1f-5)) + x = randn(T, 16) + gputest(x -> NNlib.relu.(x), x; atol) + gputest(x -> NNlib.relu6.(x), x; atol) + gputest(x -> NNlib.softplus.(x), x; atol) + gputest(x -> tanh.(x), x; atol) + gputest(x -> identity.(x), x; atol) + end +end diff --git a/test/amd/batched_mul.jl b/test/amd/batched_mul.jl new file mode 100644 index 000000000..bc9dae899 --- /dev/null +++ b/test/amd/batched_mul.jl @@ -0,0 +1,34 @@ +@testset "batched_mul" begin + A = rand(Float32, 3, 3, 2) + B = rand(Float32, 3, 3, 2) + dA, dB = ROCArray.((A, B)) + + C = batched_mul(A, B) + @test ROCArray(C) ≈ batched_mul(dA, dB) + + Ct = batched_mul(batched_transpose(A), B) + @test ROCArray(Ct) ≈ batched_mul(batched_transpose(dA), dB) + + Ca = batched_mul(A, batched_adjoint(B)) + @test ROCArray(Ca) ≈ batched_mul(dA, batched_adjoint(dB)) + + # 5-arg batched_mul! 
+ C .= pi + batched_mul!(C, A, B, 2f0, 3f0) + Cpi = ROCArray(similar(C)) .= pi + @test ROCArray(C) ≈ batched_mul!(Cpi, dA, dB, 2f0, 3f0) + + # PermutedDimsArray + @test ROCArray(Ct) ≈ batched_mul(PermutedDimsArray(dA, (2, 1, 3)), dB) + + # FIXME same but with (1, 3, 2) errors + D = permutedims(B, (2, 1, 3)) + Cp = batched_mul(batched_adjoint(A), B) + @test ROCArray(Cp) ≈ batched_mul( + batched_adjoint(dA), PermutedDimsArray(ROCArray(D), (2, 1, 3))) + + # Methods which reshape + M = randn(Float32, 3, 3) + Cm = batched_mul(A, M) + @test ROCArray(Cm) ≈ batched_mul(dA, ROCArray(M)) +end diff --git a/test/amd/batched_repr.jl b/test/amd/batched_repr.jl new file mode 100644 index 000000000..dfdbc558b --- /dev/null +++ b/test/amd/batched_repr.jl @@ -0,0 +1,43 @@ +function print_array_strs(x) + str = sprint((io, x)->show(io, MIME"text/plain"(), x), x) + return @view split(str, '\n')[2:end] +end + +@testset "BatchedAdjOrTrans" begin + x = rand(Float32, 3, 4, 2) + y = ROCArray(x) + + bax = batched_adjoint(x) + btx = batched_transpose(x) + bay = batched_adjoint(y) + bty = batched_transpose(y) + + @test sprint(show, bax) == sprint(show, bay) + @test sprint(show, btx) == sprint(show, bty) + + @test print_array_strs(bax) == print_array_strs(bay) + @test print_array_strs(btx) == print_array_strs(bty) + + @test Array(bax) == Array(bay) + @test collect(bax) == collect(bay) + @test Array(btx) == Array(bty) + @test collect(btx) == collect(bty) + + for shape in (:, (12, 2)) + rbax = reshape(bax, shape) + rbtx = reshape(btx, shape) + rbay = reshape(bay, shape) + rbty = reshape(bty, shape) + + @test sprint(show, rbax) == sprint(show, rbay) + @test sprint(show, rbtx) == sprint(show, rbty) + + @test print_array_strs(rbax) == print_array_strs(rbay) + @test print_array_strs(rbtx) == print_array_strs(rbty) + + @test Array(rbax) == Array(rbay) + @test collect(rbax) == collect(rbay) + @test Array(rbtx) == Array(rbty) + @test collect(rbtx) == collect(rbty) + end +end diff --git a/test/amd/conv.jl b/test/amd/conv.jl new file mode 100644 index 000000000..b6be3fd39 --- /dev/null +++ b/test/amd/conv.jl @@ -0,0 +1,9 @@ +@testset "Compare CPU & GPU" begin + channels, batch = 3, 2 + for T in (Float16, Float32), nd in (1, 2, 3) + x = rand(Float32, fill(4, nd)..., 3, 1) + w = rand(Float32, fill(2, nd)..., channels, 4) + cdims = DenseConvDims(x, w, flipkernel=true) + gputest((x, w) -> NNlib.conv(x, w, cdims), x, w; atol=1e-4) + end +end diff --git a/test/amd/pool.jl b/test/amd/pool.jl new file mode 100644 index 000000000..c32f67298 --- /dev/null +++ b/test/amd/pool.jl @@ -0,0 +1,11 @@ +@testset "Compare CPU & GPU" begin + channels, batch = 3, 2 + for T in (Float16, Float32), nd in (1, 2, 3) + x = rand(T, fill(8, nd)..., channels, batch) + pdims = PoolDims(x, 2) + # NOTE: Disable grad check for maxpool as *sometimes* + # it does not *completely* agree with CPU :/ + gputest(x -> NNlib.maxpool(x, pdims), x; checkgrad=false) + gputest(x -> NNlib.meanpool(x, pdims), x) + end +end diff --git a/test/amd/runtests.jl b/test/amd/runtests.jl new file mode 100644 index 000000000..fd15e6274 --- /dev/null +++ b/test/amd/runtests.jl @@ -0,0 +1,54 @@ +using NNlib: batched_adjoint, batched_mul, batched_mul!, batched_transpose +using NNlib: is_strided, storage_type +using LinearAlgebra + +AMDGPU.allowscalar(false) + +function gputest(f, xs...; checkgrad=true, atol=1e-6, kws...) + cpu_in = xs + gpu_in = ROCArray.(xs) + + cpu_out = f(cpu_in...; kws...) + gpu_out = f(gpu_in...; kws...) 
+    @test collect(cpu_out) ≈ collect(gpu_out)
+
+    if checkgrad
+        cpu_grad = gradient((x...) -> sum(f(x...; kws...)), cpu_in...)
+        gpu_grad = gradient((x...) -> sum(f(x...; kws...)), gpu_in...)
+        for (cpu_g, gpu_g) in zip(cpu_grad, gpu_grad)
+            if cpu_g === nothing
+                @test gpu_g === nothing
+            else
+                @test collect(cpu_g) ≈ collect(gpu_g) atol=atol
+            end
+        end
+    end
+end
+
+@testset "Storage types" begin
+    include("storage_type.jl")
+end
+
+@testset "Batched repr" begin
+    include("batched_repr.jl")
+end
+
+@testset "Batched multiplication" begin
+    include("batched_mul.jl")
+end
+
+@testset "Convolution" begin
+    include("conv.jl")
+end
+
+@testset "Pooling" begin
+    include("pool.jl")
+end
+
+@testset "Softmax" begin
+    include("softmax.jl")
+end
+
+@testset "Activations" begin
+    include("activations.jl")
+end
diff --git a/test/amd/softmax.jl b/test/amd/softmax.jl
new file mode 100644
index 000000000..cd8545223
--- /dev/null
+++ b/test/amd/softmax.jl
@@ -0,0 +1,17 @@
+@testset "Compare CPU & GPU" begin
+    for (T, atol) in ((Float16, 1f-2), (Float32, 1f-5))
+        for (sz, dims) in [
+            ((5,), :), ((5,), 1),
+            ((5, 5), :), ((5, 5), 1), ((5, 5), 2),
+            ((5, 5, 5, 5), (2, 3)), ((5, 5, 5, 5), (2, 4)),
+        ]
+            if T == Float16
+                x = ones(T, sz) # Really low precision.
+            else
+                x = randn(T, sz)
+            end
+            gputest(NNlib.softmax, x; atol)
+            gputest(NNlib.logsoftmax, x; atol)
+        end
+    end
+end
diff --git a/test/amd/storage_type.jl b/test/amd/storage_type.jl
new file mode 100644
index 000000000..d884ddd7f
--- /dev/null
+++ b/test/amd/storage_type.jl
@@ -0,0 +1,13 @@
+@testset "NNlib storage type" begin
+    x = ROCArray(ones(Float32, 10, 10))
+    @test storage_type(x) <: ROCArray{Float32, 2}
+    @test storage_type(reshape(view(x, 1:2:10,:), 10, :)) <: ROCArray{Float32, 2}
+
+    @test is_strided(x)
+    @test is_strided(view(x, 1:2:5,:))
+    @test is_strided(PermutedDimsArray(x, (2, 1)))
+
+    @test !is_strided(reshape(view(x, 1:2:10, :), 10, :))
+    @test !is_strided((x .+ im)')
+    @test !is_strided(Diagonal(ROCArray(ones(3))))
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 357b96f4a..e4f2f518a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -26,6 +26,19 @@ include("test_utils.jl")
         @info "Insufficient version or CUDA not found; Skipping CUDA tests"
     end
 
+    if get(ENV, "NNLIB_TEST_AMDGPU", "false") == "true"
+        using AMDGPU
+        if AMDGPU.functional() && AMDGPU.functional(:MIOpen)
+            @testset "AMDGPU" begin
+                include("amd/runtests.jl")
+            end
+        else
+            @info "AMDGPU.jl package is not functional. Skipping AMDGPU tests."
+        end
+    else
+        @info "Skipping AMDGPU tests, set NNLIB_TEST_AMDGPU=true to run them."
+    end
+
     if VERSION < v"1.6"
         @info "skipping doctests, on Julia $VERSION"
     else

From c68bc4bfed349c2cede19cd1651f605a89b420ed Mon Sep 17 00:00:00 2001
From: Anton Smirnov
Date: Mon, 6 Feb 2023 18:32:17 +0200
Subject: [PATCH 2/4] Add compat bounds, buildkite AMDGPU job & fix docs

When running the Buildkite job for AMDGPU, we add AMDGPU to the 'test'
target before running the tests.
--- .buildkite/pipeline.yml | 29 +++++++++++++++++++++++++++++ Project.toml | 5 +++-- docs/src/index.md | 5 ++--- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a047715b9..a06af0e80 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -55,6 +55,35 @@ steps: if: build.pull_request.labels includes "benchmark" timeout_in_minutes: 30 + - label: "AMDGPU - Julia 1.9 - No Artifacts" + plugins: + - JuliaCI/julia#v1: + version: 1.9-nightly + - JuliaCI/julia-test#v1: + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - ext + agents: + queue: "juliagpu" + rocm: "*" + rocmgpu: "*" + command: + - julia -e """ + using TOML + conf = TOML.parse(read(\"Project.toml\", String)) + push!(conf[\"targets\"][\"test\"], \"AMDGPU\") + open(io -> TOML.print(io, conf), \"Project.toml\", \"w\") + """ + - julia --project -e 'using Pkg; Pkg.update()' + timeout_in_minutes: 30 + env: + JULIA_AMDGPU_CORE_MUST_LOAD: "1" + JULIA_AMDGPU_HIP_MUST_LOAD: "1" + JULIA_AMDGPU_DISABLE_ARTIFACTS: "1" + NNLIB_TEST_AMDGPU: true + # - label: "GPU julia nightly" # plugins: # - JuliaCI/julia#v1: diff --git a/Project.toml b/Project.toml index b24c66814..1e2d73edf 100644 --- a/Project.toml +++ b/Project.toml @@ -18,13 +18,14 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" AMDGPUExt = "AMDGPU" [compat] -AMDGPU = "0.4.5" +AMDGPU = "0.4.7" Adapt = "2, 3.2" ChainRulesCore = "1.13" Requires = "0.5, 1.0" julia = "1.6" [extras] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" @@ -39,4 +40,4 @@ UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["ChainRulesTestUtils", "AMDGPU", "CUDA", "Documenter", "FiniteDifferences", "ForwardDiff", "Logging", "NNlibCUDA", "Random", "StableRNGs", "Test", "UnicodePlots", "Zygote"] +test = ["ChainRulesTestUtils", "CUDA", "Documenter", "FiniteDifferences", "ForwardDiff", "Logging", "NNlibCUDA", "Random", "StableRNGs", "Test", "UnicodePlots", "Zygote"] diff --git a/docs/src/index.md b/docs/src/index.md index 63168abe7..91adcee0c 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -4,6 +4,5 @@ For use with automatic differentiation, this package defines gradients using [ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl). These will be seen by various packages including [Zygote.jl](https://github.com/FluxML/Zygote.jl). -To use these functions with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) -or [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) you will need to load them -and NNlib in the same Julia session. +To use these functions with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) you will need [NNlibCUDA.jl](https://github.com/FluxML/NNlibCUDA.jl) as well. +For [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) you will need to load it and NNlib in the same Julia session. 
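For reference, a rough local equivalent of the Buildkite step above might look like this (a sketch only; it assumes the NNlib repository is the working directory and that a functional ROCm/MIOpen install is present):

    using TOML, Pkg
    # Mirror the CI step: add AMDGPU to the [targets] test list, since it is
    # otherwise only a weak dependency and not available to the test environment.
    conf = TOML.parsefile("Project.toml")
    push!(conf["targets"]["test"], "AMDGPU")
    open(io -> TOML.print(io, conf), "Project.toml", "w")
    # Opt in to the AMDGPU suite that test/runtests.jl guards behind this flag.
    withenv("NNLIB_TEST_AMDGPU" => "true") do
        Pkg.activate(".")
        Pkg.test()
    end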
From d4bd704fe52b2cd5de1a56dac8b68d01436ba7f7 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Fri, 10 Feb 2023 00:34:34 +0200 Subject: [PATCH 3/4] Update buildkite config --- .buildkite/pipeline.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a06af0e80..eff9249b2 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -76,7 +76,6 @@ steps: push!(conf[\"targets\"][\"test\"], \"AMDGPU\") open(io -> TOML.print(io, conf), \"Project.toml\", \"w\") """ - - julia --project -e 'using Pkg; Pkg.update()' timeout_in_minutes: 30 env: JULIA_AMDGPU_CORE_MUST_LOAD: "1" From 44f7b3dcf580568d00c4454c212760f279a8f2e8 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Fri, 10 Feb 2023 00:37:57 +0200 Subject: [PATCH 4/4] Update buildkite config --- .buildkite/pipeline.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index eff9249b2..61fe79362 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -71,10 +71,10 @@ steps: rocmgpu: "*" command: - julia -e """ - using TOML - conf = TOML.parse(read(\"Project.toml\", String)) - push!(conf[\"targets\"][\"test\"], \"AMDGPU\") - open(io -> TOML.print(io, conf), \"Project.toml\", \"w\") + using TOML; + conf = TOML.parse(read(\"Project.toml\", String)); + push!(conf[\"targets\"][\"test\"], \"AMDGPU\"); + open(io -> TOML.print(io, conf), \"Project.toml\", \"w\"); """ timeout_in_minutes: 30 env: