Better support for unified and host memory #2138

Merged · 19 commits · Nov 1, 2023
Changes from 1 commit
14 changes: 7 additions & 7 deletions lib/cusparse/array.jl
@@ -417,9 +417,9 @@ Adapt.adapt_storage(::Type{CuArray}, xs::SparseMatrixCSC) = CuSparseMatrixCSC(xs)
Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseVector) where {T} = CuSparseVector{T}(xs)
Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseMatrixCSC) where {T} = CuSparseMatrixCSC{T}(xs)

-Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray) =
+Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray) =
adapt(CuArray, xs)
-Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) =
+Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) =
adapt(CuArray{Float32}, xs)

Adapt.adapt_storage(::Type{Array}, xs::CuSparseVector) = SparseVector(xs)
@@ -546,15 +546,15 @@ end

# interop with device arrays

-function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseVector)
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseVector)
return CuSparseDeviceVector(
adapt(to, x.iPtr),
adapt(to, x.nzVal),
length(x), x.nnz
)
end

-function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR)
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSR)
return CuSparseDeviceMatrixCSR(
adapt(to, x.rowPtr),
adapt(to, x.colVal),
@@ -563,7 +563,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR)
)
end

-function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC)
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSC)
return CuSparseDeviceMatrixCSC(
adapt(to, x.colPtr),
adapt(to, x.rowVal),
@@ -572,7 +572,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC)
)
end

-function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR)
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixBSR)
return CuSparseDeviceMatrixBSR(
adapt(to, x.rowPtr),
adapt(to, x.colVal),
@@ -582,7 +582,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR)
)
end

-function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCOO)
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCOO)
return CuSparseDeviceMatrixCOO(
adapt(to, x.rowInd),
adapt(to, x.colInd),
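For context, these adapt_structure methods are what let sparse GPU arrays be passed straight into kernels: cudaconvert (which applies the KernelAdaptor) turns the host-side container into its device-side counterpart. A minimal sketch, not part of the diff:

using CUDA, CUDA.CUSPARSE, SparseArrays

A = CuSparseMatrixCSR(sprand(Float32, 10, 10, 0.1))

# cudaconvert applies the KernelAdaptor, so the device-side object is a
# CuSparseDeviceMatrixCSR whose rowPtr/colVal/nzVal are device arrays
dA = cudaconvert(A)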
18 changes: 9 additions & 9 deletions src/array.jl
@@ -388,17 +388,17 @@ end

function Base.unsafe_convert(::Type{CuPtr{T}}, x::CuArray{T}) where {T}
buf = x.data[]
-if buf isa Mem.UnifiedBuffer
+if is_unified(x)
mark_async(buf)
end
convert(CuPtr{T}, buf) + x.offset*Base.elsize(x)
end

function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T}
buf = x.data[]
-if buf isa Mem.DeviceBuffer
+if is_device(x)
throw(ArgumentError("cannot take the CPU address of a $(typeof(x))"))
-elseif buf isa Mem.UnifiedBuffer
+elseif is_unified(x)
ensure_sync(buf)
end
convert(Ptr{T}, buf) + x.offset*Base.elsize(x)
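The net effect of this hunk: asking for a CPU pointer now fails only for device buffers, while unified buffers are synchronized and handed out. A rough usage sketch (the `unified` keyword is assumed from this PR and is not shown in this hunk):

d = cu([1f0, 2f0])                     # device memory (default)
u = cu([1f0, 2f0]; unified=true)       # unified memory (assumed keyword)

Base.unsafe_convert(Ptr{Float32}, u)   # ok: unified memory is CPU-accessible
Base.unsafe_convert(Ptr{Float32}, d)   # throws ArgumentError: cannot take the CPU address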
@@ -637,19 +637,19 @@ Adapt.adapt_storage(::Type{<:CuArray{T, N, B}}, xs::AT) where {T, N, B, AT<:Abst

# eagerly converts Float64 to Float32, for performance reasons

-struct CuArrayAdaptor{B} end
+struct CuArrayKernelAdaptor{B} end

-Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T,N,B} =
+Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T,N,B} =
isbits(xs) ? xs : CuArray{T,N,B}(xs)

-Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:AbstractFloat,N,B} =
+Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:AbstractFloat,N,B} =
isbits(xs) ? xs : CuArray{Float32,N,B}(xs)

-Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Complex{<:AbstractFloat},N,B} =
+Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Complex{<:AbstractFloat},N,B} =
isbits(xs) ? xs : CuArray{ComplexF32,N,B}(xs)

# not for Float16
-Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Union{Float16,BFloat16},N,B} =
+Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Union{Float16,BFloat16},N,B} =
isbits(xs) ? xs : CuArray{T,N,B}(xs)
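These are the methods that give `cu` its eager demotion to single precision; for example:

cu([1.0, 2.0])        # CuArray{Float32}: Float64 is demoted for performance
cu([1.0 + 2.0im])     # CuArray{ComplexF32}
cu(Float16[1, 2])     # stays CuArray{Float16} (the Union{Float16,BFloat16} method above)
cu([1, 2, 3])         # non-float element types pass through unchanged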

"""
@@ -707,7 +707,7 @@ julia> CuArray(1:3)
else
default_memory
end
-adapt(CuArrayAdaptor{memory}(), xs)
+adapt(CuArrayKernelAdaptor{memory}(), xs)
end

Base.getindex(::typeof(cu), xs...) = CuArray([xs...])
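The `memory` value chosen above becomes the adaptor's type parameter `B`, i.e. the buffer type of the resulting array. A hedged sketch of the user-facing side, assuming the keyword arguments this PR adds to `cu`:

a = cu(rand(2, 2))                 # default: device memory
b = cu(rand(2, 2); unified=true)   # unified memory (assumed keyword)
c = cu(rand(2, 2); host=true)      # host-pinned memory (assumed keyword)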
39 changes: 24 additions & 15 deletions src/compiler/execution.jl
@@ -121,35 +121,44 @@ end

## host to device value conversion

-struct Adaptor end
+struct KernelAdaptor end

# convert CUDA host pointers to device pointers
# TODO: use ordinary ptr?
-Adapt.adapt_storage(to::Adaptor, p::CuPtr{T}) where {T} = reinterpret(LLVMPtr{T,AS.Generic}, p)
+Adapt.adapt_storage(to::KernelAdaptor, p::CuPtr{T}) where {T} =
+reinterpret(LLVMPtr{T,AS.Generic}, p)

+# convert CUDA host arrays to device arrays
+function Adapt.adapt_storage(::KernelAdaptor, xs::DenseCuArray{T,N}) where {T,N}
+# prefetch unified memory as we're likely to use it on the GPU
+# TODO: make this configurable?
+if is_unified(xs) && sizeof(xs) > 0 && !is_capturing()
+buf = xs.data[]
+subbuf = Mem.UnifiedBuffer(buf.ctx, pointer(xs), sizeof(xs))
+Mem.prefetch(subbuf)
Member (review comment on the `Mem.prefetch(subbuf)` line above): @pmccormick you were asking me about this yesterday.
+end

+Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs)
+end
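In other words, converting a unified-memory array for kernel use now also prefetches it to the active device. A rough sketch of when this triggers (the `unified` keyword is assumed from this PR; the kernel is illustrative):

xs = cu(rand(Float32, 1 << 20); unified=true)

function kernel(xs)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(xs)
        @inbounds xs[i] += 1f0
    end
    return
end

# @cuda runs cudaconvert on every argument; with this change, that conversion
# calls Mem.prefetch on the unified buffer before the launch
@cuda threads=256 blocks=cld(length(xs), 256) kernel(xs)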

# Base.RefValue isn't GPU compatible, so provide a compatible alternative
struct CuRefValue{T} <: Ref{T}
x::T
end
Base.getindex(r::CuRefValue{T}) where T = r.x
-Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[]))
+Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[]))

# broadcast sometimes passes a ref(type), resulting in a GPU-incompatible DataType box.
# avoid that by using a special kind of ref that knows about the boxed type.
struct CuRefType{T} <: Ref{DataType} end
Base.getindex(r::CuRefType{T}) where T = T
-Adapt.adapt_structure(to::Adaptor, r::Base.RefValue{<:Union{DataType,Type}}) = CuRefType{r[]}()
+Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue{<:Union{DataType,Type}}) =
+CuRefType{r[]}()

# case where type is the function being broadcasted
-Adapt.adapt_structure(to::Adaptor, bc::Base.Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} =
-Base.Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes)

-Adapt.adapt_storage(::Adaptor, xs::CuArray{T,N}) where {T,N} =
-Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs)

-# we materialize ReshapedArray/ReinterpretArray/SubArray/... directly as a device array
-Adapt.adapt_structure(::Adaptor, xs::DenseCuArray{T,N}) where {T,N} =
-Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs)
+Adapt.adapt_structure(to::KernelAdaptor,
+bc::Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} =
+Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes)
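A small sketch of the broadcast cases these Ref-related methods cover (ordinary CuArray broadcasting; only the adaptor name is new here):

xs = CUDA.rand(Float64, 4)
Float32.(xs)         # the type is the broadcasted function: the Broadcasted method above
isa.(xs, Float64)    # the type arrives wrapped in a Ref: handled via CuRefType
Ref(2.0) .* xs       # a plain RefValue is rewrapped as a CuRefValue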

"""
cudaconvert(x)
@@ -159,9 +168,9 @@ converted to a GPU-friendly format. By default, the function does nothing and returns the
input object `x` as-is.

Do not add methods to this function, but instead extend the underlying Adapt.jl package and
-register methods for the `CUDA.Adaptor` type.
+register methods for the `CUDA.KernelAdaptor` type.
"""
-cudaconvert(arg) = adapt(Adaptor(), arg)
+cudaconvert(arg) = adapt(KernelAdaptor(), arg)


## abstract kernel functionality
2 changes: 1 addition & 1 deletion src/texture.jl
@@ -319,6 +319,6 @@ memory_source(::Any) = error("Unknown texture source $(typeof(t))")
memory_source(::CuArray) = LinearMemory()
memory_source(::CuTextureArray) = ArrayMemory()

-Adapt.adapt_storage(::Adaptor, t::CuTexture{T,N}) where {T,N} =
+Adapt.adapt_storage(::KernelAdaptor, t::CuTexture{T,N}) where {T,N} =
CuDeviceTexture{T,N,typeof(memory_source(parent(t))),
t.normalized_coordinates, typeof(t.interpolation)}(size(t), t.handle)
2 changes: 1 addition & 1 deletion test/core/execution.jl
@@ -470,7 +470,7 @@ end
@eval struct Host end
@eval struct Device end

-Adapt.adapt_storage(::CUDA.Adaptor, a::Host) = Device()
+Adapt.adapt_storage(::CUDA.KernelAdaptor, a::Host) = Device()

Base.convert(::Type{Int}, ::Host) = 1
Base.convert(::Type{Int}, ::Device) = 2
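For reference, a hedged sketch of how such a method is typically exercised (the kernel body below is illustrative, not the test's actual code):

function kernel(x)
    # on the host `x` was a Host(); the KernelAdaptor turned it into a Device(),
    # so the convert method above yields 2 here
    @cuassert convert(Int, x) == 2
    return
end

@cuda kernel(Host())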