Better support for unified and host memory #2138

Merged · 19 commits · Nov 1, 2023
Changes from 1 commit
14 changes: 7 additions & 7 deletions lib/cusparse/array.jl
@@ -417,9 +417,9 @@ Adapt.adapt_storage(::Type{CuArray}, xs::SparseMatrixCSC) = CuSparseMatrixCSC(xs)
Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseVector) where {T} = CuSparseVector{T}(xs)
Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseMatrixCSC) where {T} = CuSparseMatrixCSC{T}(xs)

-Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray) =
+Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray) =
adapt(CuArray, xs)
-Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) =
+Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) =
adapt(CuArray{Float32}, xs)

Adapt.adapt_storage(::Type{Array}, xs::CuSparseVector) = SparseVector(xs)
@@ -546,15 +546,15 @@ end

# interop with device arrays

-function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseVector)
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseVector)
return CuSparseDeviceVector(
adapt(to, x.iPtr),
adapt(to, x.nzVal),
length(x), x.nnz
)
end

-function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR)
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSR)
return CuSparseDeviceMatrixCSR(
adapt(to, x.rowPtr),
adapt(to, x.colVal),
@@ -563,7 +563,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR)
)
end

-function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC)
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSC)
return CuSparseDeviceMatrixCSC(
adapt(to, x.colPtr),
adapt(to, x.rowVal),
@@ -572,7 +572,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC)
)
end

-function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR)
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixBSR)
return CuSparseDeviceMatrixBSR(
adapt(to, x.rowPtr),
adapt(to, x.colVal),
@@ -582,7 +582,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR)
)
end

-function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCOO)
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCOO)
return CuSparseDeviceMatrixCOO(
adapt(to, x.rowInd),
adapt(to, x.colInd),
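For context, these adapt_structure methods are what let sparse GPU arrays be passed straight into kernels: cudaconvert (which applies the KernelAdaptor) turns the host-side container into its device-side counterpart. A minimal sketch, not part of the diff:

using CUDA, CUDA.CUSPARSE, SparseArrays

A = CuSparseMatrixCSR(sprand(Float32, 10, 10, 0.1))

# cudaconvert applies the KernelAdaptor, so the device-side object is a
# CuSparseDeviceMatrixCSR whose rowPtr/colVal/nzVal are device arrays
dA = cudaconvert(A)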
18 changes: 9 additions & 9 deletions src/array.jl
@@ -388,17 +388,17 @@ end

function Base.unsafe_convert(::Type{CuPtr{T}}, x::CuArray{T}) where {T}
buf = x.data[]
-if buf isa Mem.UnifiedBuffer
+if is_unified(x)
mark_async(buf)
end
convert(CuPtr{T}, buf) + x.offset*Base.elsize(x)
end

function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T}
buf = x.data[]
-if buf isa Mem.DeviceBuffer
+if is_device(x)
throw(ArgumentError("cannot take the CPU address of a $(typeof(x))"))
-elseif buf isa Mem.UnifiedBuffer
+elseif is_unified(x)
ensure_sync(buf)
end
convert(Ptr{T}, buf) + x.offset*Base.elsize(x)
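The net effect of this hunk: asking for a CPU pointer now fails only for device buffers, while unified buffers are synchronized and handed out. A rough usage sketch (the `unified` keyword is assumed from this PR and is not shown in this hunk):

d = cu([1f0, 2f0])                     # device memory (default)
u = cu([1f0, 2f0]; unified=true)       # unified memory (assumed keyword)

Base.unsafe_convert(Ptr{Float32}, u)   # ok: unified memory is CPU-accessible
Base.unsafe_convert(Ptr{Float32}, d)   # throws ArgumentError: cannot take the CPU address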
@@ -637,19 +637,19 @@ Adapt.adapt_storage(::Type{<:CuArray{T, N, B}}, xs::AT) where {T, N, B, AT<:Abst

# eagerly converts Float64 to Float32, for performance reasons

-struct CuArrayAdaptor{B} end
+struct CuArrayKernelAdaptor{B} end

-Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T,N,B} =
+Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T,N,B} =
isbits(xs) ? xs : CuArray{T,N,B}(xs)

-Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:AbstractFloat,N,B} =
+Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:AbstractFloat,N,B} =
isbits(xs) ? xs : CuArray{Float32,N,B}(xs)

-Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Complex{<:AbstractFloat},N,B} =
+Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Complex{<:AbstractFloat},N,B} =
isbits(xs) ? xs : CuArray{ComplexF32,N,B}(xs)

# not for Float16
-Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Union{Float16,BFloat16},N,B} =
+Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Union{Float16,BFloat16},N,B} =
isbits(xs) ? xs : CuArray{T,N,B}(xs)
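These are the methods that give `cu` its eager demotion to single precision; for example:

cu([1.0, 2.0])        # CuArray{Float32}: Float64 is demoted for performance
cu([1.0 + 2.0im])     # CuArray{ComplexF32}
cu(Float16[1, 2])     # stays CuArray{Float16} (the Union{Float16,BFloat16} method above)
cu([1, 2, 3])         # non-float element types pass through unchanged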

"""
@@ -707,7 +707,7 @@ julia> CuArray(1:3)
else
default_memory
end
-adapt(CuArrayAdaptor{memory}(), xs)
+adapt(CuArrayKernelAdaptor{memory}(), xs)
end

Base.getindex(::typeof(cu), xs...) = CuArray([xs...])
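The `memory` value chosen above becomes the adaptor's type parameter `B`, i.e. the buffer type of the resulting array. A hedged sketch of the user-facing side, assuming the keyword arguments this PR adds to `cu`:

a = cu(rand(2, 2))                 # default: device memory
b = cu(rand(2, 2); unified=true)   # unified memory (assumed keyword)
c = cu(rand(2, 2); host=true)      # host-pinned memory (assumed keyword)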
39 changes: 24 additions & 15 deletions src/compiler/execution.jl
@@ -121,35 +121,44 @@ end

## host to device value conversion

-struct Adaptor end
+struct KernelAdaptor end

# convert CUDA host pointers to device pointers
# TODO: use ordinary ptr?
-Adapt.adapt_storage(to::Adaptor, p::CuPtr{T}) where {T} = reinterpret(LLVMPtr{T,AS.Generic}, p)
+Adapt.adapt_storage(to::KernelAdaptor, p::CuPtr{T}) where {T} =
+reinterpret(LLVMPtr{T,AS.Generic}, p)

+# convert CUDA host arrays to device arrays
+function Adapt.adapt_storage(::KernelAdaptor, xs::DenseCuArray{T,N}) where {T,N}
+# prefetch unified memory as we're likely to use it on the GPU
+# TODO: make this configurable?
+if is_unified(xs) && sizeof(xs) > 0 && !is_capturing()
+buf = xs.data[]
+subbuf = Mem.UnifiedBuffer(buf.ctx, pointer(xs), sizeof(xs))
+Mem.prefetch(subbuf)
Member (review comment on the `Mem.prefetch(subbuf)` line above): @pmccormick you were asking me about this yesterday.
+end

+Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs)
+end
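In other words, converting a unified-memory array for kernel use now also prefetches it to the active device. A rough sketch of when this triggers (the `unified` keyword is assumed from this PR; the kernel is illustrative):

xs = cu(rand(Float32, 1 << 20); unified=true)

function kernel(xs)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(xs)
        @inbounds xs[i] += 1f0
    end
    return
end

# @cuda runs cudaconvert on every argument; with this change, that conversion
# calls Mem.prefetch on the unified buffer before the launch
@cuda threads=256 blocks=cld(length(xs), 256) kernel(xs)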

# Base.RefValue isn't GPU compatible, so provide a compatible alternative
struct CuRefValue{T} <: Ref{T}
x::T
end
Base.getindex(r::CuRefValue{T}) where T = r.x
-Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[]))
+Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[]))

# broadcast sometimes passes a ref(type), resulting in a GPU-incompatible DataType box.
# avoid that by using a special kind of ref that knows about the boxed type.
struct CuRefType{T} <: Ref{DataType} end
Base.getindex(r::CuRefType{T}) where T = T
-Adapt.adapt_structure(to::Adaptor, r::Base.RefValue{<:Union{DataType,Type}}) = CuRefType{r[]}()
+Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue{<:Union{DataType,Type}}) =
+CuRefType{r[]}()

# case where type is the function being broadcasted
-Adapt.adapt_structure(to::Adaptor, bc::Base.Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} =
-Base.Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes)

-Adapt.adapt_storage(::Adaptor, xs::CuArray{T,N}) where {T,N} =
-Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs)

-# we materialize ReshapedArray/ReinterpretArray/SubArray/... directly as a device array
-Adapt.adapt_structure(::Adaptor, xs::DenseCuArray{T,N}) where {T,N} =
-Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs)
+Adapt.adapt_structure(to::KernelAdaptor,
+bc::Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} =
+Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes)
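A small sketch of the broadcast cases these Ref-related methods cover (ordinary CuArray broadcasting; only the adaptor name is new here):

xs = CUDA.rand(Float64, 4)
Float32.(xs)         # the type is the broadcasted function: the Broadcasted method above
isa.(xs, Float64)    # the type arrives wrapped in a Ref: handled via CuRefType
Ref(2.0) .* xs       # a plain RefValue is rewrapped as a CuRefValue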

"""
cudaconvert(x)
@@ -159,9 +168,9 @@ converted to a GPU-friendly format. By default, the function does nothing and returns the
input object `x` as-is.

Do not add methods to this function, but instead extend the underlying Adapt.jl package and
-register methods for the `CUDA.Adaptor` type.
+register methods for the `CUDA.KernelAdaptor` type.
"""
-cudaconvert(arg) = adapt(Adaptor(), arg)
+cudaconvert(arg) = adapt(KernelAdaptor(), arg)


## abstract kernel functionality
2 changes: 1 addition & 1 deletion src/texture.jl
@@ -319,6 +319,6 @@ memory_source(::Any) = error("Unknown texture source $(typeof(t))")
memory_source(::CuArray) = LinearMemory()
memory_source(::CuTextureArray) = ArrayMemory()

-Adapt.adapt_storage(::Adaptor, t::CuTexture{T,N}) where {T,N} =
+Adapt.adapt_storage(::KernelAdaptor, t::CuTexture{T,N}) where {T,N} =
CuDeviceTexture{T,N,typeof(memory_source(parent(t))),
t.normalized_coordinates, typeof(t.interpolation)}(size(t), t.handle)
2 changes: 1 addition & 1 deletion test/core/execution.jl
@@ -470,7 +470,7 @@ end
@eval struct Host end
@eval struct Device end

-Adapt.adapt_storage(::CUDA.Adaptor, a::Host) = Device()
+Adapt.adapt_storage(::CUDA.KernelAdaptor, a::Host) = Device()

Base.convert(::Type{Int}, ::Host) = 1
Base.convert(::Type{Int}, ::Device) = 2
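For reference, a hedged sketch of how such a method is typically exercised (the kernel body below is illustrative, not the test's actual code):

function kernel(x)
    # on the host `x` was a Host(); the KernelAdaptor turned it into a Device(),
    # so the convert method above yields 2 here
    @cuassert convert(Int, x) == 2
    return
end

@cuda kernel(Host())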