Better support for unified and host memory #2138

Merged · 19 commits · Nov 1, 2023
Changes from 14 commits
21 changes: 20 additions & 1 deletion .buildkite/pipeline.yml
@@ -241,10 +241,29 @@ steps:
env:
JULIA_CUDA_USE_COMPAT: 'false' # NVIDIA bug #3418723: injection tools prevent probing libcuda
if: build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip sanitizer\]/ &&
!build.pull_request.draft
timeout_in_minutes: 10

- label: "Unified memory"
plugins:
- JuliaCI/julia#v1:
version: 1.9
- JuliaCI/julia-test#v1:
test_args: "--quickfail"
- JuliaCI/julia-coverage#v1:
dirs:
- src
- lib
- examples
agents:
queue: "juliagpu"
cuda: "*"
commands: |
echo -e "[CUDA]\ndefault_memory = \"unified\"" >LocalPreferences.toml
if: build.message !~ /\[skip tests\]/ &&
!build.pull_request.draft
timeout_in_minutes: 120
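
For reference, the `echo` in this job's commands writes a `LocalPreferences.toml` equivalent to the following, which CUDA.jl reads at load time to pick the allocation backend:

```toml
[CUDA]
default_memory = "unified"
```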

# we want to benchmark every commit on the master branch, even if it failed CI
- wait: ~
continue_on_failure: true
4 changes: 4 additions & 0 deletions LocalPreferences.toml
@@ -12,6 +12,10 @@
# making it possible to use cooperative multitasking.
#nonblocking_synchronization = true

# which memory type unspecified allocations should default to.
# possible values: "device", "unified", "host"
#default_memory = "device"

[CUDA_Driver_jll]
# whether to attempt to load a forwards-compatible userspace driver.
# only turn this off if you experience issues, e.g., when using a local
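As a usage note: instead of editing the file by hand, the same preference can be set from Julia via Preferences.jl. A minimal sketch (the `default_memory` key is the one introduced by this PR; a restart is needed since the value is baked in at load time):

```julia
using Preferences, CUDA

# persist the preference into LocalPreferences.toml next to the active project
set_preferences!(CUDA, "default_memory" => "unified"; force=true)
# restart Julia for `CUDA.default_memory` to pick up the new value
```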
14 changes: 7 additions & 7 deletions lib/cusparse/array.jl
@@ -417,9 +417,9 @@ Adapt.adapt_storage(::Type{CuArray}, xs::SparseMatrixCSC) = CuSparseMatrixCSC(xs
Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseVector) where {T} = CuSparseVector{T}(xs)
Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseMatrixCSC) where {T} = CuSparseMatrixCSC{T}(xs)

Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray) =
Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray) =
adapt(CuArray, xs)
Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) =
Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) =
adapt(CuArray{Float32}, xs)

Adapt.adapt_storage(::Type{Array}, xs::CuSparseVector) = SparseVector(xs)
@@ -546,15 +546,15 @@ end

# interop with device arrays

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseVector)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseVector)
return CuSparseDeviceVector(
adapt(to, x.iPtr),
adapt(to, x.nzVal),
length(x), x.nnz
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSR)
return CuSparseDeviceMatrixCSR(
adapt(to, x.rowPtr),
adapt(to, x.colVal),
@@ -563,7 +563,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR)
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSC)
return CuSparseDeviceMatrixCSC(
adapt(to, x.colPtr),
adapt(to, x.rowVal),
@@ -572,7 +572,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC)
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixBSR)
return CuSparseDeviceMatrixBSR(
adapt(to, x.rowPtr),
adapt(to, x.colVal),
@@ -582,7 +582,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR)
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCOO)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCOO)
return CuSparseDeviceMatrixCOO(
adapt(to, x.rowInd),
adapt(to, x.colInd),
139 changes: 113 additions & 26 deletions src/array.jl
@@ -132,10 +132,23 @@ const CuVector{T} = CuArray{T,1}
const CuMatrix{T} = CuArray{T,2}
const CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}}

# default to non-unified memory
# unspecified memory allocation
const default_memory = let str = Preferences.@load_preference("default_memory", "device")
if str == "device"
Mem.DeviceBuffer
elseif str == "unified"
Mem.UnifiedBuffer
elseif str == "host"
Mem.HostBuffer
else
error("unknown default memory type: $str")
end
end
CuArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} =
CuArray{T,N,Mem.DeviceBuffer}(undef, dims)
CuArray{T,N,default_memory}(undef, dims)
is_device(a::CuArray) = isa(a.data[], Mem.DeviceBuffer)
is_unified(a::CuArray) = isa(a.data[], Mem.UnifiedBuffer)
is_host(a::CuArray) = isa(a.data[], Mem.HostBuffer)
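
A quick sketch of what these definitions give you (hypothetical REPL session; assumes the preference is left at its `"device"` default):

```julia
using CUDA

a = CuArray{Float32,2}(undef, 2, 2)   # buffer type defaulted from `default_memory`
CUDA.is_device(a)                     # true under the default preference

b = CuArray{Float32,2,CUDA.Mem.UnifiedBuffer}(undef, 2, 2)  # buffer type spelled out
CUDA.is_unified(b)                    # true
```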

# buffer, type and dimensionality specified
CuArray{T,N,B}(::UndefInitializer, dims::NTuple{N,Integer}) where {T,N,B} =
@@ -299,9 +312,15 @@ const StridedCuVector{T} = StridedCuArray{T,1}
const StridedCuMatrix{T} = StridedCuArray{T,2}
const StridedCuVecOrMat{T} = Union{StridedCuVector{T}, StridedCuMatrix{T}}

Base.pointer(x::StridedCuArray{T}) where {T} = Base.unsafe_convert(CuPtr{T}, x)
@inline function Base.pointer(x::StridedCuArray{T}, i::Integer) where T
Base.unsafe_convert(CuPtr{T}, x) + Base._memory_offset(x, i)
@inline function Base.pointer(x::StridedCuArray{T}, i::Integer=1; type=Mem.Device) where T
PT = if type == Mem.Device
CuPtr{T}
elseif type == Mem.Host
Ptr{T}
else
error("unknown memory type")
end
Base.unsafe_convert(PT, x) + Base._memory_offset(x, i)
end
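
To illustrate the extended `pointer` method (a sketch; the `type` keyword only selects which pointer flavor is returned, and `Mem.Host` is only valid for unified or host buffers):

```julia
x = cu(rand(Float32, 8); unified=true)

dptr = pointer(x)                         # CuPtr{Float32}, for device-side APIs
hptr = pointer(x, 1; type=CUDA.Mem.Host)  # Ptr{Float32}, dereferenceable on the CPU
unsafe_load(hptr)                         # reads x[1] from the host side
```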

# anything that's (secretly) backed by a CuArray
@@ -320,7 +339,7 @@ const AnyCuVecOrMat{T} = Union{AnyCuVector{T}, AnyCuMatrix{T}}
end

@inline CuArray{T,N}(xs::AbstractArray{<:Any,N}) where {T,N} =
CuArray{T,N,Mem.Device}(xs)
CuArray{T,N,default_memory}(xs)

@inline CuArray{T,N}(xs::CuArray{<:Any,N,B}) where {T,N,B} =
CuArray{T,N,B}(xs)
@@ -340,15 +359,61 @@ CuArray{T,N}(xs::CuArray{T,N,B}) where {T,N,B} = copy(xs)
Base.convert(::Type{T}, x::T) where T <: CuArray = x


## interop with C libraries
## interop with libraries

# when a unified buffer is converted to a device pointer, we assume it will be accessed
# asynchronously. we keep track of that in the task local storage, and use that information
# to perform additional synchronization when converting the buffer to a host pointer.
# TODO: optimize this! it currently halves the performance of scalar indexing.
function mark_async(buf::Mem.UnifiedBuffer)
tls = task_local_storage()
if haskey(tls, :CUDA_ASYNC_BUFFERS)
async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Mem.UnifiedBuffer}
in(buf, async_buffers) && return
pushfirst!(async_buffers, buf)
else
tls[:CUDA_ASYNC_BUFFERS] = [buf]
end
return
end
function ensure_sync(buf::Mem.UnifiedBuffer)
tls = task_local_storage()
haskey(tls, :CUDA_ASYNC_BUFFERS) || return
async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Mem.UnifiedBuffer}
[Review comment — Member]: I really need to finish TaskLocalValues.jl xD

in(buf, async_buffers) || return
synchronize()
filter!(!isequal(buf), async_buffers)
return
end

Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} =
throw(ArgumentError("cannot take the CPU address of a $(typeof(x))"))
function Base.unsafe_convert(::Type{CuPtr{T}}, x::CuArray{T}) where {T}
convert(CuPtr{T}, x.data[]) + x.offset*Base.elsize(x)
buf = x.data[]
if is_unified(x)
mark_async(buf)
end
convert(CuPtr{T}, buf) + x.offset*Base.elsize(x)
end

function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T}
buf = x.data[]
if is_device(x)
throw(ArgumentError("cannot take the CPU address of a $(typeof(x))"))
elseif is_unified(x)
ensure_sync(buf)
end
convert(Ptr{T}, buf) + x.offset*Base.elsize(x)
end


## indexing

Base.getindex(x::CuArray{<:Any, <:Any, Mem.UnifiedBuffer}, I::Int) =
unsafe_load(pointer(x, I; type=Mem.Host))

Base.setindex!(x::CuArray{<:Any, <:Any, Mem.UnifiedBuffer}, v, I::Int) =
unsafe_store!(pointer(x, I; type=Mem.Host), v)
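
Putting the async tracking and these indexing methods together: scalar access on a unified array now works straight from the CPU, with synchronization inserted only when needed. A sketch of the intended flow, using names from this PR:

```julia
x = cu(zeros(Float32, 1024); unified=true)

x .+= 1     # kernel launch: the CuPtr conversion marks the buffer as async
x[1]        # host read: the Ptr conversion synchronizes first, then `unsafe_load`s
x[1] = 0f0  # host write through a host pointer; no `CUDA.@allowscalar` required
```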


## interop with device arrays

function Base.unsafe_convert(::Type{CuDeviceArray{T,N,AS.Global}}, a::DenseCuArray{T,N}) where {T,N}
@@ -360,8 +425,16 @@ end
## memory copying

typetagdata(a::Array, i=1) = ccall(:jl_array_typetagdata, Ptr{UInt8}, (Any,), a) + i - 1
typetagdata(a::CuArray, i=1) =
convert(CuPtr{UInt8}, a.data[]) + a.maxsize + a.offset + i - 1
function typetagdata(a::CuArray, i=1; type=Mem.Device)
PT = if type == Mem.Device
CuPtr{UInt8}
elseif type == Mem.Host
Ptr{UInt8}
else
error("unknown memory type")
end
convert(PT, a.data[]) + a.maxsize + a.offset + i - 1
end

function Base.copyto!(dest::DenseCuArray{T}, doffs::Integer, src::Array{T}, soffs::Integer,
n::Integer) where T
@@ -476,11 +549,11 @@ function Base.unsafe_copyto!(dest::DenseCuArray{T,<:Any,<:Union{Mem.UnifiedBuffe
synchronize()

GC.@preserve src dest begin
cpu_ptr = pointer(src, soffs)
unsafe_copyto!(host_pointer(pointer(dest, doffs)), cpu_ptr, n)
ptr = pointer(src, soffs)
unsafe_copyto!(pointer(dest, doffs; type=Mem.Host), ptr, n)
if Base.isbitsunion(T)
cpu_ptr = typetagdata(src, soffs)
unsafe_copyto!(host_pointer(typetagdata(dest, doffs)), cpu_ptr, n)
ptr = typetagdata(src, soffs)
unsafe_copyto!(typetagdata(dest, doffs; type=Mem.Host), ptr, n)
end
end
return dest
@@ -492,11 +565,11 @@ function Base.unsafe_copyto!(dest::Array{T}, doffs,
synchronize()

GC.@preserve src dest begin
cpu_ptr = pointer(dest, doffs)
unsafe_copyto!(cpu_ptr, host_pointer(pointer(src, soffs)), n)
ptr = pointer(dest, doffs)
unsafe_copyto!(ptr, pointer(src, soffs; type=Mem.Host), n)
if Base.isbitsunion(T)
cpu_ptr = typetagdata(dest, doffs)
unsafe_copyto!(cpu_ptr, host_pointer(typetagdata(src, soffs)), n)
ptr = typetagdata(dest, doffs)
unsafe_copyto!(ptr, typetagdata(src, soffs; type=Mem.Host), n)
end
end
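
These `copyto!` paths also carry the isbits-union type tags via `typetagdata`, so union-typed elements survive the round trip. A small sketch (assuming a unified buffer):

```julia
src = Union{Int32,Float32}[1f0, Int32(2), 3f0]
dst = CuArray{Union{Int32,Float32},1,CUDA.Mem.UnifiedBuffer}(undef, 3)

copyto!(dst, src)   # copies data and type tags through host pointers
Array(dst) == src   # true
```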

@@ -564,19 +637,19 @@ Adapt.adapt_storage(::Type{<:CuArray{T, N, B}}, xs::AT) where {T, N, B, AT<:Abst

# eagerly converts Float64 to Float32, for performance reasons

struct CuArrayAdaptor{B} end
struct CuArrayKernelAdaptor{B} end

Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T,N,B} =
Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T,N,B} =
isbits(xs) ? xs : CuArray{T,N,B}(xs)

Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:AbstractFloat,N,B} =
Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:AbstractFloat,N,B} =
isbits(xs) ? xs : CuArray{Float32,N,B}(xs)

Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Complex{<:AbstractFloat},N,B} =
Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Complex{<:AbstractFloat},N,B} =
isbits(xs) ? xs : CuArray{ComplexF32,N,B}(xs)

# not for Float16
Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Union{Float16,BFloat16},N,B} =
Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Union{Float16,BFloat16},N,B} =
isbits(xs) ? xs : CuArray{T,N,B}(xs)

"""
@@ -621,7 +694,21 @@ julia> CuArray(1:3)
3
```
"""
@inline cu(xs; unified::Bool=false) = adapt(CuArrayAdaptor{unified ? Mem.UnifiedBuffer : Mem.DeviceBuffer}(), xs)
@inline function cu(xs; device::Bool=false, unified::Bool=false, host::Bool=false)
if device + unified + host > 1
throw(ArgumentError("Can only specify one of `device`, `unified`, or `host`"))
end
memory = if device
Mem.DeviceBuffer
elseif unified
Mem.UnifiedBuffer
elseif host
Mem.HostBuffer
else
default_memory
end
adapt(CuArrayKernelAdaptor{memory}(), xs)
end
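
A sketch of the extended keyword API (the three flags are mutually exclusive; with none set, the `default_memory` preference decides):

```julia
d = cu(rand(2, 2))                # preference-controlled; device memory by default
u = cu(rand(2, 2); unified=true)  # CuArray{Float32,2,Mem.UnifiedBuffer}
h = cu(rand(2, 2); host=true)     # CuArray{Float32,2,Mem.HostBuffer}

cu(rand(2, 2); unified=true, host=true)  # throws ArgumentError
```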

Base.getindex(::typeof(cu), xs...) = CuArray([xs...])

39 changes: 24 additions & 15 deletions src/compiler/execution.jl
@@ -121,35 +121,44 @@ end

## host to device value conversion

struct Adaptor end
struct KernelAdaptor end

# convert CUDA host pointers to device pointers
# TODO: use ordinary ptr?
Adapt.adapt_storage(to::Adaptor, p::CuPtr{T}) where {T} = reinterpret(LLVMPtr{T,AS.Generic}, p)
Adapt.adapt_storage(to::KernelAdaptor, p::CuPtr{T}) where {T} =
reinterpret(LLVMPtr{T,AS.Generic}, p)

# convert CUDA host arrays to device arrays
function Adapt.adapt_storage(::KernelAdaptor, xs::DenseCuArray{T,N}) where {T,N}
# prefetch unified memory as we're likely to use it on the GPU
# TODO: make this configurable?
if is_unified(xs) && sizeof(xs) > 0 && !is_capturing()
buf = xs.data[]
subbuf = Mem.UnifiedBuffer(buf.ctx, pointer(xs), sizeof(xs))
Mem.prefetch(subbuf)
[Review comment — Member]: @pmccormick you were asking me about this yesterday.

end

Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs)
end
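
To show when this adaptor (and hence the prefetch) kicks in, a minimal kernel-launch sketch; `@cuda` runs `cudaconvert` on each argument, which routes unified arrays through the method above:

```julia
function add_one!(a)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(a)
        @inbounds a[i] += 1f0
    end
    return
end

x = cu(zeros(Float32, 1 << 20); unified=true)
@cuda threads=256 blocks=cld(length(x), 256) add_one!(x)  # prefetches x to the GPU first
```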

# Base.RefValue isn't GPU compatible, so provide a compatible alternative
struct CuRefValue{T} <: Ref{T}
x::T
end
Base.getindex(r::CuRefValue{T}) where T = r.x
Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[]))
Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[]))

# broadcast sometimes passes a ref(type), resulting in a GPU-incompatible DataType box.
# avoid that by using a special kind of ref that knows about the boxed type.
struct CuRefType{T} <: Ref{DataType} end
Base.getindex(r::CuRefType{T}) where T = T
Adapt.adapt_structure(to::Adaptor, r::Base.RefValue{<:Union{DataType,Type}}) = CuRefType{r[]}()
Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue{<:Union{DataType,Type}}) =
CuRefType{r[]}()

# case where type is the function being broadcasted
Adapt.adapt_structure(to::Adaptor, bc::Base.Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} =
Base.Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes)

Adapt.adapt_storage(::Adaptor, xs::CuArray{T,N}) where {T,N} =
Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs)

# we materialize ReshapedArray/ReinterpretArray/SubArray/... directly as a device array
Adapt.adapt_structure(::Adaptor, xs::DenseCuArray{T,N}) where {T,N} =
Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs)
Adapt.adapt_structure(to::KernelAdaptor,
bc::Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} =
Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes)

"""
cudaconvert(x)
@@ -159,9 +168,9 @@ converted to a GPU-friendly format. By default, the function does nothing and re
input object `x` as-is.

Do not add methods to this function, but instead extend the underlying Adapt.jl package and
register methods for the the `CUDA.Adaptor` type.
register methods for the `CUDA.KernelAdaptor` type.
"""
cudaconvert(arg) = adapt(Adaptor(), arg)
cudaconvert(arg) = adapt(KernelAdaptor(), arg)
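
For example (a sketch; the device-side counterpart of a `CuArray` is a `CuDeviceArray`):

```julia
x = CUDA.zeros(Float32, 4)
typeof(x)               # CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}
typeof(cudaconvert(x))  # CuDeviceVector{Float32, 1} (i.e. a CuDeviceArray)
```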


## abstract kernel functionality
2 changes: 1 addition & 1 deletion src/texture.jl
@@ -319,6 +319,6 @@ memory_source(t::Any) = error("Unknown texture source $(typeof(t))")
memory_source(::CuArray) = LinearMemory()
memory_source(::CuTextureArray) = ArrayMemory()

Adapt.adapt_storage(::Adaptor, t::CuTexture{T,N}) where {T,N} =
Adapt.adapt_storage(::KernelAdaptor, t::CuTexture{T,N}) where {T,N} =
CuDeviceTexture{T,N,typeof(memory_source(parent(t))),
t.normalized_coordinates, typeof(t.interpolation)}(size(t), t.handle)