JuliaNeuralGraphics · pxl-th · Jun 1, 2023 · Jun 2, 2023 · Jun 2, 2023 · Jun 3, 2023
diff --git a/Project.toml b/Project.toml
@@ -4,11 +4,12 @@ authors = ["Anton Smirnov <[email protected]>"]
 version = "0.1.0"
 
 [deps]
-AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
 ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
 ImageTransformations = "02fcd773-0e25-5acc-982a-7f6622650795"
@@ -25,6 +26,5 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
-AMDGPU = "0.4"
 KernelAbstractions = "0.9"
 Zygote = "0.6.55"
diff --git a/src/Nerf.jl b/src/Nerf.jl
@@ -16,8 +16,7 @@ using Rotations
 using StaticArrays
 using Statistics
 using Zygote
-
-# TODO rand on device
+using Flux
 
 include("kautils.jl")
 
@@ -75,38 +74,33 @@ include("ray.jl")
 include("acceleration/occupancy.jl")
 include("encoding/grid.jl")
 include("encoding/spherical_harmonics.jl")
-include("nn/nn.jl")
 include("sampler.jl")
 include("loss.jl")
 include("trainer.jl")
 include("renderer/renderer.jl")
+include("models/common.jl")
 include("models/basic.jl")
 include("marching_cubes/marching_cubes.jl")
 include("marching_tetrahedra/marching_tetrahedra.jl")
 
-function sync_free!(Backend, args...)
-    unsafe_free!.(args)
-end
-
 @info "[Nerf.jl] Backend: $BACKEND_NAME"
 @info "[Nerf.jl] Device: $Backend"
 
 # TODO
-# - use Flux for models
+# - join Backend and Flux.gpu somehow
 # - non-allocating renderer (except NN part)
-# - get rid of sync_free
 
 function main()
     config_file = joinpath(pkgdir(Nerf), "data", "raccoon_sofa2", "transforms.json")
     dataset = Dataset(Backend; config_file)
 
-    model = BasicModel(BasicField(Backend))
+    model = BasicModel(BasicField()) |> Flux.gpu
     trainer = Trainer(model, dataset; n_rays=512)
 
     camera = Camera(MMatrix{3, 4, Float32}(I), dataset.intrinsics)
     renderer = Renderer(Backend, camera, trainer.bbox, trainer.cone)
 
-    for i in 1:20_000
+    for i in 1:100
         loss = step!(trainer)
         @show i, loss
 
@@ -143,9 +137,28 @@ end
 function benchmark()
     config_file = joinpath(pkgdir(Nerf), "data", "raccoon_sofa2", "transforms.json")
     dataset = Dataset(Backend; config_file)
-    model = BasicModel(BasicField(Backend))
+    model = BasicModel(BasicField()) |> Flux.gpu
     trainer = Trainer(model, dataset; n_rays=512)
 
+    positions = CUDA.rand(Float32, 3, 512 * 512)
+    directions = CUDA.rand(Float32, 3, 512 * 512)
+
+    @time begin
+        for i in 1:10
+            model(positions, directions)
+        end
+        CUDA.synchronize()
+    end
+
+    @time begin
+        for i in 1:1000
+            model(positions, directions)
+        end
+        CUDA.synchronize()
+    end
+    return
+
+
     # GC.enable_logging(true)
 
     Core.println("Trainer benchmark")

diff --git a/src/acceleration/occupancy.jl b/src/acceleration/occupancy.jl
@@ -73,11 +73,11 @@ function update!(
 
     step ÷= update_frequency
 
-    Backend = get_backend(oc)
-    points = allocate(Backend, SVector{3, Float32}, (n_samples,))
-    indices = allocate(Backend, UInt32, (n_samples,))
+    kab = get_backend(oc)
+    points = allocate(kab, SVector{3, Float32}, (n_samples,))
+    indices = allocate(kab, UInt32, (n_samples,))
 
-    gp_kernel = generate_points!(Backend)
+    gp_kernel = generate_points!(kab)
     gp_kernel(
         points, indices, rng_state, density, bbox,
         -0.01f0, UInt32(step); ndrange=n_uniform)
@@ -92,36 +92,48 @@ function update!(
 
     raw_points = reshape(reinterpret(Float32, points), 3, :)
     log_densities = density_eval_fn(raw_points)
-    sync_free!(Backend, points)
+    unsafe_free!(points)
+    if BACKEND_NAME == "AMD"
+        KernelAbstractions.synchronize(kab)
+    end
 
-    tmp_density = KernelAbstractions.zeros(Backend, Float32, size(oc.density))
-    distribute_density!(Backend)(
+    tmp_density = KernelAbstractions.zeros(kab, Float32, size(oc.density))
+    distribute_density!(kab)(
         reinterpret(UInt32, tmp_density), log_densities,
         indices, cone.min_stepsize; ndrange=length(indices))
-    sync_free!(Backend, indices, log_densities)
+    unsafe_free!.((indices, log_densities))
+    if BACKEND_NAME == "AMD"
+        KernelAbstractions.synchronize(kab)
+    end
 
-    ema_update!(Backend)(
+    ema_update!(kab)(
         oc.density, tmp_density, decay; ndrange=length(oc.density))
-    sync_free!(Backend, tmp_density)
+    unsafe_free!(tmp_density)
+    if BACKEND_NAME == "AMD"
+        KernelAbstractions.synchronize(kab)
+    end
 
     update_binary!(oc; threshold)
+    if BACKEND_NAME == "AMD"
+        KernelAbstractions.synchronize(kab)
+    end
     return rng_state
 end
 
 function update_binary!(oc::OccupancyGrid; threshold::Float32 = 0.01f0)
-    Backend = get_backend(oc)
+    kab = get_backend(oc)
 
     oc.mean_density = mean(x -> max(0f0, x), @view(oc.density[:, :, :, 1]))
     threshold = min(threshold, oc.mean_density)
-    distribute_to_binary!(Backend)(
+    distribute_to_binary!(kab)(
         oc.binary, oc.density, threshold; ndrange=length(oc.binary))
 
     binary_level_length = offset_binary(oc, 1)
     binary_resolution = UInt32(size(oc.density, 1) ÷ 8)
     ndrange = binary_level_length ÷ 8
     n_levels = size(oc.density, 4)
 
-    bmp_kernel = binary_max_pool!(Backend)
+    bmp_kernel = binary_max_pool!(kab)
     for l in 1:(n_levels - 1)
         s, m, e = binary_level_length .* ((l - 1), l, (l + 1))
         prev_level = @view(oc.binary[(s + 1):m])
@@ -222,9 +234,9 @@ end
 function mark_invisible_regions!(
     oc::OccupancyGrid; intrinsics, rotations, translations,
 )
-    Backend = get_backend(oc)
+    kab = get_backend(oc)
     res_scale = 0.5f0 .* intrinsics.resolution ./ intrinsics.focal
-    _mark_invisible_regions!(Backend)(
+    _mark_invisible_regions!(kab)(
         oc.density, rotations, translations, res_scale;
         ndrange=length(oc.density))
 end

diff --git a/src/encoding/grid.jl b/src/encoding/grid.jl
@@ -1,7 +1,8 @@
 include("grid_utils.jl")
 include("grid_kernels.jl")
 
-struct GridEncoding{O}
+struct GridEncoding{O, T}
+    θ::T
     offset_table::O
     n_dims::UInt32
     n_features::UInt32
@@ -10,9 +11,12 @@ struct GridEncoding{O}
     base_resolution::UInt32
     scale::Float32
 end
+Flux.@functor GridEncoding
 
-function GridEncoding(
-    Backend; n_levels::Int = 16, scale::Float32 = 1.5f0,
+Flux.trainable(ge::GridEncoding) = (; θ=ge.θ)
+
+function GridEncoding(;
+    n_levels::Int = 16, scale::Float32 = 1.5f0,
     base_resolution::Int = 16, n_features::Int = 2, hashmap_size::Int = 19,
 )
     @assert n_levels < 34 "Too many levels for the offset table."
@@ -39,8 +43,10 @@ function GridEncoding(
 
     offset_table[end] = offset
     n_params = offset * n_features
+    θ = rand(Float32, n_features, n_params ÷ n_features) .* 2f-4 .- 1f-4
+
     GridEncoding(
-        adapt(Backend, offset_table), UInt32(n_dims), UInt32(n_features),
+        θ, offset_table, UInt32(n_dims), UInt32(n_features),
         UInt32(n_levels), UInt32(n_params), UInt32(base_resolution), scale)
 end
 
@@ -52,31 +58,26 @@ function _get_kernel_params(ge)
     NPD, NFPL
 end
 
-function init(ge::GridEncoding)
-    shape = Int64.((ge.n_features, ge.n_params ÷ ge.n_features))
-    adapt(get_backend(ge), rand(Float32, shape) .* 2f-4 .- 1f-4)
-end
-
-function reset!(::GridEncoding, θ)
-    copy!(θ, rand(Float32, size(θ)) .* 2f-4 .- 1f-4)
+function reset!(ge::GridEncoding)
+    copy!(ge.θ, rand(Float32, size(ge.θ)) .* 2f-4 .- 1f-4)
 end
 
 function get_output_shape(ge::GridEncoding)
     Int.((ge.n_features, ge.n_levels))
 end
 
-function (ge::GridEncoding)(x, θ)
+function (ge::GridEncoding)(x)
     Backend = get_backend(ge)
     n = size(x, 2)
     y = allocate(Backend, Float32, (get_output_shape(ge)..., n))
     NPD, NFPL = _get_kernel_params(ge)
     grid_kernel!(Backend)(
-        y, nothing, x, θ, ge.offset_table, NPD, NFPL,
+        y, nothing, x, ge.θ, ge.offset_table, NPD, NFPL,
         ge.base_resolution, log2(ge.scale); ndrange=(n, ge.n_levels))
     reshape(y, :, n)
 end
 
-function (ge::GridEncoding)(x, θ, ::Val{:IG})
+function (ge::GridEncoding)(x, ::Val{:IG})
     Backend = get_backend(ge)
     n = size(x, 2)
     y = allocate(Backend, Float32, (get_output_shape(ge)..., n))
@@ -85,16 +86,16 @@ function (ge::GridEncoding)(x, θ, ::Val{:IG})
 
     NPD, NFPL = _get_kernel_params(ge)
     grid_kernel!(Backend)(
-        y, ∂y∂x, x, θ, ge.offset_table, NPD, NFPL, ge.base_resolution,
+        y, ∂y∂x, x, ge.θ, ge.offset_table, NPD, NFPL, ge.base_resolution,
         log2(ge.scale); ndrange=(n, ge.n_levels))
     reshape(y, :, n), ∂y∂x
 end
 
-function ∇(ge::GridEncoding, ∂f∂y, x, θ)
+function ∇(ge::GridEncoding, ∂f∂y, x)
     Backend = get_backend(ge)
     n = size(x, 2)
     NPD, NFPL = _get_kernel_params(ge)
-    ∂grid = KernelAbstractions.zeros(Backend, Float32, size(θ))
+    ∂grid = KernelAbstractions.zeros(Backend, Float32, size(ge.θ))
     ∇grid_kernel!(Backend)(
         ∂grid, ∂f∂y, x, ge.offset_table, NPD, NFPL, ge.base_resolution,
         log2(ge.scale); ndrange=(n, ge.n_levels))
@@ -111,23 +112,22 @@ function ∇grid_input(ge::GridEncoding, ∂L∂y, ∂y∂x)
     ∂L∂x
 end
 
-function ChainRulesCore.rrule(ge::GridEncoding, x, θ)
+function ChainRulesCore.rrule(ge::GridEncoding, x)
     n = size(x, 2)
     function encode_pullback(Δ)
-        Δ2 = reshape(unthunk(Δ), (get_output_shape(ge)..., n))
-        Tangent{GridEncoding}(), NoTangent(), ∇(ge, Δ2, x, θ)
+        Δ = reshape(unthunk(Δ), (get_output_shape(ge)..., n))
+        Tangent{GridEncoding}(;θ=∇(ge, Δ, x)), NoTangent()
     end
-    ge(x, θ), encode_pullback
+    ge(x), encode_pullback
 end
 
-function ChainRulesCore.rrule(ge::GridEncoding, x, θ, ::Val{:IG})
+function ChainRulesCore.rrule(ge::GridEncoding, x, ::Val{:IG})
     n = size(x, 2)
-    y, ∂y∂x = ge(x, θ, Val{:IG}())
+    y, ∂y∂x = ge(x, Val{:IG}())
     function encode_pullback(Δ)
         Δ2 = reshape(unthunk(Δ), (get_output_shape(ge)..., n))
-        (
-            Tangent{GridEncoding}(), @thunk(∇grid_input(ge, Δ2, ∂y∂x)),
-            @thunk(∇(ge, Δ2, x, θ)), NoTangent())
+        (Tangent{GridEncoding}(; θ=@thunk(∇(ge, Δ2, x))),
+            @thunk(∇grid_input(ge, Δ2, ∂y∂x)), NoTangent())
     end
     y, encode_pullback
 end
diff --git a/src/kautils.jl b/src/kautils.jl
@@ -1,7 +1,7 @@
-# Supported values are: ROC, CUDA.
+# Supported values are: AMD, CUDA. NOTE(review): default "ROC" below matches neither — confirm.
 const BACKEND_NAME::String = @load_preference("backend", "ROC")
 
-@static if BACKEND_NAME == "ROC"
+@static if BACKEND_NAME == "AMD"
     using AMDGPU
     AMDGPU.allowscalar(false)
     const Backend::ROCBackend = ROCBackend()