Skip to content

Commit

Permalink
Add A100 benchmark results (#6)
Browse files Browse the repository at this point in the history
  • Loading branch information
smartalecH authored May 12, 2024
1 parent c8224d5 commit ac86f22
Show file tree
Hide file tree
Showing 9 changed files with 165 additions and 51 deletions.
38 changes: 37 additions & 1 deletion benchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,40 @@ Future benchmarks include
* `grating_coupler.jl` - A simple silicon photonics grating coupler simulation (pulled from SiEPIC).
* `directional_coupler.jl` - A simple silicon photonics directional coupler simulation (pulled from SiEPIC).
* `metalens.jl` - A simple metalens simulation (pulled from the Tidy3D paper).
* `uLED.jl` - A simple uLED pixel.
* `uLED.jl` - A simple uLED pixel.

## Usage

To run the suite of benchmarks on your platform, simply run:

```bash
julia run_benchmarks.jl
```

To (optionally) specify a particular hardware platform (either `CUDA`, `METAL` or `CPU`), use the `--backend` flag:

```bash
julia run_benchmarks.jl --backend=CUDA
```

To (optionally) specify the arithmetic precision (either `Float32` or `Float64`), use the `--precision` flag:

```bash
julia run_benchmarks.jl --backend=CUDA --precision=Float64
```

To (optionally) _profile_ the current hardware and save the results, use the `--profile` flag:

```bash
julia run_benchmarks.jl --backend=CUDA --precision=Float64 --profile
```

## Saving profiling results

All profiling results can be saved to the benchmark's corresponding YAML file. For example, all `dipole.jl` results will be saved in `dipole.yml`.

When adding a new hardware platform or precision configuration, you must manually add the appropriate tests to the YAML file. This lets you cherry-pick the simulation parameters best suited to that particular hardware platform.

For example, since an NVIDIA H100 GPU has significantly more VRAM than an NVIDIA V100, you'll want to set up tests with larger domains, resolutions, etc.

Similarly, all profile configurations accept a "tolerance" parameter, which specifies how large a change in performance must be before the user is alerted. For example, a tolerance of 1.1 indicates that a change of 10% will raise a warning, encouraging the user to record the change.
14 changes: 7 additions & 7 deletions benchmark/benchmark_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

module BenchmarkUtils

import fdtd
import Khronos

using CUDA
using ArgParse
Expand All @@ -15,9 +15,9 @@ export detect_and_set_backend, get_hardware_key, benchmark_result
UPDATE_FACTOR = 2.0

backend_string_to_struct = Dict([
("CUDA", fdtd.CUDADevice()),
("Metal", fdtd.MetalDevice()),
("CPU", fdtd.CPUDevice()),
("CUDA", Khronos.CUDADevice()),
("Metal", Khronos.MetalDevice()),
("CPU", Khronos.CPUDevice()),
])

precision_string_to_type = Dict([("Float32", Float32), ("Float64", Float64)])
Expand Down Expand Up @@ -65,7 +65,7 @@ function detect_and_set_backend(backend::String, precision::String)

backend_struct = backend_string_to_struct[backend]
precision_type = precision_string_to_type[precision]
fdtd.choose_backend(backend_struct, precision_type)
Khronos.choose_backend(backend_struct, precision_type)

return backend, precision
end
Expand Down Expand Up @@ -100,11 +100,11 @@ end

function detect_and_set_backend(backend::Nothing, precision::Nothing)

if fdtd.CUDA.functional()
if Khronos.CUDA.functional()
# Check for CUDA
default_backend = "CUDA"
default_precision = "Float64"
elseif fdtd.Metal.functional()
elseif Khronos.Metal.functional()
# Check for Metal
default_backend = "Metal"
default_precision = "Float32"
Expand Down
12 changes: 6 additions & 6 deletions benchmark/dipole.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
#
import YAML
import fdtd
import Khronos

using KernelAbstractions
using Logging
Expand All @@ -25,15 +25,15 @@ hardware_key = get_hardware_key()

function build_dipole_simulation(resolution, sim_xyz)
sources = [
fdtd.UniformSource(
time_profile = fdtd.ContinuousWaveSource(fcen = 1.0),
component = fdtd.Ez(),
Khronos.UniformSource(
time_profile = Khronos.ContinuousWaveSource(fcen = 1.0),
component = Khronos.Ez(),
center = [0.0, 0.0, 0.0],
size = [0.0, 0.0, 0.0],
),
]

sim = fdtd.Simulation(
sim = Khronos.Simulation(
cell_size = sim_xyz * [1.0, 1.0, 1.0],
cell_center = [0.0, 0.0, 0.0],
resolution = resolution,
Expand All @@ -58,7 +58,7 @@ end
@testset "resolution: $resolution | size_xyz: $size_xyz" begin

sim = build_dipole_simulation(resolution, size_xyz)
timstep_rate = fdtd.run_benchmark(sim, 110)
timstep_rate = Khronos.run_benchmark(sim, 110)
benchmark_result(
timstep_rate,
benchmark_rate,
Expand Down
35 changes: 35 additions & 0 deletions benchmark/dipole.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,39 @@
simple_dipole:
NVIDIA A100-SXM4-80GB:
CUDA:
Float64:
- timestep_rate: 1171.2865695086832
resolution: 10.0
size_xyz: 4.0
tolerance: 1.1
- timestep_rate: 9404.763282823858
resolution: 20.0
size_xyz: 4.0
tolerance: 1.1
- timestep_rate: 74542.75690545407
resolution: 40.0
size_xyz: 4.0
tolerance: 1.1
- timestep_rate: 609755.7829281278
resolution: 80.0
size_xyz: 4.0
tolerance: 1.1
- timestep_rate: 18532.62636974196
resolution: 10.0
size_xyz: 10.0
tolerance: 1.1
- timestep_rate: 152326.27564917377
resolution: 20.0
size_xyz: 10.0
tolerance: 1.1
- timestep_rate: 1.1860356824106395e6
resolution: 40.0
size_xyz: 10.0
tolerance: 1.1
- timestep_rate: 1.4693758582999355e6
resolution: 50.0
size_xyz: 10.0
tolerance: 1.1
NVIDIA H100:
CUDA:
Float64:
Expand Down
26 changes: 13 additions & 13 deletions benchmark/periodic_stack.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
#
import YAML
import fdtd
import Khronos

using KernelAbstractions
using Logging
Expand Down Expand Up @@ -30,23 +30,23 @@ function build_periodic_stack(resolution::Real, z_scaling::Real)
#TODO swap out for actual planewave source once ready
src_z = (-z_thickness / 2.0) + 1.0
sources = [
fdtd.UniformSource(
time_profile = fdtd.ContinuousWaveSource(fcen = 1.0),
component = fdtd.Ex(),
Khronos.UniformSource(
time_profile = Khronos.ContinuousWaveSource(fcen = 1.0),
component = Khronos.Ex(),
center = [0.0, 0.0, src_z],
size = [Inf, Inf, 0.0],
),
fdtd.UniformSource(
time_profile = fdtd.ContinuousWaveSource(fcen = 1.0),
component = fdtd.Hy(),
Khronos.UniformSource(
time_profile = Khronos.ContinuousWaveSource(fcen = 1.0),
component = Khronos.Hy(),
center = [0.0, 0.0, src_z],
size = [Inf, Inf, 0.0],
),
]

mat_low = fdtd.Material(ε = 1.5)
mat_mid = fdtd.Material(ε = 2.5)
mat_high = fdtd.Material(ε = 3.5)
mat_low = Khronos.Material(ε = 1.5)
mat_mid = Khronos.Material(ε = 2.5)
mat_high = Khronos.Material(ε = 3.5)

materials = [mat_low, mat_mid, mat_low, mat_high, mat_mid, mat_high]
thicknesses = [0.5, 1.0, 0.75, 1.0, 0.25, 0.5] * z_scaling
Expand All @@ -58,7 +58,7 @@ function build_periodic_stack(resolution::Real, z_scaling::Real)
append!(
geometry,
[
fdtd.Object(
Khronos.Object(
Cuboid([0.0, 0.0, z_cur], [4.0, 4.0, current_thick]),
current_mat,
),
Expand All @@ -67,7 +67,7 @@ function build_periodic_stack(resolution::Real, z_scaling::Real)
z_cur += current_thick / 2.0
end

sim = fdtd.Simulation(
sim = Khronos.Simulation(
cell_size = [4.0, 4.0, z_thickness],
cell_center = [0.0, 0.0, 0.0],
resolution = resolution,
Expand All @@ -93,7 +93,7 @@ end
@testset "resolution: $resolution | z_scaling: $z_scaling" begin

sim = build_periodic_stack(resolution, z_scaling)
timstep_rate = fdtd.run_benchmark(sim, 110)
timstep_rate = Khronos.run_benchmark(sim, 110)
benchmark_result(
timstep_rate,
benchmark_rate,
Expand Down
23 changes: 23 additions & 0 deletions benchmark/periodic_stack.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,27 @@
dielectric_periodic_stack:
NVIDIA A100-SXM4-80GB:
CUDA:
Float64:
- timestep_rate: 1209.9099268020047
z_scaling: 0.5
resolution: 10.0
tolerance: 1.1
- timestep_rate: 9692.560245531686
z_scaling: 0.5
resolution: 20.0
tolerance: 1.1
- timestep_rate: 80896.69434189708
z_scaling: 0.5
resolution: 40.0
tolerance: 1.1
- timestep_rate: 2550.213338400152
z_scaling: 1.0
resolution: 10.0
tolerance: 1.1
- timestep_rate: 18446.00281738533
z_scaling: 1.0
resolution: 20.0
tolerance: 1.1
NVIDIA H100:
CUDA:
Float64:
Expand Down
24 changes: 12 additions & 12 deletions benchmark/sphere.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
#
import YAML
import fdtd
import Khronos

using KernelAbstractions
using Logging
Expand Down Expand Up @@ -31,29 +31,29 @@ function build_sphere_sim(resolution, radius; include_loss = false)
src_z = -s_xyz / 2.0 + 1.0
#TODO swap out for actual planewave source once ready
sources = [
fdtd.UniformSource(
time_profile = fdtd.ContinuousWaveSource(fcen = 1.0),
component = fdtd.Ex(),
Khronos.UniformSource(
time_profile = Khronos.ContinuousWaveSource(fcen = 1.0),
component = Khronos.Ex(),
center = [0.0, 0.0, src_z],
size = [Inf, Inf, 0.0],
),
fdtd.UniformSource(
time_profile = fdtd.ContinuousWaveSource(fcen = 1.0),
component = fdtd.Hy(),
Khronos.UniformSource(
time_profile = Khronos.ContinuousWaveSource(fcen = 1.0),
component = Khronos.Hy(),
center = [0.0, 0.0, src_z],
size = [Inf, Inf, 0.0],
),
]

if include_loss
mat = fdtd.Material(ε = 3, σD = 5)
mat = Khronos.Material(ε = 3, σD = 5)
else
mat = fdtd.Material(ε = 3)
mat = Khronos.Material(ε = 3)
end

geometry = [fdtd.Object(Ball([0.0, 0.0, 0.0], radius), mat)]
geometry = [Khronos.Object(Ball([0.0, 0.0, 0.0], radius), mat)]

sim = fdtd.Simulation(
sim = Khronos.Simulation(
cell_size = [s_xyz, s_xyz, s_xyz],
cell_center = [0.0, 0.0, 0.0],
resolution = resolution,
Expand All @@ -79,7 +79,7 @@ end
@testset "resolution: $resolution | radius: $radius" begin

sim = build_sphere_sim(resolution, radius)
timstep_rate = fdtd.run_benchmark(sim, 110)
timstep_rate = Khronos.run_benchmark(sim, 110)
benchmark_result(
timstep_rate,
benchmark_rate,
Expand Down
23 changes: 14 additions & 9 deletions benchmark/sphere.yml
Original file line number Diff line number Diff line change
@@ -1,37 +1,42 @@
scattering_off_sphere:
NVIDIA H100:
NVIDIA A100-SXM4-80GB:
CUDA:
Float64:
- timestep_rate: 1276.1677579534114
- timestep_rate: 1961.7151837162314
resolution: 10.0
radius: 1.0
tolerance: 1.1
- timestep_rate: 10478.948683355819
- timestep_rate: 15772.803850782191
resolution: 20.0
radius: 1.0
tolerance: 1.1
- timestep_rate: 3501.804327824161
- timestep_rate: 5458.308123079258
resolution: 10.0
radius: 2.0
tolerance: 1.1
- timestep_rate: 29747.91329835354
- timestep_rate: 43314.779933009675
resolution: 20.0
radius: 2.0
tolerance: 1.1
CPU:
NVIDIA H100:
CUDA:
Float64:
- timestep_rate: 5.024192476494856
- timestep_rate: 1276.1677579534114
resolution: 10.0
radius: 1.0
tolerance: 1.1
- timestep_rate: 7.351783347815566
- timestep_rate: 10478.948683355819
resolution: 20.0
radius: 1.0
tolerance: 1.1
- timestep_rate: 6.170264450761192
- timestep_rate: 3501.804327824161
resolution: 10.0
radius: 2.0
tolerance: 1.1
- timestep_rate: 29747.91329835354
resolution: 20.0
radius: 2.0
tolerance: 1.1
Apple M1 Pro:
CPU:
Float64:
Expand Down
Loading

0 comments on commit ac86f22

Please sign in to comment.