Add A100 benchmark results (#6)

facebookresearch · May 12, 2024 · ac86f22 · ac86f22
1 parent c8224d5
commit ac86f22
Show file tree

Hide file tree

Showing 9 changed files with 165 additions and 51 deletions.
diff --git a/benchmark/README.md b/benchmark/README.md
@@ -14,4 +14,40 @@ Future benchmarks include
 * `grating_coupler.jl` - A simple silicon photonics grating coupler simulation (pulled from SiEPIC).
 * `directional_coupler.jl` - A simple silicon photonics directional coupler simulation (pulled from SiEPIC).
 * `metalens.jl` - A simple metalens simulation (pulled from the Tidy3D paper).
-* `uLED.jl` - A simple uLED pixel.
+* `uLED.jl` - A simple uLED pixel.
+
+## Usage
+
+To run the suite of benchmarks on your platform, simply run:
+
+```bash
+julia run_benchmarks.jl
+```
+
+To (optionally) specify a particular hardware platform (either `CUDA`, `Metal` or `CPU`), use the `--backend` flag:
+
+```bash
+julia run_benchmarks.jl --backend=CUDA
+```
+
+To (optionally) specify the arithmetic precision (either `Float32` or `Float64`), use the `--precision` flag:
+
+```bash
+julia run_benchmarks.jl --backend=CUDA --precision=Float64
+```
+
+To (optionally) _profile_ the current hardware and save the results, use the `--profile` flag:
+
+```bash
+julia run_benchmarks.jl --backend=CUDA --precision=Float64 --profile
+```
+
+## Saving profiling results
+
+All profiling results can be saved to the benchmark's corresponding yaml file. For example, all `dipole.jl` results will be saved in `dipole.yml`.
+
+Whenever adding a new hardware platform or precision configuration, you must manually add in the appropriate tests to the yaml file. This allows you to cherrypick specific simulation parameters best geared for that particular hardware platform.
+
+For example, since an NVIDIA H100 GPU has significantly more VRAM than an NVIDIA V100, you'll want to set up tests with larger domains, resolutions, etc.
+
+Similarly, all profile configurations accept a "tolerance" parameter, which specifies how large a change in performance must be before the user is alerted. For example, a tolerance of 1.1 indicates that a change of 10% will raise a warning, encouraging the user to record the change.
diff --git a/benchmark/benchmark_utils.jl b/benchmark/benchmark_utils.jl
@@ -2,7 +2,7 @@
 
 module BenchmarkUtils
 
-import fdtd
+import Khronos
 
 using CUDA
 using ArgParse
@@ -15,9 +15,9 @@ export detect_and_set_backend, get_hardware_key, benchmark_result
 UPDATE_FACTOR = 2.0
 
 backend_string_to_struct = Dict([
-    ("CUDA", fdtd.CUDADevice()),
-    ("Metal", fdtd.MetalDevice()),
-    ("CPU", fdtd.CPUDevice()),
+    ("CUDA", Khronos.CUDADevice()),
+    ("Metal", Khronos.MetalDevice()),
+    ("CPU", Khronos.CPUDevice()),
 ])
 
 precision_string_to_type = Dict([("Float32", Float32), ("Float64", Float64)])
@@ -65,7 +65,7 @@ function detect_and_set_backend(backend::String, precision::String)
 
     backend_struct = backend_string_to_struct[backend]
     precision_type = precision_string_to_type[precision]
-    fdtd.choose_backend(backend_struct, precision_type)
+    Khronos.choose_backend(backend_struct, precision_type)
 
     return backend, precision
 end
@@ -100,11 +100,11 @@ end
 
 function detect_and_set_backend(backend::Nothing, precision::Nothing)
 
-    if fdtd.CUDA.functional()
+    if Khronos.CUDA.functional()
         # Check for CUDA
         default_backend = "CUDA"
         default_precision = "Float64"
-    elseif fdtd.Metal.functional()
+    elseif Khronos.Metal.functional()
         # Check for Metal
         default_backend = "Metal"
         default_precision = "Float32"

diff --git a/benchmark/dipole.jl b/benchmark/dipole.jl
@@ -1,7 +1,7 @@
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 #
 import YAML
-import fdtd
+import Khronos
 
 using KernelAbstractions
 using Logging
@@ -25,15 +25,15 @@ hardware_key = get_hardware_key()
 
 function build_dipole_simulation(resolution, sim_xyz)
     sources = [
-        fdtd.UniformSource(
-            time_profile = fdtd.ContinuousWaveSource(fcen = 1.0),
-            component = fdtd.Ez(),
+        Khronos.UniformSource(
+            time_profile = Khronos.ContinuousWaveSource(fcen = 1.0),
+            component = Khronos.Ez(),
             center = [0.0, 0.0, 0.0],
             size = [0.0, 0.0, 0.0],
         ),
     ]
 
-    sim = fdtd.Simulation(
+    sim = Khronos.Simulation(
         cell_size = sim_xyz * [1.0, 1.0, 1.0],
         cell_center = [0.0, 0.0, 0.0],
         resolution = resolution,
@@ -58,7 +58,7 @@ end
         @testset "resolution: $resolution | size_xyz: $size_xyz" begin
 
             sim = build_dipole_simulation(resolution, size_xyz)
-            timstep_rate = fdtd.run_benchmark(sim, 110)
+            timstep_rate = Khronos.run_benchmark(sim, 110)
             benchmark_result(
                 timstep_rate,
                 benchmark_rate,

diff --git a/benchmark/dipole.yml b/benchmark/dipole.yml
@@ -1,4 +1,39 @@
 simple_dipole:
+  NVIDIA A100-SXM4-80GB:
+    CUDA:
+      Float64:
+        - timestep_rate: 1171.2865695086832
+          resolution: 10.0
+          size_xyz: 4.0
+          tolerance: 1.1
+        - timestep_rate: 9404.763282823858
+          resolution: 20.0
+          size_xyz: 4.0
+          tolerance: 1.1
+        - timestep_rate: 74542.75690545407
+          resolution: 40.0
+          size_xyz: 4.0
+          tolerance: 1.1
+        - timestep_rate: 609755.7829281278
+          resolution: 80.0
+          size_xyz: 4.0
+          tolerance: 1.1
+        - timestep_rate: 18532.62636974196
+          resolution: 10.0
+          size_xyz: 10.0
+          tolerance: 1.1
+        - timestep_rate: 152326.27564917377
+          resolution: 20.0
+          size_xyz: 10.0
+          tolerance: 1.1
+        - timestep_rate: 1.1860356824106395e6
+          resolution: 40.0
+          size_xyz: 10.0
+          tolerance: 1.1
+        - timestep_rate: 1.4693758582999355e6
+          resolution: 50.0
+          size_xyz: 10.0
+          tolerance: 1.1
   NVIDIA H100:
     CUDA:
       Float64:

diff --git a/benchmark/periodic_stack.jl b/benchmark/periodic_stack.jl
@@ -1,7 +1,7 @@
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 #
 import YAML
-import fdtd
+import Khronos
 
 using KernelAbstractions
 using Logging
@@ -30,23 +30,23 @@ function build_periodic_stack(resolution::Real, z_scaling::Real)
     #TODO swap out for actual planewave source once ready
     src_z = (-z_thickness / 2.0) + 1.0
     sources = [
-        fdtd.UniformSource(
-            time_profile = fdtd.ContinuousWaveSource(fcen = 1.0),
-            component = fdtd.Ex(),
+        Khronos.UniformSource(
+            time_profile = Khronos.ContinuousWaveSource(fcen = 1.0),
+            component = Khronos.Ex(),
             center = [0.0, 0.0, src_z],
             size = [Inf, Inf, 0.0],
         ),
-        fdtd.UniformSource(
-            time_profile = fdtd.ContinuousWaveSource(fcen = 1.0),
-            component = fdtd.Hy(),
+        Khronos.UniformSource(
+            time_profile = Khronos.ContinuousWaveSource(fcen = 1.0),
+            component = Khronos.Hy(),
             center = [0.0, 0.0, src_z],
             size = [Inf, Inf, 0.0],
         ),
     ]
 
-    mat_low = fdtd.Material(ε = 1.5)
-    mat_mid = fdtd.Material(ε = 2.5)
-    mat_high = fdtd.Material(ε = 3.5)
+    mat_low = Khronos.Material(ε = 1.5)
+    mat_mid = Khronos.Material(ε = 2.5)
+    mat_high = Khronos.Material(ε = 3.5)
 
     materials = [mat_low, mat_mid, mat_low, mat_high, mat_mid, mat_high]
     thicknesses = [0.5, 1.0, 0.75, 1.0, 0.25, 0.5] * z_scaling
@@ -58,7 +58,7 @@ function build_periodic_stack(resolution::Real, z_scaling::Real)
         append!(
             geometry,
             [
-                fdtd.Object(
+                Khronos.Object(
                     Cuboid([0.0, 0.0, z_cur], [4.0, 4.0, current_thick]),
                     current_mat,
                 ),
@@ -67,7 +67,7 @@ function build_periodic_stack(resolution::Real, z_scaling::Real)
         z_cur += current_thick / 2.0
     end
 
-    sim = fdtd.Simulation(
+    sim = Khronos.Simulation(
         cell_size = [4.0, 4.0, z_thickness],
         cell_center = [0.0, 0.0, 0.0],
         resolution = resolution,
@@ -93,7 +93,7 @@ end
         @testset "resolution: $resolution | z_scaling: $z_scaling" begin
 
             sim = build_periodic_stack(resolution, z_scaling)
-            timstep_rate = fdtd.run_benchmark(sim, 110)
+            timstep_rate = Khronos.run_benchmark(sim, 110)
             benchmark_result(
                 timstep_rate,
                 benchmark_rate,

diff --git a/benchmark/periodic_stack.yml b/benchmark/periodic_stack.yml
@@ -1,4 +1,27 @@
 dielectric_periodic_stack:
+  NVIDIA A100-SXM4-80GB:
+    CUDA:
+      Float64:
+        - timestep_rate: 1209.9099268020047
+          z_scaling: 0.5
+          resolution: 10.0
+          tolerance: 1.1
+        - timestep_rate: 9692.560245531686
+          z_scaling: 0.5
+          resolution: 20.0
+          tolerance: 1.1
+        - timestep_rate: 80896.69434189708
+          z_scaling: 0.5
+          resolution: 40.0
+          tolerance: 1.1
+        - timestep_rate: 2550.213338400152
+          z_scaling: 1.0
+          resolution: 10.0
+          tolerance: 1.1
+        - timestep_rate: 18446.00281738533
+          z_scaling: 1.0
+          resolution: 20.0
+          tolerance: 1.1
   NVIDIA H100:
     CUDA:
       Float64:

diff --git a/benchmark/sphere.jl b/benchmark/sphere.jl
@@ -1,7 +1,7 @@
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 #
 import YAML
-import fdtd
+import Khronos
 
 using KernelAbstractions
 using Logging
@@ -31,29 +31,29 @@ function build_sphere_sim(resolution, radius; include_loss = false)
     src_z = -s_xyz / 2.0 + 1.0
     #TODO swap out for actual planewave source once ready
     sources = [
-        fdtd.UniformSource(
-            time_profile = fdtd.ContinuousWaveSource(fcen = 1.0),
-            component = fdtd.Ex(),
+        Khronos.UniformSource(
+            time_profile = Khronos.ContinuousWaveSource(fcen = 1.0),
+            component = Khronos.Ex(),
             center = [0.0, 0.0, src_z],
             size = [Inf, Inf, 0.0],
         ),
-        fdtd.UniformSource(
-            time_profile = fdtd.ContinuousWaveSource(fcen = 1.0),
-            component = fdtd.Hy(),
+        Khronos.UniformSource(
+            time_profile = Khronos.ContinuousWaveSource(fcen = 1.0),
+            component = Khronos.Hy(),
             center = [0.0, 0.0, src_z],
             size = [Inf, Inf, 0.0],
         ),
     ]
 
     if include_loss
-        mat = fdtd.Material(ε = 3, σD = 5)
+        mat = Khronos.Material(ε = 3, σD = 5)
     else
-        mat = fdtd.Material(ε = 3)
+        mat = Khronos.Material(ε = 3)
     end
 
-    geometry = [fdtd.Object(Ball([0.0, 0.0, 0.0], radius), mat)]
+    geometry = [Khronos.Object(Ball([0.0, 0.0, 0.0], radius), mat)]
 
-    sim = fdtd.Simulation(
+    sim = Khronos.Simulation(
         cell_size = [s_xyz, s_xyz, s_xyz],
         cell_center = [0.0, 0.0, 0.0],
         resolution = resolution,
@@ -79,7 +79,7 @@ end
         @testset "resolution: $resolution | radius: $radius" begin
 
             sim = build_sphere_sim(resolution, radius)
-            timstep_rate = fdtd.run_benchmark(sim, 110)
+            timstep_rate = Khronos.run_benchmark(sim, 110)
             benchmark_result(
                 timstep_rate,
                 benchmark_rate,

diff --git a/benchmark/sphere.yml b/benchmark/sphere.yml
@@ -1,37 +1,42 @@
 scattering_off_sphere:
-  NVIDIA H100:
+  NVIDIA A100-SXM4-80GB:
     CUDA:
       Float64:
-        - timestep_rate: 1276.1677579534114
+        - timestep_rate: 1961.7151837162314
           resolution: 10.0
           radius: 1.0
           tolerance: 1.1
-        - timestep_rate: 10478.948683355819
+        - timestep_rate: 15772.803850782191
           resolution: 20.0
           radius: 1.0
           tolerance: 1.1
-        - timestep_rate: 3501.804327824161
+        - timestep_rate: 5458.308123079258
           resolution: 10.0
           radius: 2.0
           tolerance: 1.1
-        - timestep_rate: 29747.91329835354
+        - timestep_rate: 43314.779933009675
           resolution: 20.0
           radius: 2.0
           tolerance: 1.1
-    CPU:
+  NVIDIA H100:
+    CUDA:
       Float64:
-        - timestep_rate: 5.024192476494856
+        - timestep_rate: 1276.1677579534114
           resolution: 10.0
           radius: 1.0
           tolerance: 1.1
-        - timestep_rate: 7.351783347815566
+        - timestep_rate: 10478.948683355819
           resolution: 20.0
           radius: 1.0
           tolerance: 1.1
-        - timestep_rate: 6.170264450761192
+        - timestep_rate: 3501.804327824161
           resolution: 10.0
           radius: 2.0
           tolerance: 1.1
+        - timestep_rate: 29747.91329835354
+          resolution: 20.0
+          radius: 2.0
+          tolerance: 1.1
   Apple M1 Pro:
     CPU:
       Float64: