From 2fb0ccd6a8e512d1891c5cab0342399b49493639 Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Tue, 21 Nov 2023 13:24:56 -0800
Subject: [PATCH] Make the fast inverse test throughput-limited rather than
 latency-limited

---
 test/performance/fast_inverse.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/test/performance/fast_inverse.cpp b/test/performance/fast_inverse.cpp
index 6ddfa4f620d7..cfd9b7c796c2 100644
--- a/test/performance/fast_inverse.cpp
+++ b/test/performance/fast_inverse.cpp
@@ -14,6 +14,8 @@ int main(int argc, char **argv) {
 
     if (target.arch == Target::ARM &&
         target.os == Target::OSX) {
+        // On Apple M1, vrecpe, vrecps, and fmul have inverse throughputs of 1, 0.25,
+        // and 0.25 respectively, while fdiv has an inverse throughput of 1.
         printf("[SKIP] Apple M1 chips have division performance roughly on par with the reciprocal instruction\n");
         return 0;
     }
@@ -31,13 +33,16 @@ int main(int argc, char **argv) {
     slow(x) = p / (slow(x) + 1) + 0 * r;
     fast(x) = fast_inverse((fast(x) + 1) + 0 * r);
 
-    slow.update().vectorize(x, 4);
-    fast.update().vectorize(x, 4);
+    // Use wide vectors to ensure we're throughput-limited rather than latency-limited.
+    const int vec = 32;
+
+    slow.update().vectorize(x, vec);
+    fast.update().vectorize(x, vec);
 
     slow.compile_jit();
     fast.compile_jit();
 
-    Buffer<float> out_fast(8), out_slow(8);
+    Buffer<float> out_fast(vec), out_slow(vec);
 
     double slow_time = benchmark([&]() { slow.realize(out_slow); });
     double fast_time = benchmark([&]() { fast.realize(out_fast); });